summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_addr.c353
-rw-r--r--fs/9p/vfs_file.c89
-rw-r--r--fs/9p/vfs_inode.c16
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/namei.c3
-rw-r--r--fs/afs/dir.c30
-rw-r--r--fs/afs/dynroot.c16
-rw-r--r--fs/afs/file.c213
-rw-r--r--fs/afs/inode.c28
-rw-r--r--fs/afs/internal.h72
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c826
-rw-r--r--fs/aio.c1
-rw-r--r--fs/anon_inodes.c51
-rw-r--r--fs/autofs/expire.c7
-rw-r--r--fs/bcachefs/Kconfig18
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/alloc_background.c573
-rw-r--r--fs/bcachefs/alloc_background.h39
-rw-r--r--fs/bcachefs/alloc_background_format.h92
-rw-r--r--fs/bcachefs/alloc_foreground.c53
-rw-r--r--fs/bcachefs/backpointers.c251
-rw-r--r--fs/bcachefs/backpointers.h28
-rw-r--r--fs/bcachefs/bcachefs.h197
-rw-r--r--fs/bcachefs/bcachefs_format.h1005
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h60
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c9
-rw-r--r--fs/bcachefs/bkey_methods.h88
-rw-r--r--fs/bcachefs/bset.c13
-rw-r--r--fs/bcachefs/bset.h3
-rw-r--r--fs/bcachefs/btree_cache.c40
-rw-r--r--fs/bcachefs/btree_cache.h23
-rw-r--r--fs/bcachefs/btree_gc.c363
-rw-r--r--fs/bcachefs/btree_io.c170
-rw-r--r--fs/bcachefs/btree_io.h2
-rw-r--r--fs/bcachefs/btree_iter.c947
-rw-r--r--fs/bcachefs/btree_iter.h412
-rw-r--r--fs/bcachefs/btree_journal_iter.c25
-rw-r--r--fs/bcachefs/btree_key_cache.c63
-rw-r--r--fs/bcachefs/btree_key_cache.h2
-rw-r--r--fs/bcachefs/btree_locking.c149
-rw-r--r--fs/bcachefs/btree_locking.h25
-rw-r--r--fs/bcachefs/btree_trans_commit.c342
-rw-r--r--fs/bcachefs/btree_types.h148
-rw-r--r--fs/bcachefs/btree_update.c245
-rw-r--r--fs/bcachefs/btree_update.h111
-rw-r--r--fs/bcachefs/btree_update_interior.c330
-rw-r--r--fs/bcachefs/btree_update_interior.h53
-rw-r--r--fs/bcachefs/btree_write_buffer.c669
-rw-r--r--fs/bcachefs/btree_write_buffer.h53
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h63
-rw-r--r--fs/bcachefs/buckets.c1509
-rw-r--r--fs/bcachefs/buckets.h60
-rw-r--r--fs/bcachefs/buckets_types.h17
-rw-r--r--fs/bcachefs/chardev.c363
-rw-r--r--fs/bcachefs/checksum.h23
-rw-r--r--fs/bcachefs/clock.c4
-rw-r--r--fs/bcachefs/compress.c4
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/darray.h8
-rw-r--r--fs/bcachefs/data_update.c36
-rw-r--r--fs/bcachefs/debug.c157
-rw-r--r--fs/bcachefs/dirent.c51
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/dirent_format.h42
-rw-r--r--fs/bcachefs/disk_groups.c13
-rw-r--r--fs/bcachefs/ec.c406
-rw-r--r--fs/bcachefs/ec.h5
-rw-r--r--fs/bcachefs/ec_format.h19
-rw-r--r--fs/bcachefs/ec_types.h2
-rw-r--r--fs/bcachefs/errcode.h7
-rw-r--r--fs/bcachefs/error.c103
-rw-r--r--fs/bcachefs/extent_update.c2
-rw-r--r--fs/bcachefs/extents.c15
-rw-r--r--fs/bcachefs/extents.h26
-rw-r--r--fs/bcachefs/extents_format.h295
-rw-r--r--fs/bcachefs/eytzinger.h14
-rw-r--r--fs/bcachefs/fs-common.c36
-rw-r--r--fs/bcachefs/fs-io-buffered.c38
-rw-r--r--fs/bcachefs/fs-io-direct.c3
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-io.c29
-rw-r--r--fs/bcachefs/fs-ioctl.c54
-rw-r--r--fs/bcachefs/fs.c100
-rw-r--r--fs/bcachefs/fs.h9
-rw-r--r--fs/bcachefs/fsck.c647
-rw-r--r--fs/bcachefs/inode.c154
-rw-r--r--fs/bcachefs/inode.h15
-rw-r--r--fs/bcachefs/inode_format.h166
-rw-r--r--fs/bcachefs/io_misc.c59
-rw-r--r--fs/bcachefs/io_read.c50
-rw-r--r--fs/bcachefs/io_write.c50
-rw-r--r--fs/bcachefs/journal.c211
-rw-r--r--fs/bcachefs/journal.h4
-rw-r--r--fs/bcachefs/journal_io.c161
-rw-r--r--fs/bcachefs/journal_reclaim.c120
-rw-r--r--fs/bcachefs/journal_reclaim.h16
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c2
-rw-r--r--fs/bcachefs/journal_types.h16
-rw-r--r--fs/bcachefs/keylist.c2
-rw-r--r--fs/bcachefs/keylist.h4
-rw-r--r--fs/bcachefs/logged_ops.c18
-rw-r--r--fs/bcachefs/logged_ops_format.h30
-rw-r--r--fs/bcachefs/lru.c11
-rw-r--r--fs/bcachefs/mean_and_variance.c10
-rw-r--r--fs/bcachefs/mean_and_variance.h5
-rw-r--r--fs/bcachefs/migrate.c9
-rw-r--r--fs/bcachefs/move.c252
-rw-r--r--fs/bcachefs/move.h13
-rw-r--r--fs/bcachefs/movinggc.c49
-rw-r--r--fs/bcachefs/opts.c8
-rw-r--r--fs/bcachefs/opts.h29
-rw-r--r--fs/bcachefs/quota.c28
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/rebalance.c47
-rw-r--r--fs/bcachefs/recovery.c293
-rw-r--r--fs/bcachefs/recovery.h1
-rw-r--r--fs/bcachefs/recovery_types.h25
-rw-r--r--fs/bcachefs/reflink.c243
-rw-r--r--fs/bcachefs/reflink.h26
-rw-r--r--fs/bcachefs/reflink_format.h33
-rw-r--r--fs/bcachefs/replicas.c94
-rw-r--r--fs/bcachefs/replicas.h22
-rw-r--r--fs/bcachefs/replicas_types.h6
-rw-r--r--fs/bcachefs/sb-clean.c22
-rw-r--r--fs/bcachefs/sb-counters.c (renamed from fs/bcachefs/counters.c)2
-rw-r--r--fs/bcachefs/sb-counters.h (renamed from fs/bcachefs/counters.h)7
-rw-r--r--fs/bcachefs/sb-counters_format.h98
-rw-r--r--fs/bcachefs/sb-downgrade.c90
-rw-r--r--fs/bcachefs/sb-downgrade.h1
-rw-r--r--fs/bcachefs/sb-errors_types.h4
-rw-r--r--fs/bcachefs/sb-members.c22
-rw-r--r--fs/bcachefs/sb-members.h100
-rw-r--r--fs/bcachefs/six.c117
-rw-r--r--fs/bcachefs/six.h13
-rw-r--r--fs/bcachefs/snapshot.c178
-rw-r--r--fs/bcachefs/snapshot.h8
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/str_hash.h47
-rw-r--r--fs/bcachefs/subvolume.c31
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h4
-rw-r--r--fs/bcachefs/super-io.c174
-rw-r--r--fs/bcachefs/super-io.h7
-rw-r--r--fs/bcachefs/super.c394
-rw-r--r--fs/bcachefs/super.h6
-rw-r--r--fs/bcachefs/super_types.h2
-rw-r--r--fs/bcachefs/sysfs.c173
-rw-r--r--fs/bcachefs/tests.c193
-rw-r--r--fs/bcachefs/thread_with_file.c299
-rw-r--r--fs/bcachefs/thread_with_file.h41
-rw-r--r--fs/bcachefs/thread_with_file_types.h16
-rw-r--r--fs/bcachefs/trace.h318
-rw-r--r--fs/bcachefs/util.c204
-rw-r--r--fs/bcachefs/util.h59
-rw-r--r--fs/bcachefs/vstructs.h10
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/bcachefs/xattr_format.h19
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/dir.c5
-rw-r--r--fs/btrfs/compression.c23
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/extent-tree.c53
-rw-r--r--fs/btrfs/inode.c22
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/lzo.c34
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/scrub.c36
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/subpage.c3
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c73
-rw-r--r--fs/btrfs/zoned.c31
-rw-r--r--fs/btrfs/zoned.h2
-rw-r--r--fs/cachefiles/Kconfig2
-rw-r--r--fs/cachefiles/error_inject.c1
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/io.c34
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/cachefiles/ondemand.c5
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c33
-rw-r--r--fs/ceph/cache.h45
-rw-r--r--fs/ceph/caps.c9
-rw-r--r--fs/ceph/dir.c23
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/mds_client.c37
-rw-r--r--fs/ceph/quota.c39
-rw-r--r--fs/ceph/super.h14
-rw-r--r--fs/coda/cache.c8
-rw-r--r--fs/coda/sysctl.c1
-rw-r--r--fs/coredump.c1
-rw-r--r--fs/dcache.c651
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/erofs/Kconfig7
-rw-r--r--fs/erofs/compress.h5
-rw-r--r--fs/erofs/decompressor.c7
-rw-r--r--fs/erofs/decompressor_deflate.c19
-rw-r--r--fs/erofs/decompressor_lzma.c17
-rw-r--r--fs/erofs/fscache.c8
-rw-r--r--fs/erofs/inode.c2
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/zdata.c98
-rw-r--r--fs/erofs/zmap.c23
-rw-r--r--fs/eventpoll.c1
-rw-r--r--fs/exec.c108
-rw-r--r--fs/exfat/balloc.c87
-rw-r--r--fs/exfat/exfat_fs.h5
-rw-r--r--fs/exfat/file.c193
-rw-r--r--fs/exfat/inode.c137
-rw-r--r--fs/exfat/namei.c6
-rw-r--r--fs/ext2/namei.c11
-rw-r--r--fs/ext4/namei.c23
-rw-r--r--fs/f2fs/compress.c6
-rw-r--r--fs/f2fs/data.c50
-rw-r--r--fs/f2fs/f2fs.h46
-rw-r--r--fs/f2fs/file.c66
-rw-r--r--fs/f2fs/gc.c16
-rw-r--r--fs/f2fs/inode.c57
-rw-r--r--fs/f2fs/namei.c36
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/recovery.c25
-rw-r--r--fs/f2fs/segment.c138
-rw-r--r--fs/f2fs/super.c34
-rw-r--r--fs/f2fs/sysfs.c50
-rw-r--r--fs/f2fs/xattr.c17
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fscache/Kconfig40
-rw-r--r--fs/fscache/Makefile16
-rw-r--r--fs/fscache/internal.h277
-rw-r--r--fs/gfs2/dentry.c23
-rw-r--r--fs/gfs2/inode.c8
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c50
-rw-r--r--fs/internal.h7
-rw-r--r--fs/jfs/jfs_dmap.c8
-rw-r--r--fs/kernfs/dir.c62
-rw-r--r--fs/kernfs/file.c2
-rw-r--r--fs/kernfs/mount.c3
-rw-r--r--fs/libfs.c62
-rw-r--r--fs/lockd/svc.c1
-rw-r--r--fs/locks.c1
-rw-r--r--fs/minix/dir.c83
-rw-r--r--fs/minix/namei.c12
-rw-r--r--fs/namei.c104
-rw-r--r--fs/namespace.c51
-rw-r--r--fs/netfs/Kconfig39
-rw-r--r--fs/netfs/Makefile22
-rw-r--r--fs/netfs/buffered_read.c237
-rw-r--r--fs/netfs/buffered_write.c1254
-rw-r--r--fs/netfs/direct_read.c125
-rw-r--r--fs/netfs/direct_write.c171
-rw-r--r--fs/netfs/fscache_cache.c (renamed from fs/fscache/cache.c)3
-rw-r--r--fs/netfs/fscache_cookie.c (renamed from fs/fscache/cookie.c)0
-rw-r--r--fs/netfs/fscache_internal.h14
-rw-r--r--fs/netfs/fscache_io.c (renamed from fs/fscache/io.c)42
-rw-r--r--fs/netfs/fscache_main.c (renamed from fs/fscache/main.c)25
-rw-r--r--fs/netfs/fscache_proc.c (renamed from fs/fscache/proc.c)23
-rw-r--r--fs/netfs/fscache_stats.c (renamed from fs/fscache/stats.c)13
-rw-r--r--fs/netfs/fscache_volume.c (renamed from fs/fscache/volume.c)0
-rw-r--r--fs/netfs/internal.h284
-rw-r--r--fs/netfs/io.c215
-rw-r--r--fs/netfs/iterator.c97
-rw-r--r--fs/netfs/locking.c216
-rw-r--r--fs/netfs/main.c109
-rw-r--r--fs/netfs/misc.c260
-rw-r--r--fs/netfs/objects.c59
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c42
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c7
-rw-r--r--fs/nfs/blocklayout/dev.c3
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/callback.h5
-rw-r--r--fs/nfs/callback_proc.c55
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/dir.c4
-rw-r--r--fs/nfs/direct.c11
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/fscache.h2
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/nfs4sysctl.c1
-rw-r--r--fs/nfs/nfs4xdr.c23
-rw-r--r--fs/nfs/nfstrace.h22
-rw-r--r--fs/nfs/pnfs.c3
-rw-r--r--fs/nfs/sysctl.c1
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c11
-rw-r--r--fs/nfsd/nfs4state.c26
-rw-r--r--fs/nfsd/nfsctl.c74
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nilfs2/namei.c7
-rw-r--r--fs/notify/dnotify/dnotify.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c1
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c1
-rw-r--r--fs/nsfs.c7
-rw-r--r--fs/ntfs/sysctl.c1
-rw-r--r--fs/ocfs2/dcache.c7
-rw-r--r--fs/ocfs2/dir.c9
-rw-r--r--fs/ocfs2/namei.c8
-rw-r--r--fs/ocfs2/stackglue.c1
-rw-r--r--fs/orangefs/dir.c32
-rw-r--r--fs/overlayfs/copy_up.c9
-rw-r--r--fs/overlayfs/dir.c4
-rw-r--r--fs/overlayfs/export.c23
-rw-r--r--fs/overlayfs/namei.c43
-rw-r--r--fs/overlayfs/overlayfs.h23
-rw-r--r--fs/overlayfs/ovl_entry.h4
-rw-r--r--fs/overlayfs/readdir.c7
-rw-r--r--fs/overlayfs/super.c21
-rw-r--r--fs/overlayfs/util.c60
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/proc_sysctl.c24
-rw-r--r--fs/proc/task_mmu.c24
-rw-r--r--fs/quota/dquot.c1
-rw-r--r--fs/reiserfs/namei.c61
-rw-r--r--fs/smb/client/cached_dir.c24
-rw-r--r--fs/smb/client/cifs_debug.c6
-rw-r--r--fs/smb/client/cifsencrypt.c2
-rw-r--r--fs/smb/client/cifsfs.c28
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h70
-rw-r--r--fs/smb/client/cifsproto.h32
-rw-r--r--fs/smb/client/cifssmb.c31
-rw-r--r--fs/smb/client/connect.c29
-rw-r--r--fs/smb/client/dir.c7
-rw-r--r--fs/smb/client/file.c47
-rw-r--r--fs/smb/client/fs_context.c6
-rw-r--r--fs/smb/client/fs_context.h2
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/smb/client/inode.c145
-rw-r--r--fs/smb/client/link.c29
-rw-r--r--fs/smb/client/misc.c1
-rw-r--r--fs/smb/client/readdir.c139
-rw-r--r--fs/smb/client/sess.c59
-rw-r--r--fs/smb/client/smb2glob.h26
-rw-r--r--fs/smb/client/smb2inode.c1176
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2ops.c501
-rw-r--r--fs/smb/client/smb2pdu.c412
-rw-r--r--fs/smb/client/smb2proto.h40
-rw-r--r--fs/smb/client/smb2status.h2
-rw-r--r--fs/smb/client/smbdirect.c4
-rw-r--r--fs/smb/client/smbencrypt.c7
-rw-r--r--fs/smb/client/trace.h7
-rw-r--r--fs/smb/client/transport.c18
-rw-r--r--fs/smb/server/asn1.c5
-rw-r--r--fs/smb/server/auth.c14
-rw-r--r--fs/smb/server/connection.c7
-rw-r--r--fs/smb/server/connection.h2
-rw-r--r--fs/smb/server/ksmbd_netlink.h3
-rw-r--r--fs/smb/server/mgmt/ksmbd_ida.c21
-rw-r--r--fs/smb/server/oplock.c22
-rw-r--r--fs/smb/server/smb2pdu.c53
-rw-r--r--fs/smb/server/smb_common.c6
-rw-r--r--fs/smb/server/smbacl.c11
-rw-r--r--fs/smb/server/transport_ipc.c4
-rw-r--r--fs/smb/server/transport_rdma.c11
-rw-r--r--fs/smb/server/transport_tcp.c15
-rw-r--r--fs/smb/server/vfs.c33
-rw-r--r--fs/sysctls.c1
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/tracefs/event_inode.c905
-rw-r--r--fs/tracefs/inode.c286
-rw-r--r--fs/tracefs/internal.h48
-rw-r--r--fs/ubifs/auth.c21
-rw-r--r--fs/ubifs/commit.c13
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c30
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/vboxsf/vboxsf_wrappers.c2
-rw-r--r--fs/verity/fsverity_private.h10
-rw-r--r--fs/verity/init.c2
-rw-r--r--fs/verity/measure.c84
-rw-r--r--fs/xfs/libxfs/xfs_attr.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c14
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_sb.c14
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_types.h12
-rw-r--r--fs/xfs/scrub/rtbitmap.c1
-rw-r--r--fs/xfs/scrub/rtsummary.c1
-rw-r--r--fs/xfs/xfs_super.c27
-rw-r--r--fs/xfs/xfs_sysctl.c2
-rw-r--r--fs/zonefs/super.c2
405 files changed, 16871 insertions, 13120 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 731e3d14b67d..0e8418066a48 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
dev_t rdev);
+void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8a635999a7d6..047855033d32 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -19,12 +19,45 @@
#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
+#include <trace/events/netfs.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "cache.h"
#include "fid.h"
+static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+{
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
+}
+
+static void v9fs_upload_to_server_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ v9fs_upload_to_server(subreq);
+}
+
+/*
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
+ */
+static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, v9fs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
+}
+
/**
* v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
@@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
- struct iov_iter to;
- loff_t pos = subreq->start + subreq->transferred;
- size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
-
- total = p9_client_read(fid, pos, &to, &err);
+ total = p9_client_read(fid, subreq->start + subreq->transferred,
+ &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
@@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
}
/**
- * v9fs_init_request - Initialise a read request
+ * v9fs_init_request - Initialise a request
* @rreq: The read request
* @file: The file being read from
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct p9_fid *fid = file->private_data;
-
- BUG_ON(!fid);
+ struct p9_fid *fid;
+ bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
+ rreq->origin == NETFS_WRITEBACK ||
+ rreq->origin == NETFS_WRITETHROUGH ||
+ rreq->origin == NETFS_LAUNDER_WRITE ||
+ rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ rreq->origin == NETFS_DIO_WRITE);
+
+ if (file) {
+ fid = file->private_data;
+ if (!fid)
+ goto no_fid;
+ p9_fid_get(fid);
+ } else {
+ fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
+ if (!fid)
+ goto no_fid;
+ }
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
- !(fid->mode & P9_ORDWR));
-
- p9_fid_get(fid);
+ WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
rreq->netfs_priv = fid;
return 0;
+
+no_fid:
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ rreq->inode->i_ino);
+ return -EINVAL;
}
/**
@@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
p9_fid_put(fid);
}
-/**
- * v9fs_begin_cache_operation - Begin a cache operation for a read
- * @rreq: The read request
- */
-static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_9P_FSCACHE
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-#else
- return -ENOBUFS;
-#endif
-}
-
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
- .begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
+ .create_write_requests = v9fs_create_write_requests,
};
-/**
- * v9fs_release_folio - release the private state associated with a folio
- * @folio: The folio to be released
- * @gfp: The caller's allocation restrictions
- *
- * Returns true if the page can be released, false otherwise.
- */
-
-static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return false;
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
-#endif
- return true;
-}
-
-static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-#ifdef CONFIG_9P_FSCACHE
-static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct v9fs_inode *v9inode = priv;
- __le32 version;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS) {
- version = cpu_to_le32(v9inode->qid.version);
- fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->netfs.inode), 0);
- }
-}
-#endif
-
-static int v9fs_vfs_write_folio_locked(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- loff_t start = folio_pos(folio);
- loff_t i_size = i_size_read(inode);
- struct iov_iter from;
- size_t len = folio_size(folio);
- struct p9_fid *writeback_fid;
- int err;
- struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
- struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
-
- if (start >= i_size)
- return 0; /* Simultaneous truncation occurred */
-
- len = min_t(loff_t, i_size - start, len);
-
- iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
-
- writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
- if (!writeback_fid) {
- WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
- inode->i_private);
- return -EINVAL;
- }
-
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
-
- p9_client_write(writeback_fid, start, &from, &err);
-
-#ifdef CONFIG_9P_FSCACHE
- if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
- folio_start_fscache(folio);
- fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
- }
-#endif
-
- folio_end_writeback(folio);
- p9_fid_put(writeback_fid);
-
- return err;
-}
-
-static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int retval;
-
- p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
-
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval < 0) {
- if (retval == -EAGAIN) {
- folio_redirty_for_writepage(wbc, folio);
- retval = 0;
- } else {
- mapping_set_error(folio_mapping(folio), retval);
- }
- } else
- retval = 0;
-
- folio_unlock(folio);
- return retval;
-}
-
-static int v9fs_launder_folio(struct folio *folio)
-{
- int retval;
-
- if (folio_clear_dirty_for_io(folio)) {
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval)
- return retval;
- }
- folio_wait_fscache(folio);
- return 0;
-}
-
-/**
- * v9fs_direct_IO - 9P address space operation for direct I/O
- * @iocb: target I/O control block
- * @iter: The data/buffer to use
- *
- * The presence of v9fs_direct_IO() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode, we shunt off direct read and write requests before
- * the VFS gets them, so this method should never be called.
- *
- * Direct IO is not 'yet' supported in the cached mode. Hence when
- * this routine is called through generic_file_aio_read(), the read/write fails
- * with an error.
- *
- */
-static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t n;
- int err = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
- if (n) {
- struct inode *inode = file_inode(file);
- loff_t i_size = i_size_read(inode);
-
- if (pos + n > i_size)
- inode_add_bytes(inode, pos + n - i_size);
- }
- } else {
- n = p9_client_read(file->private_data, pos, iter, &err);
- }
- return n ? n : err;
-}
-
-static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct page **subpagep, void **fsdata)
-{
- int retval;
- struct folio *folio;
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
- if (retval < 0)
- return retval;
-
- *subpagep = &folio->page;
- return retval;
-}
-
-static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct page *subpage, void *fsdata)
-{
- loff_t last_pos = pos + copied;
- struct folio *folio = page_folio(subpage);
- struct inode *inode = mapping->host;
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- if (!folio_test_uptodate(folio)) {
- if (unlikely(copied < len)) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
- */
- if (last_pos > inode->i_size) {
- inode_add_bytes(inode, last_pos - inode->i_size);
- i_size_write(inode, last_pos);
-#ifdef CONFIG_9P_FSCACHE
- fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
- &last_pos);
-#endif
- }
- folio_mark_dirty(folio);
-out:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
-#ifdef CONFIG_9P_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
-}
-#else
-#define v9fs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations v9fs_addr_operations = {
- .read_folio = netfs_read_folio,
- .readahead = netfs_readahead,
- .dirty_folio = v9fs_dirty_folio,
- .writepage = v9fs_vfs_writepage,
- .write_begin = v9fs_write_begin,
- .write_end = v9fs_write_end,
- .release_folio = v9fs_release_folio,
- .invalidate_folio = v9fs_invalidate_folio,
- .launder_folio = v9fs_launder_folio,
- .direct_IO = v9fs_direct_IO,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .launder_folio = netfs_launder_folio,
+ .direct_IO = noop_direct_IO,
+ .writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..bae330c2f0cf 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -353,25 +353,15 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
fid->fid, iov_iter_count(to), iocb->ki_pos);
- if (!(fid->mode & P9L_DIRECT)) {
- p9_debug(P9_DEBUG_VFS, "(cached)\n");
- return generic_file_read_iter(iocb, to);
- }
-
- if (iocb->ki_filp->f_flags & O_NONBLOCK)
- ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
- else
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
- if (!ret)
- return err;
+ if (fid->mode & P9L_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
- iocb->ki_pos += ret;
- return ret;
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return netfs_file_read_iter(iocb, to);
}
/*
@@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct p9_fid *fid = file->private_data;
- ssize_t retval;
- loff_t origin;
- int err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
- if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
- p9_debug(P9_DEBUG_CACHE, "(cached)\n");
- return generic_file_write_iter(iocb, from);
- }
+ if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))
+ return netfs_unbuffered_write_iter(iocb, from);
- retval = generic_write_checks(iocb, from);
- if (retval <= 0)
- return retval;
-
- origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
- if (retval > 0) {
- struct inode *inode = file_inode(file);
- loff_t i_size;
- unsigned long pg_start, pg_end;
-
- pg_start = origin >> PAGE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- iocb->ki_pos += retval;
- i_size = i_size_read(inode);
- if (iocb->ki_pos > i_size) {
- inode_add_bytes(inode, iocb->ki_pos - i_size);
- /*
- * Need to serialize against i_size_write() in
- * v9fs_stat2inode()
- */
- v9fs_i_size_write(inode, iocb->ki_pos);
- }
- return retval;
- }
- return err;
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return netfs_file_write_iter(iocb, from);
}
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
@@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
- struct file *filp = vmf->vma->vm_file;
- struct inode *inode = file_inode(filp);
-
-
- p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
- folio, (unsigned long)filp->private_data);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_NOPAGE;
-#endif
-
- /* Update file times before taking page lock */
- file_update_time(filp);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- if (folio_mapping(folio) != inode->i_mapping)
- goto out_unlock;
- folio_wait_stable(folio);
-
- return VM_FAULT_LOCKED;
-out_unlock:
- folio_unlock(folio);
- return VM_FAULT_NOPAGE;
+ return netfs_page_mkwrite(vmf, NULL);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b845ee18a80b..32572982f72e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode)
/*
* Set parameters for the netfs library
*/
-static void v9fs_set_netfs_context(struct inode *inode)
+void v9fs_set_netfs_context(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
-
- v9fs_set_netfs_context(inode);
error:
return err;
@@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
iput(inode);
return ERR_PTR(err);
}
+ v9fs_set_netfs_context(inode);
return inode;
}
@@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
-#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
- fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
- &version);
-#endif
+ netfs_clear_inode_writeback(inode, &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
goto error;
v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE) {
@@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->length;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c7319af2f471..3505227e1704 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE)
@@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->st_size;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mode = mode;
}
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE)
+ stat->st_result_mask & P9_STATS_SIZE) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 73db55c050bf..941f7d0e0bfa 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode)
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
-
/*
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-
- v9inode = V9FS_I(inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int v9fs_write_inode_dotl(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
- v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index a3159831ba98..89fdbefd1075 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
source "fs/netfs/Kconfig"
-source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index a6962c588962..c09016257f05 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_NETFS_SUPPORT) += netfs/
-obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d6b9758ee23d..8c154490a2d6 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -532,9 +532,6 @@ static struct dentry *affs_get_parent(struct dentry *child)
parent = affs_iget(child->d_sb,
be32_to_cpu(AFFS_TAIL(child->d_sb, bh)->parent));
brelse(bh);
- if (IS_ERR(parent))
- return ERR_CAST(parent);
-
return d_obtain_alias(parent);
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index c14533ef108f..b5b8de521f99 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
BUG_ON(xa_is_value(folio));
- ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+ ASSERTCMP(folio->mapping, ==, mapping);
folio_put(folio);
}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
for (offset = 0; offset < size; offset += sizeof(*block)) {
block = kmap_local_folio(folio, offset);
- pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
kunmap_local(block);
}
}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
+ /* Don't expose silly rename entries to userspace. */
+ if (nlen > 6 &&
+ dire->u.name[0] == '.' &&
+ ctx->actor != afs_lookup_filldir &&
+ ctx->actor != afs_lookup_one_filldir &&
+ memcmp(dire->u.name, ".__afs", 6) == 0)
+ continue;
+
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
@@ -708,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
break;
}
+ if (vp->scb.status.abort_code)
+ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
if (!vp->scb.have_status && !vp->scb.have_error)
continue;
@@ -897,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(afs_op_error(op));
out_op:
if (!afs_op_error(op)) {
- inode = &op->file[1].vnode->netfs.inode;
- op->file[1].vnode = NULL;
+ if (op->file[1].scb.status.abort_code) {
+ afs_op_accumulate_error(op, -ECONNABORTED,
+ op->file[1].scb.status.abort_code);
+ } else {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
}
if (op->file[0].scb.have_status)
@@ -2022,7 +2036,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
folio_detach_private(folio);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 1f656005018e..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_inode_init(&vnode->netfs, NULL);
+ netfs_inode_init(&vnode->netfs, NULL, false);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 1;
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_revalidate = afs_dynroot_d_revalidate,
.d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
@@ -373,7 +364,7 @@ error:
void afs_dynroot_depopulate(struct super_block *sb)
{
struct afs_net *net = afs_sb2net(sb);
- struct dentry *root = sb->s_root, *subdir, *tmp;
+ struct dentry *root = sb->s_root, *subdir;
/* Prevent more subdirs from being created */
mutex_lock(&net->proc_cells_lock);
@@ -382,10 +373,11 @@ void afs_dynroot_depopulate(struct super_block *sb)
mutex_unlock(&net->proc_cells_lock);
if (root) {
+ struct hlist_node *n;
inode_lock(root->d_inode);
/* Remove all the pins for dirs created for manually added cells */
- list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+ hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
if (subdir->d_fsdata) {
subdir->d_fsdata = NULL;
dput(subdir);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 30914e0d9cb2..3d33b221d9ca 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -20,9 +20,6 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length);
-static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = {
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
- .write_iter = afs_file_write,
+ .write_iter = netfs_file_write_iter,
.mmap = afs_file_mmap,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
+ .direct_IO = noop_direct_IO,
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .dirty_folio = afs_dirty_folio,
- .launder_folio = afs_launder_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
- .write_begin = afs_write_begin,
- .write_end = afs_write_end,
- .writepages = afs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .launder_folio = netfs_launder_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
+ .writepages = afs_writepages,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};
@@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
- fsreq->iter = &fsreq->def_iter;
-
- iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
- &fsreq->vnode->netfs.inode.i_mapping->i_pages,
- fsreq->pos, fsreq->len);
+ fsreq->iter = &subreq->io_iter;
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
@@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- rreq->netfs_priv = key_get(afs_file_key(file));
+ if (file)
+ rreq->netfs_priv = key_get(afs_file_key(file));
+ rreq->rsize = 256 * 1024;
+ rreq->wsize = 256 * 1024;
return 0;
}
-static int afs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
-
- return fscache_begin_read_operation(&rreq->cache_resources,
- afs_vnode_cache(vnode));
-#else
- return -ENOBUFS;
-#endif
-}
-
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
struct folio **foliop, void **_fsdata)
{
@@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq)
key_put(rreq->netfs_priv);
}
-const struct netfs_request_ops afs_req_ops = {
- .init_request = afs_init_request,
- .free_request = afs_free_request,
- .begin_cache_operation = afs_begin_cache_operation,
- .check_write_begin = afs_check_write_begin,
- .issue_read = afs_issue_read,
-};
-
-int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
+static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
{
- fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
- return 0;
-}
-
-/*
- * Adjust the dirty region of the page on truncation or full invalidation,
- * getting rid of the markers altogether if the region is entirely invalidated.
- */
-static void afs_invalidate_dirty(struct folio *folio, size_t offset,
- size_t length)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- unsigned long priv;
- unsigned int f, t, end = offset + length;
-
- priv = (unsigned long)folio_get_private(folio);
-
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == folio_size(folio))
- goto full_invalidate;
-
- /* If the page was dirtied by page_mkwrite(), the PTE stays writable
- * and we don't get another notification to tell us to expand it
- * again.
- */
- if (afs_is_folio_dirty_mmapped(priv))
- return;
-
- /* We may need to shorten the dirty region */
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
-
- if (t <= offset || f >= end)
- return; /* Doesn't overlap */
-
- if (f < offset && t > end)
- return; /* Splits the dirty region - just absorb it */
-
- if (f >= offset && t <= end)
- goto undirty;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ loff_t i_size;
- if (f < offset)
- t = offset;
- else
- f = end;
- if (f == t)
- goto undirty;
-
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
- return;
-
-undirty:
- trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
- folio_clear_dirty_for_io(folio);
-full_invalidate:
- trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
- folio_detach_private(folio);
+ write_seqlock(&vnode->cb_lock);
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (new_i_size > i_size) {
+ i_size_write(&vnode->netfs.inode, new_i_size);
+ inode_set_bytes(&vnode->netfs.inode, new_i_size);
+ }
+ write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size);
}
-/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
- */
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
+static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq)
{
- _enter("{%lu},%zu,%zu", folio->index, offset, length);
-
- BUG_ON(!folio_test_locked(folio));
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
- if (folio_get_private(folio))
- afs_invalidate_dirty(folio, offset, length);
-
- folio_wait_fscache(folio);
- _leave("");
+ afs_invalidate_cache(vnode, 0);
}
-/*
- * release a page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
- */
-static bool afs_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
-
- _enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
- gfp);
-
- /* deny if folio is being written to the cache and the caller hasn't
- * elected to wait */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(afs_vnode_cache(vnode));
-#endif
-
- if (folio_test_private(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
- folio_detach_private(folio);
- }
-
- /* Indicate that the folio can be released */
- _leave(" = T");
- return true;
-}
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
+ .free_request = afs_free_request,
+ .check_write_begin = afs_check_write_begin,
+ .issue_read = afs_issue_read,
+ .update_i_size = afs_update_i_size,
+ .invalidate_cache = afs_netfs_invalidate_cache,
+ .create_write_requests = afs_create_write_requests,
+};
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
@@ -576,28 +468,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = iocb->ki_filp->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return generic_file_read_iter(iocb, iter);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ return ret;
}
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(in));
+ struct inode *inode = file_inode(in);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = in->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return filemap_splice_read(in, ppos, pipe, len, flags);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ netfs_end_io_read(inode);
+ return ret;
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4f04f6f33f46..94fc049aff58 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_inode_init(&vnode->netfs, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops, true);
}
/*
@@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op,
struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
+ bool unexpected_jump = false;
bool data_changed = false;
bool change_size = vp->set_size;
@@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op,
}
change_size = true;
data_changed = true;
+ unexpected_jump = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -249,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op,
* what's on the server.
*/
vnode->netfs.remote_i_size = status->size;
- if (change_size) {
+ if (change_size || status->size > i_size_read(inode)) {
afs_set_i_size(vnode, status->size);
+ if (unexpected_jump)
+ vnode->netfs.zero_point = status->size;
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
}
@@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
- fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
+ netfs_clear_inode_writeback(inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->netfs.inode;
+ struct afs_vnode *vnode = vp->vnode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
loff_t i_size = op->setattr.old_i_size;
- if (size < i_size)
- truncate_pagecache(inode, size);
- if (size != i_size)
- fscache_resize_cookie(afs_vnode_cache(vp->vnode),
- vp->scb.status.size);
+ if (size != i_size) {
+ truncate_setsize(&vnode->netfs.inode, size);
+ netfs_resize_file(&vnode->netfs, size, true);
+ fscache_resize_cookie(afs_vnode_cache(vnode), size);
+ }
}
}
@@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
attr->ia_size < i_size &&
- attr->ia_size > vnode->status.size) {
- truncate_pagecache(inode, attr->ia_size);
+ attr->ia_size > vnode->netfs.remote_i_size) {
+ truncate_setsize(inode, attr->ia_size);
+ netfs_resize_file(&vnode->netfs, size, false);
fscache_resize_cookie(afs_vnode_cache(vnode),
attr->ia_size);
- i_size_write(inode, attr->ia_size);
ret = 0;
goto out_unlock;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 55aa0679d8ce..9c03fcf7ffaa 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -985,62 +985,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags);
}
-/*
- * We use folio->private to hold the amount of the folio that we've written to,
- * splitting the field into two parts. However, we need to represent a range
- * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
- * exceeds what we can encode.
- */
-#ifdef CONFIG_64BIT
-#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
-#define __AFS_FOLIO_PRIV_SHIFT 32
-#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
-#else
-#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
-#define __AFS_FOLIO_PRIV_SHIFT 16
-#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
-#endif
-
-static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
-{
- int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
- return (shift > 0) ? shift : 0;
-}
-
-static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
-{
- unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
-
- /* The lower bound is inclusive */
- return x << afs_folio_dirty_resolution(folio);
-}
-
-static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
-{
- unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
-
- /* The upper bound is immediately beyond the region */
- return (x + 1) << afs_folio_dirty_resolution(folio);
-}
-
-static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
-{
- unsigned int res = afs_folio_dirty_resolution(folio);
- from >>= res;
- to = (to - 1) >> res;
- return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
-}
-
-static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
-{
- return priv | __AFS_FOLIO_PRIV_MMAPPED;
-}
-
-static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
-{
- return priv & __AFS_FOLIO_PRIV_MMAPPED;
-}
-
#include <trace/events/afs.h>
/*****************************************************************************/
@@ -1167,7 +1111,6 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
-extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1658,24 +1601,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-#ifdef CONFIG_AFS_FSCACHE
-bool afs_dirty_folio(struct address_space *, struct folio *);
-#else
-#define afs_dirty_folio filemap_dirty_folio
-#endif
-extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
-extern int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
-extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-int afs_launder_folio(struct folio *);
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 3bd02571f30d..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
if (!preflist) {
seq_puts(m, "NO PREFS\n");
- return 0;
+ goto out;
}
seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
@@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
}
}
- rcu_read_lock();
+out:
+ rcu_read_unlock();
return 0;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index ae2d66a52add..f3ba1c3e72f5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,7 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
- .write_inode = afs_write_inode,
+ .write_inode = netfs_unpin_writeback,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 61d34ad2ca7d..74402d95a884 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,309 +12,17 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop);
-
-static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
- loff_t i_size, bool caching);
-
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- afs_vnode_cache(AFS_FS_I(mapping->host)));
-}
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
-}
-#endif
-
-/*
- * Flush out a conflicting write. This may extend the write to the surrounding
- * pages if also dirty and contiguous to the conflicting region..
- */
-static int afs_flush_conflicting_write(struct address_space *mapping,
- struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = folio_pos(folio),
- .range_end = LLONG_MAX,
- };
- loff_t next;
-
- return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
- &next, true);
-}
-
-/*
- * prepare to perform part of a write to a page
- */
-int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **_page, void **fsdata)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct folio *folio;
- unsigned long priv;
- unsigned f, from;
- unsigned t, to;
- pgoff_t index;
- int ret;
-
- _enter("{%llx:%llu},%llx,%x",
- vnode->fid.vid, vnode->fid.vnode, pos, len);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
- if (ret < 0)
- return ret;
-
- index = folio_index(folio);
- from = pos - index * PAGE_SIZE;
- to = from + len;
-
-try_again:
- /* See if this page is already partially written in a way that we can
- * merge the new write with.
- */
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- ASSERTCMP(f, <=, t);
-
- if (folio_test_writeback(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- folio_unlock(folio);
- goto wait_for_writeback;
- }
- /* If the file is being filled locally, allow inter-write
- * spaces to be merged into writes. If it's not, only write
- * back what the user gives us.
- */
- if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
- (to < f || from > t))
- goto flush_conflicting_write;
- }
-
- *_page = folio_file_page(folio, pos / PAGE_SIZE);
- _leave(" = 0");
- return 0;
-
- /* The previous write and this write aren't adjacent or overlapping, so
- * flush the page out.
- */
-flush_conflicting_write:
- trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
- folio_unlock(folio);
-
- ret = afs_flush_conflicting_write(mapping, folio);
- if (ret < 0)
- goto error;
-
-wait_for_writeback:
- ret = folio_wait_writeback_killable(folio);
- if (ret < 0)
- goto error;
-
- ret = folio_lock_killable(folio);
- if (ret < 0)
- goto error;
- goto try_again;
-
-error:
- folio_put(folio);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * finalise part of a write to a page
- */
-int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *subpage, void *fsdata)
-{
- struct folio *folio = page_folio(subpage);
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- unsigned long priv;
- unsigned int f, from = offset_in_folio(folio, pos);
- unsigned int t, to = from + copied;
- loff_t i_size, write_end_pos;
-
- _enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- if (!folio_test_uptodate(folio)) {
- if (copied < len) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- if (copied == 0)
- goto out;
-
- write_end_pos = pos + copied;
-
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size) {
- write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size)
- afs_set_i_size(vnode, write_end_pos);
- write_sequnlock(&vnode->cb_lock);
- fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
- }
-
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (from < f)
- f = from;
- if (to > t)
- t = to;
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
- } else {
- priv = afs_folio_dirty(folio, from, to);
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
- }
-
- if (folio_mark_dirty(folio))
- _debug("dirtied %lx", folio_index(folio));
-
-out:
- folio_unlock(folio);
- folio_put(folio);
- return copied;
-}
-
-/*
- * kill all the pages in the given range
- */
-static void afs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- folio_clear_uptodate(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void afs_redirty_pages(struct writeback_control *wbc,
- struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = index + folio_nr_pages(folio);
- folio_redirty_for_writepage(wbc, folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- _leave("");
-}
-
/*
* completion of write to server
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->netfs.inode.i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
-
_enter("{%llx:%llu},{%x @%llx}",
vnode->fid.vid, vnode->fid.vnode, len, start);
- rcu_read_lock();
-
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- kdebug("bad %x @%llx page %lx %lx",
- len, start, folio_index(folio), end);
- ASSERT(folio_test_writeback(folio));
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
- folio_detach_private(folio);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
-
afs_prune_wb_keys(vnode);
_leave("");
}
@@ -451,363 +159,53 @@ try_next_key:
return afs_put_operation(op);
}
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void afs_extend_writeback(struct address_space *mapping,
- struct afs_vnode *vnode,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool new_content,
- bool caching,
- unsigned int *_len)
+static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
{
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned long priv;
- unsigned int psize, filler = 0;
- unsigned int f, t;
- loff_t len = *_len;
- pgoff_t index = (start + len) / PAGE_SIZE;
- bool stop = true;
- unsigned int i;
-
- XA_STATE(xas, &mapping->i_pages, index);
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(&xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(&xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio_index(folio) != index)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(&xas);
- continue;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(&xas))) {
- folio_put(folio);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- psize = folio_size(folio);
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (f != 0 && !new_content) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- len += filler + t;
- filler = psize - t;
- if (len >= max_len || *_count <= 0)
- stop = true;
- else if (t == psize || new_content)
- stop = false;
-
- index += folio_nr_pages(folio);
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- if (!stop)
- xas_pause(&xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- *_count -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ ssize_t ret;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
+ _enter("%x[%x],%zx",
+ subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
- *_len = len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
+ subreq->rreq->origin == NETFS_LAUNDER_WRITE);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
+ false);
}
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct folio *folio,
- loff_t start, loff_t end)
+static void afs_upload_to_server_worker(struct work_struct *work)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct iov_iter iter;
- unsigned long priv;
- unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->netfs.inode);
- bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
-
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- priv = (unsigned long)folio_get_private(folio);
- offset = afs_folio_dirty_from(folio, priv);
- to = afs_folio_dirty_to(folio, priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
-
- len = to - offset;
- start += offset;
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len &&
- (to == folio_size(folio) || new_content))
- afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content,
- caching, &len);
- len = min_t(loff_t, len, max_len);
- }
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
-
- if (start < i_size) {
- _debug("write back %x @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- afs_write_to_cache(vnode, start, len, i_size, caching);
-
- iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
- ret = afs_store_data(vnode, &iter, start, false);
- } else {
- _debug("write discard %x @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- afs_pages_written_back(vnode, start, len);
- ret = 0;
- }
-
- switch (ret) {
- case 0:
- wbc->nr_to_write = count;
- ret = len;
- break;
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
- default:
- pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
-
- case -EDQUOT:
- case -ENOSPC:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, -ENOSPC);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
- }
-
- _leave(" = %d", ret);
- return ret;
+ afs_upload_to_server(subreq);
}
/*
- * write a region of pages back to the server
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
*/
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop)
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
{
- struct folio *folio;
- struct folio_batch fbatch;
- ssize_t ret;
- unsigned int i;
- int n, skips = 0;
-
- _enter("%llx,%llx,", start, end);
- folio_batch_init(&fbatch);
-
- do {
- pgoff_t index = start / PAGE_SIZE;
-
- n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, &fbatch);
-
- if (!n)
- break;
- for (i = 0; i < n; i++) {
- folio = fbatch.folios[i];
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio_index(folio));
-
- /* At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
- */
-try_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0) {
- folio_batch_release(&fbatch);
- return ret;
- }
- } else {
- if (!folio_trylock(folio))
- continue;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- continue;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto try_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- *_next = start;
- folio_batch_release(&fbatch);
- _leave(" = 0 [%llx]", *_next);
- return 0;
- }
- skips++;
- }
- continue;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- ret = afs_write_back_from_locked_folio(mapping, wbc,
- folio, start, end);
- if (ret < 0) {
- _leave(" = %zd", ret);
- folio_batch_release(&fbatch);
- return ret;
- }
-
- start += ret;
- }
+ struct netfs_io_subrequest *subreq;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (wbc->nr_to_write > 0);
+ _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
- *_next = start;
- _leave(" = 0 [%llx]", *_next);
- return 0;
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, afs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
}
/*
* write some of the pending data back to the server
*/
-int afs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int afs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- loff_t start, next;
int ret;
- _enter("");
-
/* We have to be careful as we can end up racing with setattr()
* truncating the pagecache since the caller doesn't take a lock here
* to prevent it.
@@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping,
else if (!down_read_trylock(&vnode->validate_lock))
return 0;
- if (wbc->range_cyclic) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
- &next, false);
- if (ret == 0) {
- mapping->writeback_index = next / PAGE_SIZE;
- if (start > 0 && wbc->nr_to_write > 0) {
- ret = afs_writepages_region(mapping, wbc, 0,
- start, &next, false);
- if (ret == 0)
- mapping->writeback_index =
- next / PAGE_SIZE;
- }
- }
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
- &next, false);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = next / PAGE_SIZE;
- } else {
- ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end,
- &next, false);
- }
-
+ ret = netfs_writepages(mapping, wbc);
up_read(&vnode->validate_lock);
- _leave(" = %d", ret);
return ret;
}
/*
- * write to an AFS file
- */
-ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
- struct afs_file *af = iocb->ki_filp->private_data;
- ssize_t result;
- size_t count = iov_iter_count(from);
-
- _enter("{%llx:%llu},{%zu},",
- vnode->fid.vid, vnode->fid.vnode, count);
-
- if (IS_SWAPFILE(&vnode->netfs.inode)) {
- printk(KERN_INFO
- "AFS: Attempt to write to active swap file!\n");
- return -EBUSY;
- }
-
- if (!count)
- return 0;
-
- result = afs_validate(vnode, af->key);
- if (result < 0)
- return result;
-
- result = generic_file_write_iter(iocb, from);
-
- _leave(" = %zd", result);
- return result;
-}
-
-/*
* flush any dirty pages for this process, and check for write errors.
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
@@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
- struct inode *inode = file_inode(file);
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_file *af = file->private_data;
- unsigned long priv;
- vm_fault_t ret = VM_FAULT_RETRY;
-
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- afs_validate(vnode, af->key);
- sb_start_pagefault(inode->i_sb);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- goto out;
-#endif
-
- if (folio_wait_writeback_killable(folio))
- goto out;
-
- if (folio_lock_killable(folio) < 0)
- goto out;
-
- /* We mustn't change folio->private until writeback is complete as that
- * details the portion of the page we need to write back and we might
- * need to redirty the page if there's a problem.
- */
- if (folio_wait_writeback_killable(folio) < 0) {
- folio_unlock(folio);
- goto out;
- }
-
- priv = afs_folio_dirty(folio, 0, folio_size(folio));
- priv = afs_folio_dirty_mmapped(priv);
- if (folio_test_private(folio)) {
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
- } else {
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
- }
- file_update_time(file);
-
- ret = VM_FAULT_LOCKED;
-out:
- sb_end_pagefault(inode->i_sb);
- return ret;
+ if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0)
+ return VM_FAULT_SIGBUS;
+ return netfs_page_mkwrite(vmf, NULL);
}
/*
@@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
afs_put_wb_key(wbk);
}
}
-
-/*
- * Clean up a page during invalidation.
- */
-int afs_launder_folio(struct folio *folio)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- struct iov_iter iter;
- struct bio_vec bv;
- unsigned long priv;
- unsigned int f, t;
- int ret = 0;
-
- _enter("{%lx}", folio->index);
-
- priv = (unsigned long)folio_get_private(folio);
- if (folio_clear_dirty_for_io(folio)) {
- f = 0;
- t = folio_size(folio);
- if (folio_test_private(folio)) {
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- }
-
- bvec_set_folio(&bv, folio, t - f, f);
- iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-
- trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
- ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
- folio_detach_private(folio);
- folio_wait_fscache(folio);
- return ret;
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct afs_vnode *vnode = priv;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS)
- afs_invalidate_cache(vnode, 0);
-}
-
-/*
- * Save the write to the cache also.
- */
-static void afs_write_to_cache(struct afs_vnode *vnode,
- loff_t start, size_t len, loff_t i_size,
- bool caching)
-{
- fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->netfs.inode.i_mapping, start, len, i_size,
- afs_write_to_cache_done, vnode, caching);
-}
diff --git a/fs/aio.c b/fs/aio.c
index ffe65c1aab4e..bb2ff48991f3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,6 @@ static struct ctl_table aio_sysctls[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
- {}
};
static void __init aio_sysctl_init(void)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d26222b7eefe..0496cb5b6eab 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
struct inode *inode;
struct file *file;
@@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name,
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
- if (secure) {
+ if (make_inode) {
inode = anon_inode_make_secure_inode(name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
@@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name,
EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
- * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
* !S_PRIVATE anon inode rather than reuse the
* singleton anon inode and calls the
- * inode_init_security_anon() LSM hook. This
- * allows for both the inode to have its own
- * security context and for the LSM to enforce
- * policy on the inode's creation.
+ * inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
- * reference to it is not held. Returns the newly created file* or an error
- * pointer. See the anon_inode_getfile() documentation for more information.
+ * reference to it is not held.
+ *
+ * Returns the newly created file* or an error pointer.
*/
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
@@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name,
return __anon_inode_getfile(name, fops, priv, flags,
context_inode, true);
}
+EXPORT_SYMBOL_GPL(anon_inode_create_getfile);
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
int error, fd;
struct file *file;
@@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name,
fd = error;
file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- secure);
+ make_inode);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_put_unused_fd;
@@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/**
- * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
* !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
- * the inode_init_security_anon() LSM hook. This allows the inode to have its
- * own security context and for a LSM to reject creation of the inode.
+ * the inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
* reference to it is not held.
+ *
+ * Returns a newly created file descriptor or an error code.
*/
-int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
{
return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}
-EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
static int __init anon_inode_init(void)
{
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 038b3d2d9f57..39d8c84c16f4 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -73,12 +73,9 @@ done:
/* p->d_lock held */
static struct dentry *positive_after(struct dentry *p, struct dentry *child)
{
- if (child)
- child = list_next_entry(child, d_child);
- else
- child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
+ child = child ? d_next_sibling(child) : d_first_child(p);
- list_for_each_entry_from(child, &p->d_subdirs, d_child) {
+ hlist_for_each_entry_from(child, d_sib) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
dget_dlock(child);
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fddc7be58022..5cdfef3b551a 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
depends on BCACHEFS_FS
select FS_POSIX_ACL
-config BCACHEFS_DEBUG_TRANSACTIONS
- bool "bcachefs runtime info"
- depends on BCACHEFS_FS
- help
- This makes the list of running btree transactions available in debugfs.
-
- This is a highly useful debugging feature but does add a small amount of overhead.
-
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
@@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT
help
This disables device latency tracking and time stats, only for performance testing
+config BCACHEFS_SIX_OPTIMISTIC_SPIN
+ bool "Optimistic spinning for six locks"
+ depends on BCACHEFS_FS
+ depends on SMP
+ default y
+ help
+ Instead of immediately sleeping when attempting to take a six lock that
+ is held by another thread, spin for a short while, as long as the
+ thread owning the lock is running.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index b81268418174..1a05cecda7cc 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
- counters.o \
darray.o \
debug.o \
dirent.o \
@@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \
@@ -82,6 +82,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ thread_with_file.o \
trace.o \
two_state_shared_lock.o \
util.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 1fec0e67891f..fd3e175d8342 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe, c, err,
- alloc_key_empty_but_have_data,
+ bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
+ c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
case BCH_DATA_sb:
@@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
- alloc_key_dirty_sectors_0,
+ bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
+ c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
+ bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
- a.v->dirty_sectors ||
- a.v->stripe, c, err,
- alloc_key_cached_inconsistency,
+ bch2_bucket_sectors_dirty(*a.v) ||
+ a.v->stripe,
+ c, err, alloc_key_cached_inconsistency,
"data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, err,
- alloc_key_cached_but_read_time_zero,
+ c, err, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
@@ -324,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "gen %u oldest_gen %u data_type %s",
- a->gen, a->oldest_gen,
- a->data_type < BCH_DATA_NR
- ? bch2_data_types[a->data_type]
- : "(invalid data type)");
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@@ -356,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- prt_newline(out);
-
- if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
- struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
- const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
- prt_newline(out);
- bch2_backpointer_to_text(out, &bps[i]);
- }
-
- printbuf_indent_sub(out, 2);
- }
-
printbuf_indent_sub(out, 2);
}
@@ -537,18 +513,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
int bch2_bucket_gens_init(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a;
struct bkey_i_bucket_gens g;
bool have_bucket_gens_key = false;
- unsigned offset;
- struct bpos pos;
- u8 gen;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -556,13 +526,14 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- gen = bch2_alloc_to_v4(k, &a)->gen;
- pos = alloc_gens_pos(iter.pos, &offset);
+ struct bch_alloc_v4 a;
+ u8 gen = bch2_alloc_to_v4(k, &a)->gen;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(iter.pos, &offset);
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
@@ -576,45 +547,37 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
if (have_bucket_gens_key && !ret)
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- const struct bch_bucket_gens *g;
- u64 b;
-
- for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- g = bkey_s_c_to_bucket_gens(k).v;
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
/*
* Not a fsck error because this is checked/repaired by
@@ -623,19 +586,17 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_exists2(c, k.k->p.inode))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- for (b = max_t(u64, ca->mi.first_bucket, start);
+ for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
b++)
*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
} else {
- struct bch_alloc_v4 a;
-
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -643,19 +604,18 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
}
bch2_trans_put(trans);
up_read(&c->gc_lock);
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
return ret;
}
@@ -768,83 +728,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_alloc(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, *new_a;
- const struct bch_alloc_v4 *old_a;
- u64 old_lru, new_lru;
int ret = 0;
- /*
- * Deletion only happens in the device removal path, with
- * BTREE_TRIGGER_NORUN:
- */
- BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = &bkey_i_to_alloc_v4(new)->v;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ struct bch_alloc_v4 old_a_convert;
+ const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (new_a->dirty_sectors > old_a->dirty_sectors ||
- new_a->cached_sectors > old_a->cached_sectors) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- }
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
- if (ret)
- return ret;
- }
+ if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ if (ret)
+ return ret;
+ }
- old_lru = alloc_lru_idx_read(*old_a);
- new_lru = alloc_lru_idx_read(*new_a);
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans, new->k.p.inode,
- bucket_to_u64(new->k.p),
- old_lru, new_lru);
- if (ret)
- return ret;
+ u64 old_lru = alloc_lru_idx_read(*old_a);
+ u64 new_lru = alloc_lru_idx_read(*new_a);
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
+ }
+
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new.k->p.inode));
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new.k->p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
+ -((s64) old_a->cached_sectors));
+ if (ret)
+ return ret;
+ }
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new->k.p.inode));
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
+ u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq = new_a->journal_seq;
- if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
- ret = bch2_lru_change(trans,
- BCH_LRU_FRAGMENTATION_START,
- bucket_to_u64(new->k.p),
- old_a->fragmentation_lru, new_a->fragmentation_lru);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
+
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then the we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ }
+
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ percpu_down_read(&c->mark_lock);
+ if (new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+ percpu_up_read(&c->mark_lock);
}
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_GC) &&
+ (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
+ struct bch_alloc_v4 new_a_convert;
+ const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
}
return 0;
@@ -869,8 +923,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
bch2_trans_copy_iter(&iter2, iter);
- if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+ struct btree_path *path = btree_iter_path(iter->trans, iter);
+ if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
@@ -898,7 +953,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
struct bch_dev *ca;
- unsigned iter;
if (bch2_dev_bucket_exists(c, *bucket))
return true;
@@ -916,8 +970,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
}
rcu_read_lock();
- iter = bucket->inode;
- ca = __bch2_next_dev(c, &iter, NULL);
+ ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (ca)
*bucket = POS(ca->dev_idx, ca->mi.first_bucket);
rcu_read_unlock();
@@ -1158,9 +1211,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- if (c->sb.version < bcachefs_metadata_version_bucket_gens)
- return 0;
-
bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
k = bch2_btree_iter_peek_slot(bucket_gens_iter);
@@ -1212,7 +1262,7 @@ fsck_err:
return ret;
}
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
@@ -1267,28 +1317,10 @@ delete:
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
goto out;
}
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end)
-{
- if (!btree_id_is_extents(iter->btree_id)) {
- return __bch2_check_discard_freespace_key(trans, iter);
- } else {
- int ret = 0;
-
- while (!bkey_eq(iter->pos, end) &&
- !(ret = btree_trans_too_many_iters(trans) ?:
- __bch2_check_discard_freespace_key(trans, iter)))
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
- return ret;
- }
-}
-
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
@@ -1422,8 +1454,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
}
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1442,23 +1473,50 @@ bkey_err:
if (ret < 0)
goto err;
- ret = for_each_btree_key2(trans, iter,
+ ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key2(trans, iter,
- BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key_commit(trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k) ?:
+ bch2_check_discard_freespace_key(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "while checking %s", buf.buf);
+ printbuf_exit(&buf);
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1486,6 +1544,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (a->data_type != BCH_DATA_cached)
return 0;
+ if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+
+ a = &a_mut->v;
+ }
+
lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
lru_pos(alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
@@ -1494,41 +1573,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
return ret;
- if (fsck_err_on(!a->io_time[READ], c,
- alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
- fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- u64 read_time = a->io_time[READ] ?:
- atomic64_read(&c->io_clock[READ].now);
-
ret = bch2_lru_set(trans,
alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
- read_time);
+ a->io_time[READ]);
if (ret)
goto err;
-
- if (a->io_time[READ] != read_time) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = read_time;
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
- if (ret)
- goto err;
- }
}
err:
fsck_err:
@@ -1539,27 +1595,45 @@ fsck_err:
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ struct bch_dev *ca;
+ u64 need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+ if (s->ca == ca)
+ return;
+
+ if (s->ca && s->need_journal_commit_this_dev >
+ bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ s->ca = ca;
+ s->need_journal_commit_this_dev = 0;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- u64 *seen,
- u64 *open,
- u64 *need_journal_commit,
- u64 *discarded)
+ struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1571,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
+
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
+ discard_buckets_next_dev(c, s, ca);
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- (*open)++;
+ s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
- (*need_journal_commit)++;
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
goto out;
}
@@ -1637,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
*/
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(trans);
blkdev_issue_discard(ca->disk_sb.bdev,
k.k->p.offset * ca->mi.bucket_size,
ca->mi.bucket_size,
@@ -1655,14 +1733,14 @@ write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
- this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
- (*discarded)++;
+ count_event(c, bucket_discard);
+ s->discarded++;
out:
- (*seen)++;
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@@ -1672,9 +1750,7 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1684,21 +1760,16 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded)));
-
- if (need_journal_commit * 2 > seen)
- bch2_journal_flush_async(&c->journal, NULL);
+ for_each_btree_key(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ discard_buckets_next_dev(c, &s, NULL);
- trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -1760,7 +1831,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1795,22 +1866,18 @@ err:
static void bch2_do_invalidates_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
- struct bch_dev *ca;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned i;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
s64 nr_to_invalidate =
should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_INTENT, k,
@@ -1884,8 +1951,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bucket_do_index(trans, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1905,8 +1971,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1937,8 +2002,6 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
bool doing_init = false;
@@ -1947,7 +2010,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
* every mount:
*/
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@@ -2007,15 +2070,13 @@ out:
void bch2_recalc_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i;
lockdep_assert_held(&c->state_lock);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
@@ -2023,7 +2084,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
u64 dev_reserve = 0;
/*
@@ -2079,11 +2140,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
u64 ret = U64_MAX;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 73faf99a222a..e7f7e842ee1b 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
}
+static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors + a.cached_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ int d = bch2_bucket_sectors_dirty(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
if (!data_type_movable(a.data_type) ||
- a.dirty_sectors >= ca->mi.bucket_size)
+ !bch2_bucket_sectors_fragmented(ca, a))
return 0;
- return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+ u64 d = bch2_bucket_sectors_dirty(a);
+ return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
@@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 16, \
})
@@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.key_invalid = bch2_alloc_v4_invalid, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
})
@@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
-int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
new file mode 100644
index 000000000000..b4ec20be93b8
--- /dev/null
+++ b/fs/bcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0e6157982607..633d3223b353 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
ca->alloc_cursor = 0;
rcu_read_unlock();
}
@@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- if (!c->blocked_allocate_open_bucket)
- c->blocked_allocate_open_bucket = local_clock();
-
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- if (c->blocked_allocate_open_bucket) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate_open_bucket],
- c->blocked_allocate_open_bucket);
- c->blocked_allocate_open_bucket = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, false);
- if (c->blocked_allocate) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate],
- c->blocked_allocate);
- c->blocked_allocate = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
if (!ob)
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
err:
- if (iter.trans && iter.path)
+ if (iter.path)
set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -447,7 +435,7 @@ again:
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
- citer.path->preserve = false;
+ set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -502,7 +490,7 @@ again:
ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
if (ob) {
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
break;
}
}
@@ -567,8 +555,8 @@ again:
goto again;
}
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c,
bch_dev_bkey_exists(c, ob->dev)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
__clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- ? durability : 1;
+ *nr_effective += durability;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
@@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
devs = target_rw_devs(c, wp->data_type, target);
/* Don't allocate from devices we already have pointers to: */
- for (i = 0; i < devs_have->nr; i++)
- __clear_bit(devs_have->devs[i], devs.d);
+ darray_for_each(*devs_have, i)
+ __clear_bit(*i, devs.d);
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
@@ -1539,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
- atomic_read(&ob->pin),
- data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 23c0834a97a4..b4dc319bcb2b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -3,6 +3,7 @@
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
@@ -136,15 +137,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_i_backpointer *bp_k,
+ struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
bool insert)
{
struct btree_iter bp_iter;
struct bkey_s_c k;
+ struct bkey_i_backpointer *bp_k;
int ret;
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bp_k->k.p,
BTREE_ITER_INTENT|
@@ -375,41 +391,45 @@ fsck_err:
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_btree_backpointer(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
-struct bpos_level {
- unsigned level;
- struct bpos pos;
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+ struct bpos bucket_start;
+ struct bpos bucket_end;
+ struct bkey_buf last_flushed;
};
static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
- struct bkey_s_c orig_k,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
+ struct bkey_buf tmp;
int ret;
- if (bpos_lt(bucket, bucket_start) ||
- bpos_gt(bucket, bucket_end))
+ bch2_bkey_buf_init(&tmp);
+
+ if (bpos_lt(bucket, s->bucket_start) ||
+ bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@@ -424,13 +444,20 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (last_flushed->level != bp.level ||
- !bpos_eq(last_flushed->pos, orig_k.k->p)) {
- last_flushed->level = bp.level;
- last_flushed->pos = orig_k.k->p;
+ bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+
+ if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
+ if (bp.level) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
+ bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
goto missing;
@@ -439,6 +466,7 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
missing:
@@ -448,8 +476,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
- c->opts.reconstruct_alloc ||
+ if (c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
@@ -457,25 +484,16 @@ missing:
}
static int check_extent_to_backpointers(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct extents_to_bp_state *s,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek_all_levels(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket_pos;
@@ -484,12 +502,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
+ ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@@ -498,47 +514,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree_id,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ int *level)
{
struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree_id);
struct btree_iter iter;
struct btree *b;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
int ret;
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+retry:
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+ 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
b = bch2_btree_iter_peek_node(&iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
- BUG_ON(b != btree_node_root(c, b));
-
- k = bkey_i_to_s_c(&b->key);
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
- struct bch_backpointer bp;
-
- if (p.ptr.cached)
- continue;
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry;
+ }
- bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
- k, p, &bucket_pos, &bp);
+ *level = b->c.level;
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
- if (ret)
- goto err;
- }
+ k = bkey_i_to_s_c(&b->key);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, btree_bytes(c));
+ return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,49 +611,57 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct bpos bucket_start,
- struct bpos bucket_end)
+ struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- enum btree_id btree_id;
- struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
int ret = 0;
- for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
- unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
+ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- depth,
- BTREE_ITER_ALL_LEVELS|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent_to_backpointers(trans, &iter,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
- } while (!bch2_btree_iter_advance(&iter));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
+ if (ret)
+ return ret;
- bch2_trans_iter_exit(trans, &iter);
+ while (level >= depth) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ level,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ ret = bkey_err(k) ?:
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret)
+ break;
+ if (bpos_eq(iter.pos, SPOS_MAX))
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- break;
+ if (ret)
+ return ret;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_btree_root_to_backpointers(trans, btree_id,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
+ --level;
+ }
}
- return ret;
+
+ return 0;
}
static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
@@ -714,40 +723,45 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct bpos start = POS_MIN, end;
+ struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
- if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ if ( bpos_eq(s.bucket_start, POS_MIN) &&
+ !bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ if (!bpos_eq(s.bucket_start, POS_MIN) ||
+ !bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, start);
+ bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, end);
+ bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
- if (ret || bpos_eq(end, SPOS_MAX))
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
- start = bpos_successor(end);
+ s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -801,13 +815,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = SPOS_MAX;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));
@@ -854,7 +866,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index ab866feeaf66..327365a9feac 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
@@ -63,7 +64,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
@@ -72,28 +73,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_s_c orig_k,
bool insert)
{
- struct bch_fs *c = trans->c;
- struct bkey_i_backpointer *bp_k;
- int ret;
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
- ret = PTR_ERR_OR_ZERO(bp_k);
- if (ret)
- return ret;
+ struct bkey_i_backpointer bp_k;
- bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
- bp_k->v = bp;
+ bkey_backpointer_init(&bp_k.k_i);
+ bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k.v = bp;
if (!insert) {
- bp_k->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k->k, 0);
+ bp_k.k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k.k, 0);
}
- if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b62737fdf5ab..b80c6c9efd8c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -193,6 +193,7 @@
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
@@ -223,9 +224,11 @@
#define race_fault(...) dynamic_fault("bcachefs:race")
+#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
+
#define trace_and_count(_c, _name, ...) \
do { \
- this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ count_event(_c, _name); \
trace_##_name(__VA_ARGS__); \
} while (0)
@@ -262,46 +265,76 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+__printf(2, 3)
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
+#define maybe_dev_to_fs(_c) _Generic((_c), \
+ struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
+ struct bch_fs *: (_c))
+
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
+#define bch2_print_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+static inline bool should_print_err(int err)
+{
+ return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
+}
#define bch_err_fn(_c, _ret) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
} while (0)
+#define bch_err_fn_ratelimited(_c, _ret) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
#define bch_err_msg(_c, _ret, _msg, ...) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error " _msg " %s", __func__, \
##__VA_ARGS__, bch2_err_str(_ret)); \
} while (0)
@@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_merge) \
x(btree_node_sort) \
x(btree_node_read) \
+ x(btree_node_read_done) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
@@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
- x(blocked_journal) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
+ x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
@@ -428,6 +465,7 @@ enum bch_time_stats {
#include "replicas_types.h"
#include "subvolume_types.h"
#include "super_types.h"
+#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -564,32 +602,35 @@ struct bch_dev {
struct io_count __percpu *io_done;
};
-enum {
- /* startup: */
- BCH_FS_STARTED,
- BCH_FS_MAY_GO_RW,
- BCH_FS_RW,
- BCH_FS_WAS_RW,
-
- /* shutdown: */
- BCH_FS_STOPPING,
- BCH_FS_EMERGENCY_RO,
- BCH_FS_GOING_RO,
- BCH_FS_WRITE_DISABLE_COMPLETE,
- BCH_FS_CLEAN_SHUTDOWN,
-
- /* fsck passes: */
- BCH_FS_FSCK_DONE,
- BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
- BCH_FS_NEED_ANOTHER_GC,
-
- BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
-
- /* errors: */
- BCH_FS_ERROR,
- BCH_FS_TOPOLOGY_ERROR,
- BCH_FS_ERRORS_FIXED,
- BCH_FS_ERRORS_NOT_FIXED,
+/*
+ * initial_gc_unfixed
+ * error
+ * topology error
+ */
+
+#define BCH_FS_FLAGS() \
+ x(started) \
+ x(may_go_rw) \
+ x(rw) \
+ x(was_rw) \
+ x(stopping) \
+ x(emergency_ro) \
+ x(going_ro) \
+ x(write_disable_complete) \
+ x(clean_shutdown) \
+ x(fsck_running) \
+ x(initial_gc_unfixed) \
+ x(need_another_gc) \
+ x(need_delete_dead_snapshots) \
+ x(error) \
+ x(topology_error) \
+ x(errors_fixed) \
+ x(errors_not_fixed)
+
+enum bch_fs_flags {
+#define x(n) BCH_FS_##n,
+ BCH_FS_FLAGS()
+#undef x
};
struct btree_debug {
@@ -599,10 +640,11 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
+ struct bch2_time_stats duration;
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
- unsigned wb_updates_size;
+ unsigned journal_entries_size;
unsigned max_mem;
char *max_paths_text;
};
@@ -664,7 +706,8 @@ struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
- x(sysfs)
+ x(sysfs) \
+ x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -689,6 +732,8 @@ struct bch_fs {
struct super_block *vfs_sb;
dev_t dev;
char name[40];
+ struct stdio_redirect *stdio;
+ struct task_struct *stdio_filter;
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock;
@@ -699,6 +744,13 @@ struct bch_fs {
#else
struct percpu_ref writes;
#endif
+ /*
+ * Analagous to c->writes, for asynchronous ops that don't necessarily
+ * need fs to be read-write
+ */
+ refcount_t ro_ref;
+ wait_queue_head_t ro_ref_wait;
+
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
@@ -1002,10 +1054,21 @@ struct bch_fs {
/* RECOVERY */
u64 journal_replay_seq_start;
u64 journal_replay_seq_end;
+ /*
+ * Two different uses:
+ * "Has this fsck pass?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
enum bch_recovery_pass curr_recovery_pass;
/* bitmap of explicitly enabled recovery passes: */
u64 recovery_passes_explicit;
+ /* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
+ /* never rewinds version of curr_recovery_pass */
+ enum bch_recovery_pass recovery_pass_done;
+ struct semaphore online_fsck_mutex;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
@@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
#endif
}
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget(&c->writes);
+#endif
+}
+
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget_live(&c->writes);
@@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
if (atomic_long_read(&c->writes[i]))
return;
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
#else
percpu_ref_put(&c->writes);
#endif
}
+static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_stopping, &c->flags))
+ return false;
+
+ return refcount_inc_not_zero(&c->ro_ref);
+}
+
+static inline void bch2_ro_ref_put(struct bch_fs *c)
+{
+ if (refcount_dec_and_test(&c->ro_ref))
+ wake_up(&c->ro_ref_wait);
+}
+
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
@@ -1117,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> 9;
-}
-
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);
@@ -1158,6 +1240,15 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
return dev < c->sb.nr_devices && c->devs[dev];
}
+static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+ return stdio;
+}
+
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index fe78e87603fc..0668b682a21c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -307,6 +307,13 @@ struct bkey_i {
struct bch_val v;
};
+#define POS_KEY(_pos) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = _pos, \
+})
+
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@@ -410,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- __u64 fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
-
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@@ -1014,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-/* Quotas: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
- __u8 pad;
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- */
- __le32 parent;
- __le32 pad;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-/* Snapshots */
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us persistent indentifier for each tree of
- * bch_snapshot nodes, and allow us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
/* LRU btree: */
struct bch_lru {
@@ -1171,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1223,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@@ -1296,6 +553,7 @@ struct bch_member {
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
+ __le64 seq;
};
#define BCH_MEMBER_V1_BYTES 56
@@ -1442,7 +700,7 @@ struct bch_sb_field_replicas_v0 {
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
@@ -1454,24 +712,7 @@ struct bch_replicas_entry {
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
+ struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
@@ -1492,99 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -1662,69 +810,41 @@ struct bch_sb_field_downgrade {
#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-#define RECOVERY_PASS_ALL_FSCK (1ULL << 63)
-
/*
* field 1: version name
* field 2: BCH_VERSION(major, minor)
* field 3: recovery passess required on upgrade
*/
#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_btree_change, BCH_VERSION(0, 11), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot, BCH_VERSION(0, 12), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_backpointers, BCH_VERSION(0, 13), \
- RECOVERY_PASS_ALL_FSCK) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_2, BCH_VERSION(0, 15), \
- BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \
- BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(reflink_p_fix, BCH_VERSION(0, 16), \
- BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \
- x(subvol_dirent, BCH_VERSION(0, 17), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v2, BCH_VERSION(0, 18), \
- RECOVERY_PASS_ALL_FSCK) \
- x(freespace, BCH_VERSION(0, 19), \
- RECOVERY_PASS_ALL_FSCK) \
- x(alloc_v4, BCH_VERSION(0, 20), \
- RECOVERY_PASS_ALL_FSCK) \
- x(new_data_types, BCH_VERSION(0, 21), \
- RECOVERY_PASS_ALL_FSCK) \
- x(backpointers, BCH_VERSION(0, 22), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, BCH_VERSION(0, 23), \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, BCH_VERSION(0, 24), \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, BCH_VERSION(0, 25), \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, BCH_VERSION(0, 26), \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, BCH_VERSION(0, 27), \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, BCH_VERSION(0, 29), \
- RECOVERY_PASS_ALL_FSCK) \
- x(major_minor, BCH_VERSION(1, 0), \
- 0) \
- x(snapshot_skiplists, BCH_VERSION(1, 1), \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
- x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
- x(rebalance_work, BCH_VERSION(1, 3), \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ x(bkey_renumber, BCH_VERSION(0, 10)) \
+ x(inode_btree_change, BCH_VERSION(0, 11)) \
+ x(snapshot, BCH_VERSION(0, 12)) \
+ x(inode_backpointers, BCH_VERSION(0, 13)) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
+ x(snapshot_2, BCH_VERSION(0, 15)) \
+ x(reflink_p_fix, BCH_VERSION(0, 16)) \
+ x(subvol_dirent, BCH_VERSION(0, 17)) \
+ x(inode_v2, BCH_VERSION(0, 18)) \
+ x(freespace, BCH_VERSION(0, 19)) \
+ x(alloc_v4, BCH_VERSION(0, 20)) \
+ x(new_data_types, BCH_VERSION(0, 21)) \
+ x(backpointers, BCH_VERSION(0, 22)) \
+ x(inode_v3, BCH_VERSION(0, 23)) \
+ x(unwritten_extents, BCH_VERSION(0, 24)) \
+ x(bucket_gens, BCH_VERSION(0, 25)) \
+ x(lru_v2, BCH_VERSION(0, 26)) \
+ x(fragmentation_lru, BCH_VERSION(0, 27)) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
+ x(snapshot_trees, BCH_VERSION(0, 29)) \
+ x(major_minor, BCH_VERSION(1, 0)) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1)) \
+ x(deleted_inodes, BCH_VERSION(1, 2)) \
+ x(rebalance_work, BCH_VERSION(1, 3)) \
+ x(member_seq, BCH_VERSION(1, 4))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
-#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n,
+#define x(t, n) bcachefs_metadata_version_##t = n,
BCH_METADATA_VERSIONS()
#undef x
bcachefs_metadata_version_max
@@ -1786,7 +906,8 @@ struct bch_sb {
__le32 time_base_hi;
__le32 time_precision;
- __le64 flags[8];
+ __le64 flags[7];
+ __le64 write_time;
__le64 features[2];
__le64 compat[2];
@@ -2153,7 +1274,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
- x(overwrite, 10)
+ x(overwrite, 10) \
+ x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -2162,6 +1284,19 @@ enum {
BCH_JSET_ENTRY_NR
};
+static inline bool jset_entry_is_key(struct jset_entry *e)
+{
+ switch (e->type) {
+ case BCH_JSET_ENTRY_btree_keys:
+ case BCH_JSET_ENTRY_btree_root:
+ case BCH_JSET_ENTRY_overwrite:
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ return true;
+ }
+
+ return false;
+}
+
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
@@ -2203,7 +1338,7 @@ struct jset_entry_usage {
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {
@@ -2224,8 +1359,8 @@ struct jset_entry_dev_usage {
__le32 dev;
__u32 pad;
- __le64 buckets_ec;
- __le64 _buckets_unavailable; /* No longer used */
+ __le64 _buckets_ec; /* No longer used */
+ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[];
};
@@ -2239,7 +1374,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage
struct jset_entry_log {
struct jset_entry entry;
u8 d[];
-} __packed;
+} __packed __aligned(8);
/*
* On disk format for a journal entry:
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index f05881f7e113..4b8fba754b1c 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -81,6 +81,11 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+#define BCH_DATA_OPS() \
+ x(scrub, 0) \
+ x(rereplicate, 1) \
+ x(migrate, 2) \
+ x(rewrite_old_nodes, 3) \
+ x(drop_extra_replicas, 4)
+
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_REWRITE_OLD_NODES = 3,
- BCH_DATA_OP_NR = 4,
+#define x(t, n) BCH_DATA_OP_##t = n,
+ BCH_DATA_OPS()
+#undef x
+ BCH_DATA_OP_NR
};
/*
@@ -237,7 +248,7 @@ struct bch_ioctl_data_event {
struct bch_replicas_usage {
__u64 sectors;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *
@@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage {
__u32 replica_entries_bytes;
__u32 pad;
- struct bch_replicas_usage replicas[0];
+ struct bch_replicas_usage replicas[];
};
/*
@@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage {
__u64 buckets;
__u64 sectors;
__u64 fragmented;
- } d[BCH_DATA_NR];
+ } d[10];
+};
+
+struct bch_ioctl_dev_usage_v2 {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 nr_data_types;
+ __u8 pad[6];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ struct bch_ioctl_dev_usage_type d[];
};
/*
@@ -365,4 +389,24 @@ struct bch_ioctl_subvolume {
#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_offline {
+ __u64 flags;
+ __u64 opts; /* string */
+ __u64 nr_devs;
+ __u64 devs[] __counted_by(nr_devs);
+};
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_online {
+ __u64 flags;
+ __u64 opts; /* string */
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index abdb05507d16..76e79a15ba08 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
- bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 761f5e33b1e6..5e52684764eb 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
+
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
+
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
+ .val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3a370b7087ac..03efe8ee565a 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -28,10 +28,8 @@ struct bkey_ops {
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
- int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -78,84 +76,88 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-static inline int bch2_mark_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->atomic_trigger
- ? ops->atomic_trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
+ __BTREE_TRIGGER_NORUN,
+ __BTREE_TRIGGER_TRANSACTIONAL,
+ __BTREE_TRIGGER_ATOMIC,
+ __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
-
- __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+/* Don't run triggers at all */
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+/*
+ * If set, we're running transactional triggers as part of a transaction commit:
+ * triggers may generate new updates
+ *
+ * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set,
+ * we're running atomic triggers during a transaction commit: we have our
+ * journal reservation, we're holding btree node write locks, and we know the
+ * transaction is going to commit (returning an error here is a fatal error,
+ * causing us to go emergency read-only)
+ */
+#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+
+/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+
+/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-static inline int bch2_trans_mark_key(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static inline int bch2_key_trigger(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
- return ops->trans_trigger
- ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ return ops->trigger
+ ? ops->trigger(trans, btree, level, old, new, flags)
: 0;
}
-static inline int bch2_trans_mark_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+static inline int bch2_key_trigger_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
deleted.k.p = old.k->p;
- return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
+ BTREE_TRIGGER_OVERWRITE|flags);
}
-static inline int bch2_trans_mark_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_i *new, unsigned flags)
+static inline int bch2_key_trigger_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
- deleted.k.p = new->k.p;
+ deleted.k.p = new.k->p;
- return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index bb73ba9017b0..3fd1085b6c61 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
_k = _n) {
_n = bkey_p_next(_k);
+ if (!_k->u64s) {
+ printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
+ _k->_data - i->_data);
+ break;
+ }
+
k = bkey_disassemble(b, _k, &uk);
printbuf_reset(&buf);
@@ -714,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
- unsigned j, cacheline = 1;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -817,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 632c2b8c5460..79c77baaa383 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 79495cd7a794..d7c81beac14a 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
+ b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(btree_bytes(c));
+ b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, btree_bytes(c));
+ kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
@@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
@@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_and_count(c, btree_cache_cannibalize_lock, c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
@@ -673,7 +675,7 @@ err:
mutex_unlock(&bc->lock);
- trace_and_count(c, btree_cache_cannibalize, c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
@@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (IS_ERR(b))
return b;
- /*
- * Btree nodes read in from disk should not have the accessed bit set
- * initially, so that linear scans don't thrash the cache:
- */
- clear_btree_node_accessed(b);
-
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
@@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (path && sync)
bch2_trans_unlock_noassert(trans);
- bch2_btree_node_read(c, b, sync);
+ bch2_btree_node_read(trans, b, sync);
if (!sync)
return NULL;
@@ -1039,7 +1035,7 @@ retry:
goto retry;
if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(c, NULL))
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
goto retry;
if (IS_ERR(b))
@@ -1087,7 +1083,7 @@ lock_node:
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
out:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return b;
}
@@ -1196,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index cfb80b201d61..6d33885fdbde 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
{
- return c->opts.btree_node_size;
+ return 1UL << b->byte_order;
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
{
- return btree_bytes(c) / PAGE_SIZE;
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c)
{
return btree_sectors(c) >> c->block_bits;
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 30ab78a24517..1102995643b1 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -41,6 +41,14 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
+{
+ return (struct bkey_s) {{{
+ (struct bkey *) k.k,
+ (struct bch_val *) k.v
+ }}};
+}
+
static bool should_restart_for_topology_repair(struct bch_fs *c)
{
return c->opts.fix_errors != FSCK_FIX_no &&
@@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
}
@@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
@@ -414,10 +422,9 @@ again:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
break;
- }
ret = btree_repair_node_boundaries(c, b, prev, cur);
@@ -482,10 +489,9 @@ again:
false);
ret = PTR_ERR_OR_ZERO(cur);
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
goto err;
- }
ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
@@ -591,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -609,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
g->data_type = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -631,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -643,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -658,13 +664,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->data_type],
- bch2_data_types[data_type],
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
g->data_type = data_type;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
- bch_err_msg(c, ret, "allocating new key");
ret = -BCH_ERR_ENOMEM_gc_repair_key;
+ bch_err_msg(c, ret, "allocating new key");
goto err;
}
@@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- unsigned flags =
- BTREE_TRIGGER_GC|
- (initial ? BTREE_TRIGGER_NOATOMIC : 0);
int ret = 0;
deleted.p = k->k->p;
@@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
ret = commit_do(trans, NULL, NULL, 0,
- bch2_mark_key(trans, btree_id, level, old, *k, flags));
+ bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
/* Continue marking when opted to not
* fix the error: */
ret = 0;
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
continue;
}
} else if (ret) {
@@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
fsck_err:
six_unlock_read(&b->c.lock);
- if (ret < 0)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
printbuf_exit(&buf);
return ret;
}
@@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
: bch2_gc_btree(trans, i, initial, metadata_only);
}
- if (ret < 0)
- bch_err_fn(c, ret);
-
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
static void bch2_mark_superblocks(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
mutex_unlock(&c->sb_lock);
}
@@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
@@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c,
bool verify = !metadata_only &&
!c->opts.reconstruct_alloc &&
(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
- unsigned i, dev;
+ unsigned i;
int ret = 0;
percpu_down_write(&c->mark_lock);
@@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c,
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for_each_member_device(ca, c, dev) {
+ __for_each_member_device(c, ca) {
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
@@ -1245,15 +1238,12 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_types[i]);
+ d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_types[i]);
+ d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
-
- copy_dev_field(dev_usage_buckets_ec_wrong,
- buckets_ec, "buckets_ec");
}
{
@@ -1263,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
- hidden, "hidden");
+ b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
- btree, "btree");
+ b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
- data, "data");
+ b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
- cached, "cached");
+ b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
- reserved, "reserved");
+ b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
- nr_inodes,"nr_inodes");
+ b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&
@@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c,
fsck_err:
if (ca)
percpu_ref_put(&ca->ref);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
percpu_up_write(&c->mark_lock);
printbuf_exit(&buf);
@@ -1317,9 +1306,6 @@ fsck_err:
static int bch2_gc_start(struct bch_fs *c)
{
- struct bch_dev *ca = NULL;
- unsigned i;
-
BUG_ON(c->usage_gc);
c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
@@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c)
return -BCH_ERR_ENOMEM_gc_start;
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
BUG_ON(ca->usage_gc);
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
@@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c)
static int bch2_gc_reset(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
@@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
enum bch_data_type type;
int ret;
- if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
- return 1;
-
old = bch2_alloc_to_v4(k, &old_convert);
new = *old;
@@ -1437,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
- bch2_data_types[new.data_type],
- bch2_data_types[gc.data_type]))
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@@ -1448,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
- bch2_data_types[gc.data_type], \
+ bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \
@@ -1488,52 +1468,36 @@ fsck_err:
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
- for_each_member_device(ca, c, i) {
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_alloc_write_key(trans, &iter, k, metadata_only));
-
- if (ret < 0) {
- bch_err_fn(c, ret);
+ for_each_member_device(c, ca) {
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ POS(ca->dev_idx, ca->mi.nbuckets - 1),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ if (ret) {
percpu_ref_put(&ca->ref);
break;
}
}
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
+ bch_err_fn(c, ret);
+ return ret;
}
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bucket *g;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned i;
- int ret;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
percpu_ref_put(&ca->ref);
bch_err(c, "error allocating ca->buckets[gc]");
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
- goto err;
+ return -BCH_ERR_ENOMEM_gc_alloc_start;
}
buckets->first_bucket = ca->mi.first_bucket;
@@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
- ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = gc_bucket(ca, k.k->p.offset);
-
- a = bch2_alloc_to_v4(k, &a_convert);
-
- g->gen_valid = 1;
- g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
- 0;
- }));
-err:
- bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = gc_bucket_array(ca);
struct bucket *g;
@@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
else
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
}
fsck_err:
printbuf_exit(&buf);
@@ -1643,64 +1603,52 @@ fsck_err:
static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx));
-
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
c->reflink_gc_nr = 0;
- bch2_trans_put(trans);
return ret;
}
static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct reflink_gc *r;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
c->reflink_gc_nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
- if (!refcount)
- continue;
+ if (!refcount)
+ continue;
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- }
- bch2_trans_iter_exit(trans, &iter);
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
- bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1768,24 +1716,15 @@ fsck_err:
static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_stripes_key(trans, &iter, k));
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_stripes_key(trans, &iter, k)));
}
static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
@@ -1848,7 +1787,7 @@ again:
#endif
c->gc_count++;
- if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@@ -1860,7 +1799,7 @@ again:
* XXX: make sure gens we fixed got saved
*/
bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ clear_bit(BCH_FS_need_another_gc, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
bch2_gc_stripes_reset(c, metadata_only);
@@ -1900,9 +1839,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bkey_i *u;
int ret;
@@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
int bch2_gc_gens(struct bch_fs *c)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
u64 b, start_time = local_clock();
- unsigned i;
int ret;
/*
@@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c)
trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
- trans = bch2_trans_get(c);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
@@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen[b] = gens->b[b];
}
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
if (btree_type_has_ptrs(i)) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- gc_btree_gens_key(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ gc_btree_gens_key(trans, &iter, k)));
if (ret)
goto err;
}
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_PREFETCH,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_alloc_write_oldest_gen(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_oldest_gen(trans, &iter, k)));
if (ret)
goto err;
@@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvfree(ca->oldest_gen);
ca->oldest_gen = NULL;
}
- bch2_trans_put(trans);
up_read(&c->gc_lock);
mutex_unlock(&c->gc_gens_lock);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
@@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
- int ret;
set_freezable();
@@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg)
#if 0
ret = bch2_gc(c, false, false);
#else
- ret = bch2_gc_gens(c);
+ bch2_gc_gens(c);
#endif
- if (ret < 0)
- bch_err_fn(c, ret);
-
debug_check_no_locks_held();
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 5a720f0cd5a6..aa9b6cbe3226 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(size > btree_bytes(c));
+ BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- for (k = unwritten_whiteouts_start(c, b);
- k != unwritten_whiteouts_end(c, b);
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
- memcpy_u64s(unwritten_whiteouts_start(c, b),
+ memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
- ? btree_bytes(c)
+ ? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(bytes != btree_bytes(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u/%u",
+ b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
prt_str(out, ": ");
@@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
+static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+ struct bset *i, struct bkey_packed *k)
+{
+ if (bkey_p_next(k) > vstruct_last(i))
+ return false;
+
+ if (k->format > KEY_FORMAT_CURRENT)
+ return false;
+
+ struct printbuf buf = PRINTBUF;
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
bool have_retry, bool *saw_error)
@@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
+ unsigned next_good_key;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
@@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "invalid bkey format %u", k->format))
+ goto drop_this_key;
/* XXX: validate k->u64s */
if (!write)
@@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
c, NULL, b, i,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
+ goto drop_this_key;
}
if (write)
@@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
prt_printf(&buf, " > ");
bch2_bkey_to_text(&buf, u.k);
- bch2_dump_bset(c, b, i, 0);
-
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_out_of_order,
- "%s", buf.buf)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "%s", buf.buf))
+ goto drop_this_key;
}
prev = k;
k = bkey_p_next(k);
+ continue;
+drop_this_key:
+ next_good_key = k->u64s;
+
+ if (!next_good_key ||
+ (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
+ version >= bcachefs_metadata_version_snapshot)) {
+ /*
+ * only do scanning if bch2_bkey_compat() has nothing to
+ * do
+ */
+
+ if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ for (next_good_key = 1;
+ next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
+ next_good_key++)
+ if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ goto got_good_key;
+
+ }
+
+ /*
+ * didn't find a good key, have to truncate the rest of
+ * the bset
+ */
+ next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
+ }
+got_good_key:
+ le16_add_cpu(&i->u64s, -next_good_key);
+ memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
}
fsck_err:
printbuf_exit(&buf);
@@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
- struct bch_extent_ptr *ptr;
struct bset *i;
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
@@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
+ u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
/* We might get called multiple times on read retry: */
@@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
- "got wrong btree node (seq %llx want %llx)",
- b->data->keys.seq, bp->seq);
+ "got wrong btree node (want %llx got %llx)\n"
+ "got btree %s level %llu pos %s",
+ bp->seq, b->data->keys.seq,
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
+ BTREE_NODE_LEVEL(b->data),
+ buf.buf);
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
@@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(b->data->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ csum_bad = bch2_crc_cmp(b->data->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(bne->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ csum_bad = bch2_crc_cmp(bne->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1111,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@@ -1123,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
- sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1139,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1234,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1462,7 +1512,7 @@ fsck_err:
}
if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_bytes(c));
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@@ -1528,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_bytes(c)),
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1548,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
return 0;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
- trace_and_count(c, btree_node_read, c, b);
+ trace_and_count(c, btree_node_read, trans, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1614,7 +1665,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_bytes(c)),
+ buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1628,7 +1679,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_bytes(c));
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
-
+ bch2_latency_acct(ca, rb->start_time, READ);
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
@@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
- bch2_btree_node_read(c, b, true);
+ bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work)
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+ ret = -BCH_ERR_btree_write_all_failed;
goto err;
+ }
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
@@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW,
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
@@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- struct bch_extent_ptr *ptr;
BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
bkey_copy(&tmp.k, &wbio->key);
@@ -2022,8 +2074,8 @@ do_write:
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@@ -2199,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e0d7fa5b1dfb..e251cb6b965f 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool, bool *);
-void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index da594e006769..5467a8635be1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -13,6 +13,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
#include "trace.h"
@@ -21,8 +22,8 @@
#include <linux/prefetch.h>
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
- struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *,
+ btree_path_idx_t, btree_path_idx_t);
static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
@@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
#endif
}
-static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
+static void bch2_trans_srcu_lock(struct btree_trans *);
static inline int __btree_path_cmp(const struct btree_path *l,
enum btree_id r_btree_id,
@@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
void bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned iter;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, iter)
bch2_btree_path_verify(trans, path);
}
@@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(iter->btree_id >= BTREE_ID_NR);
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
@@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, iter->update_path);
- bch2_btree_path_verify(trans, iter->path);
+ bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
@@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache)
{
struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
struct printbuf buf = PRINTBUF;
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx) {
+ trans_for_each_path_inorder(trans, path, iter) {
int cmp = cmp_int(path->btree_id, id) ?:
cmp_int(path->cached, key_cache);
@@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct bkey_packed *where)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path) {
+ trans_for_each_path_with_node(trans, b, path, i) {
__bch2_btree_path_fix_key_modified(path, b, where);
bch2_btree_path_verify_level(trans, path, b->c.level);
}
@@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
{
struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
struct btree_path *linked;
+ unsigned i;
if (node_iter != &path->l[b->c.level].iter) {
__bch2_btree_node_iter_fix(path, b, node_iter, t,
@@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
bch2_btree_node_iter_verify(node_iter, b);
}
- trans_for_each_path_with_node(trans, b, linked) {
+ trans_for_each_path_with_node(trans, b, linked, i) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
@@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (!i->cached &&
@@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->btree_id == b->c.btree_id &&
bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+ i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- struct btree_path *path;
+ struct btree_path *prev;
+
+ BUG_ON(!btree_path_pos_in_node(path, b));
+
+ while ((prev = prev_btree_path(trans, path)) &&
+ btree_path_pos_in_node(prev, b))
+ path = prev;
- trans_for_each_path(trans, path)
- if (path->uptodate == BTREE_ITER_UPTODATE &&
- !path->cached &&
- btree_path_pos_in_node(path, b)) {
+ for (;
+ path && btree_path_pos_in_node(path, b);
+ path = next_btree_path(trans, path))
+ if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
@@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path)
+ trans_for_each_path_with_node(trans, b, path, i)
__btree_path_level_init(path, b->c.level);
bch2_trans_revalidate_updates_in_node(trans, b);
@@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
bch2_bkey_buf_reassemble(out, c, k);
- if (flags & BTREE_ITER_PREFETCH)
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
bch2_btree_and_journal_iter_exit(&jiter);
@@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- if (flags & BTREE_ITER_PREFETCH) {
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
goto err;
@@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_path *path;
unsigned long trace_ip = _RET_IP_;
- int i, ret = 0;
+ unsigned i;
+ int ret = 0;
if (trans->in_traverse_all)
return -BCH_ERR_transaction_restart_in_traverse_all;
@@ -963,7 +979,7 @@ retry_all:
trans->restarted = 0;
trans->last_restarted_ip = 0;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
path->should_be_locked = false;
btree_trans_sort_paths(trans);
@@ -977,7 +993,7 @@ retry_all:
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
}
@@ -985,16 +1001,16 @@ retry_all:
/* Now, redo traversals in correct order: */
i = 0;
while (i < trans->nr_sorted) {
- path = trans->paths + trans->sorted[i];
+ btree_path_idx_t idx = trans->sorted[i];
/*
* Traversing a path can cause another path to be added at about
* the same position:
*/
- if (path->uptodate) {
- __btree_path_get(path, false);
- ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- __btree_path_put(path, false);
+ if (trans->paths[idx].uptodate) {
+ __btree_path_get(&trans->paths[idx], false);
+ ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
+ __btree_path_put(&trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1013,7 +1029,7 @@ retry_all:
* then failed to relock a path - that's fine.
*/
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
trans->in_traverse_all = false;
@@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
* stashed in the iterator and returned from bch2_trans_exit().
*/
int bch2_btree_path_traverse_one(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned flags,
unsigned long trace_ip)
{
+ struct btree_path *path = &trans->paths[path_idx];
unsigned depth_want = path->level;
int ret = -((int) trans->restarted);
@@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
+ path = &trans->paths[path_idx];
+
if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
@@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
}
-static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
- bool intent)
+static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
+ bool intent)
{
- struct btree_path *new = btree_path_alloc(trans, src);
-
- btree_path_copy(trans, new, src);
- __btree_path_get(new, intent);
+ btree_path_idx_t new = btree_path_alloc(trans, src);
+ btree_path_copy(trans, trans->paths + new, trans->paths + src);
+ __btree_path_get(trans->paths + new, intent);
return new;
}
__flatten
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
- unsigned long ip)
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
+ btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(path, intent);
+ __btree_path_put(trans->paths + path, intent);
path = btree_path_clone(trans, path, intent);
- path->preserve = false;
+ trans->paths[path].preserve = false;
return path;
}
-struct btree_path * __must_check
+btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip, int cmp)
+ btree_path_idx_t path_idx, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- unsigned level = path->level;
+ int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(!path->ref);
+ EBUG_ON(!trans->paths[path_idx].ref);
- path = bch2_btree_path_make_mut(trans, path, intent, ip);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
+ struct btree_path *path = trans->paths + path_idx;
path->pos = new_pos;
trans->paths_sorted = false;
@@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
@@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
out:
bch2_btree_path_verify(trans, path);
- return path;
+ return path_idx;
}
/* Btree path: main interface: */
@@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr
return NULL;
}
-static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
{
- __bch2_btree_path_unlock(trans, path);
- btree_path_list_remove(trans, path);
- trans->paths_allocated &= ~(1ULL << path->idx);
+ __bch2_btree_path_unlock(trans, trans->paths + path);
+ btree_path_list_remove(trans, trans->paths + path);
+ __clear_bit(path, trans->paths_allocated);
}
-void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *dup;
-
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
+ struct btree_path *path = trans->paths + path_idx, *dup;
if (!__btree_path_put(path, intent))
return;
@@ -1322,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
if (path->should_be_locked &&
!trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {
@@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
dup->should_be_locked |= path->should_be_locked;
}
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, path_idx);
}
-static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
-
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
prt_newline(buf);
@@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- trans_for_each_wb_update(trans, wb) {
- prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_id_str(wb->btree),
- (void *) i->ip_allocated);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
- prt_newline(buf);
- }
+ for (struct jset_entry *e = trans->journal_entries;
+ e != btree_trans_journal_entries_top(trans);
+ e = vstruct_next(e))
+ bch2_journal_entry_to_text(buf, trans->c, e);
printbuf_indent_sub(buf, 2);
}
@@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
printbuf_exit(&buf);
}
-noinline __cold
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
+ struct btree_path *path = trans->paths + path_idx;
+
prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
- path->idx, path->ref, path->intent_ref,
+ path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
bch2_btree_id_str(path->btree_id),
@@ -1434,14 +1438,13 @@ static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
bool nosort)
{
- struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx)
- bch2_btree_path_to_text(out, path);
+ trans_for_each_path_idx_inorder(trans, iter)
+ bch2_btree_path_to_text(out, trans, iter.path_idx);
}
noinline __cold
@@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
struct btree_transaction_stats *s = btree_trans_stats(trans);
struct printbuf buf = PRINTBUF;
-
- if (!s)
- return;
+ size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
bch2_trans_paths_to_text(&buf, trans);
if (!buf.allocation_failure) {
mutex_lock(&s->lock);
- if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
- s->nr_max_paths = trans->nr_max_paths =
- hweight64(trans->paths_allocated);
+ if (nr > s->nr_max_paths) {
+ s->nr_max_paths = nr;
swap(s->max_paths_text, buf.buf);
}
mutex_unlock(&s->lock);
@@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
printbuf_exit(&buf);
- trans->nr_max_paths = hweight64(trans->paths_allocated);
+ trans->nr_paths_max = nr;
+}
+
+noinline __cold
+int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (trace_trans_restart_too_many_iters_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_paths_to_text(&buf, trans);
+ trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_too_many_iters);
+
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
static noinline void btree_path_overflow(struct btree_trans *trans)
{
bch2_dump_trans_paths_updates(trans);
- panic("trans path overflow\n");
+ bch_err(trans->c, "trans path overflow");
}
-static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
- struct btree_path *pos)
+static noinline void btree_paths_realloc(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned idx;
+ unsigned nr = trans->nr_paths * 2;
+
+ void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ sizeof(struct btree_trans_paths) +
+ nr * sizeof(struct btree_path) +
+ nr * sizeof(btree_path_idx_t) + 8 +
+ nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
+
+ unsigned long *paths_allocated = p;
+ memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
+ p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
+
+ p += sizeof(struct btree_trans_paths);
+ struct btree_path *paths = p;
+ *trans_paths_nr(paths) = nr;
+ memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
+ p += nr * sizeof(struct btree_path);
+
+ btree_path_idx_t *sorted = p;
+ memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
+ p += nr * sizeof(btree_path_idx_t) + 8;
+
+ struct btree_insert_entry *updates = p;
+ memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
+
+ unsigned long *old = trans->paths_allocated;
- if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
- btree_path_overflow(trans);
+ rcu_assign_pointer(trans->paths_allocated, paths_allocated);
+ rcu_assign_pointer(trans->paths, paths);
+ rcu_assign_pointer(trans->sorted, sorted);
+ rcu_assign_pointer(trans->updates, updates);
- idx = __ffs64(~trans->paths_allocated);
+ trans->nr_paths = nr;
+
+ if (old != trans->_paths_allocated)
+ kfree_rcu_mightsleep(old);
+}
+
+static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
+ btree_path_idx_t pos)
+{
+ btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
+
+ if (unlikely(idx == trans->nr_paths)) {
+ if (trans->nr_paths == BTREE_ITER_MAX) {
+ btree_path_overflow(trans);
+ return 0;
+ }
+
+ btree_paths_realloc(trans);
+ }
/*
* Do this before marking the new path as allocated, since it won't be
* initialized yet:
*/
- if (unlikely(idx > trans->nr_max_paths))
+ if (unlikely(idx > trans->nr_paths_max))
bch2_trans_update_max_paths(trans);
- trans->paths_allocated |= 1ULL << idx;
+ __set_bit(idx, trans->paths_allocated);
- path = &trans->paths[idx];
- path->idx = idx;
+ struct btree_path *path = &trans->paths[idx];
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->alloc_seq++;
- btree_path_list_add(trans, pos, path);
+ btree_path_list_add(trans, pos, idx);
trans->paths_sorted = false;
- return path;
+ return idx;
}
-struct btree_path *bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
+btree_path_idx_t bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
{
- struct btree_path *path, *path_pos = NULL;
+ struct btree_path *path;
bool cached = flags & BTREE_ITER_CACHED;
bool intent = flags & BTREE_ITER_INTENT;
- int i;
+ struct trans_for_each_path_inorder_iter iter;
+ btree_path_idx_t path_pos = 0, path_idx;
bch2_trans_verify_not_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
cached,
@@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
level) > 0)
break;
- path_pos = path;
+ path_pos = iter.path_idx;
}
if (path_pos &&
- path_pos->cached == cached &&
- path_pos->btree_id == btree_id &&
- path_pos->level == level) {
- __btree_path_get(path_pos, intent);
- path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ trans->paths[path_pos].cached == cached &&
+ trans->paths[path_pos].btree_id == btree_id &&
+ trans->paths[path_pos].level == level) {
+ __btree_path_get(trans->paths + path_pos, intent);
+ path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = trans->paths + path_idx;
} else {
- path = btree_path_alloc(trans, path_pos);
- path_pos = NULL;
+ path_idx = btree_path_alloc(trans, path_pos);
+ path = trans->paths + path_idx;
__btree_path_get(path, intent);
path->pos = pos;
@@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
path->ip_allocated = ip;
@@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
- return path;
+ return path_idx;
}
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
@@ -1659,9 +1717,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter)
int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
int ret;
- iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(trans->paths + iter->path);
return 0;
}
@@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
struct btree *b = NULL;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_btree_iter_verify(iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
- b = btree_path_node(iter->path, iter->path->level);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ b = btree_path_node(path, path->level);
if (!b)
goto out;
@@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- struct btree_path *path = iter->path;
struct btree *b = NULL;
int ret;
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
/* already at end? */
if (!btree_path_node(path, path->level))
return NULL;
@@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- path = iter->path =
- bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
+ bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ path = btree_iter_path(trans, iter);
btree_path_set_level_down(trans, path, iter->min_depth);
- ret = bch2_btree_path_traverse(trans, path, iter->flags);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
+ path = btree_iter_path(trans, iter);
b = path->l[path->level].b;
}
@@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
- BUG_ON(iter->path->uptodate);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1799,23 +1862,15 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
- } else {
- if (!btree_path_node(iter->path, iter->path->level))
- return true;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- iter->advanced = true;
- return false;
- }
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
}
static noinline
-struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- struct btree_insert_entry *i;
- struct bkey_i *ret = NULL;
+ struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
- trans_for_each_update(iter->trans, i) {
- if (i->btree_id < iter->btree_id)
- continue;
- if (i->btree_id > iter->btree_id)
- break;
- if (bpos_lt(i->k->k.p, iter->path->pos))
- continue;
- if (i->key_cache_already_flushed)
- continue;
- if (!ret || bpos_lt(i->k->k.p, ret->k.p))
- ret = i->k;
- }
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_le(i->k->k.p, iter->pos) &&
+ bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
+}
- return ret;
+static noinline
+void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bpos end = path_l(path)->b->key.k.p;
+
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_ge(i->k->k.p, path->pos) &&
+ bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+static noinline
+void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- return iter->flags & BTREE_ITER_WITH_UPDATES
- ? __bch2_btree_trans_peek_updates(iter)
- : NULL;
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_eq(i->k->k.p, iter->pos)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end_pos)
{
- struct bkey_i *k;
-
- if (bpos_lt(iter->path->pos, iter->journal_pos))
- iter->journal_idx = 0;
+ struct btree_path *path = btree_iter_path(trans, iter);
- k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
- iter->path->level,
- iter->path->pos,
- end_pos,
- &iter->journal_idx);
-
- iter->journal_pos = k ? k->k.p : end_pos;
- return k;
+ return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
}
static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
if (k) {
iter->k = k->k;
@@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
+ struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
+ k.k ? k.k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
@@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
iter->flags|BTREE_ITER_CACHED) ?:
- bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
- k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
iter->k = u;
k.k = &iter->k;
@@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bkey_i *next_update;
struct bkey_s_c k, k2;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(btree_iter_path(trans, iter)->cached);
bch2_btree_iter_verify(iter);
while (1) {
@@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- l = path_l(iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
@@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
k = btree_trans_peek_journal(trans, iter, k);
- next_update = btree_trans_peek_updates(iter);
-
- if (next_update &&
- bpos_le(next_update->k.p,
- k.k ? k.k->p : l->b->key.k.p)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- }
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_updates(trans, iter, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
struct bpos iter_pos;
int ret;
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
bch2_btree_iter_verify_entry_exit(iter);
@@ -2098,10 +2160,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
goto end;
if (iter->update_path &&
- !bkey_eq(iter->update_path->pos, k.k->p)) {
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
@@ -2121,7 +2183,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2177,14 +2239,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(iter->update_path);
+ btree_path_set_should_be_locked(trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
@@ -2206,103 +2268,6 @@ end:
}
/**
- * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
- * equal to iterator's current position, returning keys from every level of the
- * btree. For keys at different levels of the btree that compare equal, the key
- * from the lower level (leaf) is returned first.
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(iter->path->cached);
- bch2_btree_iter_verify(iter);
- BUG_ON(iter->path->level < iter->min_depth);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* Already at end? */
- if (!btree_path_node(iter->path, iter->path->level)) {
- k = bkey_s_c_null;
- goto out_no_locked;
- }
-
- k = btree_path_level_peek_all(trans->c,
- &iter->path->l[iter->path->level], &iter->k);
-
- /* Check if we should go up to the parent node: */
- if (!k.k ||
- (iter->advanced &&
- bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
- iter->pos = path_l(iter->path)->b->key.k.p;
- btree_path_set_level_up(trans, iter->path);
- iter->advanced = false;
- continue;
- }
-
- /*
- * Check if we should go back down to a leaf:
- * If we're not in a leaf node, we only return the current key
- * if it exactly matches iter->pos - otherwise we first have to
- * go back to the leaf:
- */
- if (iter->path->level != iter->min_depth &&
- (iter->advanced ||
- !k.k ||
- !bpos_eq(iter->pos, k.k->p))) {
- btree_path_set_level_down(trans, iter->path, iter->min_depth);
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- /* Check if we should go to the next key: */
- if (iter->path->level == iter->min_depth &&
- iter->advanced &&
- k.k &&
- bpos_eq(iter->pos, k.k->p)) {
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- if (iter->advanced &&
- iter->path->level == iter->min_depth &&
- !bpos_eq(k.k->p, iter->pos))
- iter->advanced = false;
-
- BUG_ON(iter->advanced);
- BUG_ON(!k.k);
- break;
- }
-
- iter->pos = k.k->p;
- btree_path_set_should_be_locked(iter->path);
-out_no_locked:
- bch2_btree_iter_verify(iter);
-
- return k;
-}
-
-/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
* @iter: iterator to peek from
@@ -2328,14 +2293,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
- struct btree_path *saved_path = NULL;
struct bkey_s_c k;
struct bkey saved_k;
const struct bch_val *saved_v;
+ btree_path_idx_t saved_path = 0;
int ret;
- EBUG_ON(iter->path->cached || iter->path->level);
- EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+ EBUG_ON(btree_iter_path(trans, iter)->cached ||
+ btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
return bkey_s_c_err(-EIO);
@@ -2359,14 +2324,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
goto out_no_locked;
}
- k = btree_path_level_peek(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
@@ -2382,13 +2351,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
- saved_path = NULL;
+ saved_path = 0;
iter->k = saved_k;
k.v = saved_v;
goto got_key;
}
- if (bch2_snapshot_is_ancestor(iter->trans->c,
+ if (bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
@@ -2396,6 +2365,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
+ path = btree_iter_path(trans, iter);
saved_k = *k.k;
saved_v = k.v;
}
@@ -2412,10 +2382,11 @@ got_key:
continue;
}
+ btree_path_set_should_be_locked(path);
break;
- } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
- search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
@@ -2432,8 +2403,6 @@ got_key:
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
-
- btree_path_set_should_be_locked(iter->path);
out_no_locked:
if (saved_path)
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
@@ -2468,8 +2437,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
- EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2493,13 +2461,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
- struct bkey_i *next_update;
+ k = bkey_s_c_null;
- if ((next_update = btree_trans_peek_updates(iter)) &&
- bpos_eq(next_update->k.p, iter->pos)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- goto out;
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates)) {
+ bch2_btree_trans_peek_slot_updates(trans, iter, &k);
+ if (k.k)
+ goto out;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
@@ -2514,7 +2482,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
} else {
@@ -2524,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_IS_EXTENTS)
end.offset = U64_MAX;
- EBUG_ON(iter->path->level);
+ EBUG_ON(btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
@@ -2570,7 +2538,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2617,17 +2585,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
struct btree_path *path;
unsigned i;
- BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+ BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ BUG_ON(trans->sorted[path->sorted_idx] != i);
}
for (i = 0; i < trans->nr_sorted; i++) {
unsigned idx = trans->sorted[i];
- EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(!test_bit(idx, trans->paths_allocated));
BUG_ON(trans->paths[idx].sorted_idx != i);
}
}
@@ -2635,12 +2603,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
static void btree_trans_verify_sorted(struct btree_trans *trans)
{
struct btree_path *path, *prev = NULL;
- unsigned i;
+ struct trans_for_each_path_inorder_iter iter;
if (!bch2_debug_check_iterators)
return;
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (prev && btree_path_cmp(prev, path) > 0) {
__bch2_dump_trans_paths_updates(trans, true);
panic("trans paths out of order!\n");
@@ -2697,42 +2665,40 @@ out:
static inline void btree_path_list_remove(struct btree_trans *trans,
struct btree_path *path)
{
- unsigned i;
-
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
trans->nr_sorted--;
memmove_u64s_down_small(trans->sorted + path->sorted_idx,
trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
#else
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
-
- path->sorted_idx = U8_MAX;
}
static inline void btree_path_list_add(struct btree_trans *trans,
- struct btree_path *pos,
- struct btree_path *path)
+ btree_path_idx_t pos,
+ btree_path_idx_t path_idx)
{
- unsigned i;
+ struct btree_path *path = trans->paths + path_idx;
- path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+ path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path->idx;
+ trans->sorted[path->sorted_idx] = path_idx;
#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
btree_trans_verify_sorted_refs(trans);
@@ -2749,9 +2715,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
iter->flags & BTREE_ITER_INTENT);
- iter->path = NULL;
- iter->update_path = NULL;
- iter->key_cache_path = NULL;
+ iter->path = 0;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -2782,41 +2749,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
iter->min_depth = depth;
- BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->path->level != depth);
- BUG_ON(iter->min_depth != depth);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(path->level != depth);
+ BUG_ON(iter->min_depth != depth);
}
void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
{
+ struct btree_trans *trans = src->trans;
+
*dst = *src;
if (src->path)
- __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
- __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
- dst->key_cache_path = NULL;
+ __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = 0;
}
void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
+ struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(new_top);
+ unsigned old_bytes = trans->mem_bytes;
+ unsigned new_bytes = roundup_pow_of_two(new_top);
int ret;
void *new_mem;
void *p;
- trans->mem_max = max(trans->mem_max, new_top);
-
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ s->max_mem = max(s->max_mem, new_bytes);
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
kfree(trans->mem);
}
@@ -2836,7 +2808,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
@@ -2858,8 +2830,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
if (trans->srcu_held) {
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->cached && !btree_node_locked(path, 0))
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
@@ -2869,7 +2842,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
}
}
-void bch2_trans_srcu_lock(struct btree_trans *trans)
+static void bch2_trans_srcu_lock(struct btree_trans *trans)
{
if (!trans->srcu_held) {
trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
@@ -2891,14 +2864,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans)
u32 bch2_trans_begin(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
u64 now;
bch2_trans_reset_updates(trans);
trans->restart_count++;
trans->mem_top = 0;
+ trans->journal_entries = NULL;
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
/*
@@ -2915,15 +2890,21 @@ u32 bch2_trans_begin(struct btree_trans *trans)
* iterators if we do that
*/
if (!path->ref && !path->preserve)
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, i);
else
path->preserve = false;
}
now = local_clock();
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
+ time_after64(now, trans->last_begin_time + 10))
+ __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+ trans->last_begin_time, now);
+
if (!trans->restarted &&
(need_resched() ||
- now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
drop_locks_do(trans, (cond_resched(), 0));
now = local_clock();
}
@@ -2942,32 +2923,11 @@ u32 bch2_trans_begin(struct btree_trans *trans)
return trans->restart_count;
}
-static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans)
- return trans;
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- /*
- * paths need to be zeroed, bch2_check_for_deadlock looks at
- * paths in other threads
- */
- memset(&trans->paths, 0, sizeof(trans->paths));
- return trans;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
unsigned bch2_trans_get_fn_idx(const char *fn)
{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
if (!bch2_btree_transaction_fns[i] ||
bch2_btree_transaction_fns[i] == fn) {
bch2_btree_transaction_fns[i] = fn;
@@ -2975,76 +2935,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
}
pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return i;
+ return 0;
}
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
struct btree_trans *trans;
- struct btree_transaction_stats *s;
- trans = bch2_trans_alloc(c);
-
- memset(trans, 0, sizeof(*trans));
- trans->c = c;
- trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
- ? bch2_btree_transaction_fns[fn_idx] : NULL;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- closure_init_stack(&trans->ref);
-
- s = btree_trans_stats(trans);
- if (s && s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-
- if (!unlikely(trans->mem)) {
- trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- trans->mem_bytes = BTREE_TRANS_MEM_MAX;
- } else {
- trans->mem_bytes = expected_mem_bytes;
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans) {
+ memset(trans, 0, offsetof(struct btree_trans, list));
+ goto got_trans;
}
}
- if (s) {
- trans->nr_max_paths = s->nr_max_paths;
- trans->wb_updates_size = s->wb_updates_size;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ memset(trans, 0, sizeof(*trans));
+ closure_init_stack(&trans->ref);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ seqmutex_lock(&c->btree_trans_lock);
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct btree_trans *pos;
+ pid_t pid = current->pid;
+
+ trans->locking_wait.task = current;
- seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) {
+ struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
/*
* We'd much prefer to be stricter here and completely
* disallow multiple btree_trans in the same thread -
* but the data move path calls bch2_write when we
* already have a btree_trans initialized.
*/
- BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ BUG_ON(pos_task &&
+ pid == pos_task->pid &&
bch2_trans_locked(pos));
- if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ if (pos_task && pid < pos_task->pid) {
list_add_tail(&trans->list, &pos->list);
goto list_add_done;
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
- seqmutex_unlock(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
+ trans->c = c;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ trans->nr_paths = ARRAY_SIZE(trans->_paths);
+ trans->paths_allocated = trans->_paths_allocated;
+ trans->sorted = trans->_sorted;
+ trans->paths = trans->_paths;
+ trans->updates = trans->_updates;
+
+ *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
+
+ trans->paths_allocated[0] = 1;
+
+ if (fn_idx < BCH_TRANSACTIONS_NR) {
+ trans->fn = bch2_btree_transaction_fns[fn_idx];
+
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
+
+ if (s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+ if (likely(trans->mem))
+ trans->mem_bytes = expected_mem_bytes;
+ }
+
+ trans->nr_paths_max = s->nr_max_paths;
+ trans->journal_entries_size = s->journal_entries_size;
}
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
return trans;
}
@@ -3053,14 +3029,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
goto leaked;
return;
leaked:
bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
bch2_btree_id_str(path->btree_id),
@@ -3073,26 +3050,14 @@ leaked:
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
- struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
- struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
-
- closure_sync(&trans->ref);
-
- if (s)
- s->max_mem = max(s->max_mem, trans->mem_max);
-
trans_for_each_update(trans, i)
- __btree_path_put(i->path, true);
- trans->nr_updates = 0;
+ __btree_path_put(trans->paths + i->path, true);
+ trans->nr_updates = 0;
+ trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3101,8 +3066,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- kfree(trans->extra_journal_entries.data);
-
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
@@ -3115,6 +3078,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ unsigned long *paths_allocated = trans->paths_allocated;
+ trans->paths_allocated = NULL;
+ trans->paths = NULL;
+
+ if (paths_allocated != trans->_paths_allocated)
+ kfree_rcu_mightsleep(paths_allocated);
+
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
@@ -3123,8 +3093,16 @@ void bch2_trans_put(struct btree_trans *trans)
/* Userspace doesn't have a real percpu implementation: */
if (IS_ENABLED(__KERNEL__))
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
- if (trans)
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+
mempool_free(trans, &c->btree_trans_pool);
+ }
}
static void __maybe_unused
@@ -3152,24 +3130,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
- struct btree_path *path;
struct btree_bkey_cached_common *b;
static char lock_types[] = { 'r', 'i', 'w' };
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
unsigned l, idx;
+ /* before rcu_read_lock(): */
+ bch2_printbuf_make_room(out, 4096);
+
if (!out->nr_tabstops) {
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 32);
}
- prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+ prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ out->atomic++;
+
+ struct btree_path *paths = rcu_dereference(trans->paths);
+ if (!paths)
+ goto out;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
- trans_for_each_path_safe(trans, path, idx) {
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
+ struct btree_path *path = paths + idx;
if (!path->nodes_locked)
continue;
prt_printf(out, " path %u %c l=%u %s:",
- path->idx,
+ idx,
path->cached ? 'c' : 'b',
path->level,
bch2_btree_id_str(path->btree_id));
@@ -3197,6 +3189,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
}
+out:
+ --out->atomic;
+ rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3205,15 +3200,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
struct btree_trans *trans;
int cpu;
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu) {
+ struct btree_trans *trans =
+ per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+ kfree(trans);
+ }
+ free_percpu(c->btree_trans_bufs);
+
trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
if (trans)
panic("%s leaked btree_trans\n", trans->fn);
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
- free_percpu(c->btree_trans_bufs);
-
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
@@ -3234,6 +3240,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+ bch2_time_stats_init(&s->duration);
bch2_time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index eaffced4c132..24772538e4cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans)
__bch2_btree_trans_sort_paths(trans);
}
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned idx)
+static inline unsigned long *trans_paths_nr(struct btree_path *paths)
{
- u64 l;
-
- if (idx == BTREE_ITER_MAX)
- return NULL;
-
- l = trans->paths_allocated >> idx;
- if (!l)
- return NULL;
-
- idx += __ffs64(l);
- EBUG_ON(idx >= BTREE_ITER_MAX);
- EBUG_ON(trans->paths[idx].idx != idx);
- return &trans->paths[idx];
+ return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
}
-#define trans_for_each_path_from(_trans, _path, _start) \
- for (_path = __trans_next_path((_trans), _start); \
- (_path); \
- _path = __trans_next_path((_trans), (_path)->idx + 1))
-
-#define trans_for_each_path(_trans, _path) \
- trans_for_each_path_from(_trans, _path, 0)
-
-static inline struct btree_path *
-__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
{
- u64 l;
+ unsigned long *v = trans_paths_nr(paths);
+ return v - BITS_TO_LONGS(*v);
+}
- if (*idx == BTREE_ITER_MAX)
- return NULL;
+#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
+ for (_idx = _start; \
+ (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
+ _idx++)
- l = trans->paths_allocated >> *idx;
- if (!l)
- return NULL;
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned *idx)
+{
+ unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
+ /*
+ * Open coded find_next_bit(), because
+ * - this is fast path, we can't afford the function call
+ * - and we know that nr_paths is a multiple of BITS_PER_LONG,
+ */
+ while (*idx < trans->nr_paths) {
+ unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
+ if (v) {
+ *idx += __ffs(v);
+ return trans->paths + *idx;
+ }
+
+ *idx += BITS_PER_LONG;
+ *idx &= ~(BITS_PER_LONG - 1);
+ w++;
+ }
- *idx += __ffs64(l);
- EBUG_ON(*idx >= BTREE_ITER_MAX);
- return &trans->paths[*idx];
+ return NULL;
}
/*
* This version is intended to be safe for use on a btree_trans that is owned by
* another thread, for bch2_btree_trans_to_text();
*/
-#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \
+#define trans_for_each_path_from(_trans, _path, _idx, _start) \
for (_idx = _start; \
- (_path = __trans_next_path_safe((_trans), &_idx)); \
+ (_path = __trans_next_path((_trans), &_idx)); \
_idx++)
-#define trans_for_each_path_safe(_trans, _path, _idx) \
- trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+#define trans_for_each_path(_trans, _path, _idx) \
+ trans_for_each_path_from(_trans, _path, _idx, 1)
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
@@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru
: NULL;
}
-#define trans_for_each_path_inorder(_trans, _path, _i) \
- for (_i = 0; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
- _i++)
+#define trans_for_each_path_idx_inorder(_trans, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
+
+struct trans_for_each_path_inorder_iter {
+ btree_path_idx_t sorted_idx;
+ btree_path_idx_t path_idx;
+};
+
+#define trans_for_each_path_inorder(_trans, _path, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _path = (_trans)->paths + _iter.path_idx, \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
for (_i = trans->nr_sorted - 1; \
@@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path,
static inline struct btree_path *
__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned idx)
+ unsigned *idx)
{
- struct btree_path *path = __trans_next_path(trans, idx);
+ struct btree_path *path;
- while (path && !__path_has_node(path, b))
- path = __trans_next_path(trans, path->idx + 1);
+ while ((path = __trans_next_path(trans, idx)) &&
+ !__path_has_node(path, b))
+ (*idx)++;
return path;
}
-#define trans_for_each_path_with_node(_trans, _b, _path) \
- for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
- (_path); \
- _path = __trans_next_path_with_node((_trans), (_b), \
- (_path)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
+ for (_iter = 1; \
+ (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
+ _iter++)
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
- bool, unsigned long);
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
+ bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
+ btree_path_idx_t path, bool intent,
unsigned long ip)
{
- if (path->ref > 1 || path->preserve)
+ if (trans->paths[path].ref > 1 ||
+ trans->paths[path].preserve)
path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- path->should_be_locked = false;
+ trans->paths[path].should_be_locked = false;
return path;
}
-struct btree_path * __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
- struct bpos, bool, unsigned long, int);
+btree_path_idx_t __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
+ struct bpos, bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip)
+ btree_path_idx_t path, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- int cmp = bpos_cmp(new_pos, path->pos);
-
- return cmp
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ return !bpos_eq(new_pos, trans->paths[path].pos)
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
: path;
}
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
+ btree_path_idx_t,
unsigned, unsigned long);
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- struct btree_path *path, unsigned flags)
+ btree_path_idx_t path, unsigned flags)
{
- if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
-int __must_check bch2_btree_path_traverse(struct btree_trans *,
- struct btree_path *, unsigned);
-struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
@@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
@@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
-
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
@@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ struct btree_trans *trans = iter->trans;
+
if (unlikely(iter->update_path))
- bch2_path_put(iter->trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
new_pos.snapshot = iter->snapshot;
@@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
unsigned btree_id,
unsigned flags)
{
- if (flags & BTREE_ITER_ALL_LEVELS)
- flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
-
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
@@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- memset(iter, 0, sizeof(*iter));
- iter->trans = trans;
- iter->btree_id = btree_id;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k.p = pos;
-
+ iter->trans = trans;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->btree_id = btree_id;
+ iter->min_depth = 0;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k = POS_KEY(pos);
+ iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif
@@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- if (!iter->trans->restarted)
- iter->path->preserve = false;
+ struct btree_trans *trans = iter->trans;
+
+ if (!trans->restarted)
+ btree_iter_path(trans, iter)->preserve = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
{
- size = roundup(size, 8);
+ size = round_up(size, 8);
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
@@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
KEY_TYPE_##_type, sizeof(*_val), _val)
void bch2_trans_srcu_unlock(struct btree_trans *);
-void bch2_trans_srcu_lock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
@@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *);
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
-
return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
@@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
- flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
@@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
return bch2_btree_iter_peek_slot(iter);
}
+int __bch2_btree_trans_too_many_iters(struct btree_trans *);
+
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
- trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
- }
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ return __bch2_btree_trans_too_many_iters(trans);
return 0;
}
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
+/*
+ * goto instead of loop, so that when used inside for_each_btree_key2()
+ * break/continue work correctly
+ */
#define lockrestart_do(_trans, _do) \
({ \
+ __label__ transaction_restart; \
u32 _restart_count; \
int _ret2; \
+transaction_restart: \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
\
- do { \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
+ if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
+ goto transaction_restart; \
\
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
_ret2; \
})
@@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key2(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
- if (!(_k).k) \
- break; \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
+ _end, (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
})
-#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
- if (!(_k).k) \
- break; \
- \
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
+ SPOS_MAX, _flags, _k, _do)
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
- if (!(_k).k) { \
- _ret3 = 0; \
- break; \
- } \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_rewind(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
@@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
- &(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_s_c k;
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define for_each_btree_key_old(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
@@ -863,24 +805,25 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
- for (; \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for (; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
for (; \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
+ SPOS_MAX, _flags, _k, _ret)
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+
+/*
+ * This should not be used in a fastpath, without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * potentially livelocking:
+ */
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
@@ -912,10 +855,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_p; \
})
-/* new multiple iterator interface: */
-
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ec52f50d249d..719a94a84950 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
@@ -86,12 +87,26 @@ search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
- !k->overwritten)
+ if (k->overwritten) {
+ (*idx)++;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
@@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
- BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
@@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys)
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
- src[0].btree_id == src[1].btree_id &&
- src[0].level == src[1].level &&
- bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ !journal_key_cmp(src, src + 1))
src++;
*dst++ = *src++;
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 1b7a5668df7c..74e52fd28abe 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (ret)
goto out;
- ck = (void *) c_iter.path->l[0].b;
+ ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
if (!ck)
goto out;
@@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
+ trans->journal_res.seq = ck->journal.seq;
+
/*
- * Since journal reclaim depends on us making progress here, and the
- * allocator/copygc depend on journal reclaim making progress, we need
- * to be using alloc reserves:
+ * If we're at the end of the journal, we really want to free up space
+ * in the journal right away - we don't want to pin that old journal
+ * sequence number with a new btree node write, we want to re-journal
+ * the update
*/
+ if (ck->journal.seq == journal_last_seq(j))
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ if (ck->journal.seq != journal_last_seq(j) ||
+ j->watermark == BCH_WATERMARK_stripe)
+ commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- (ck->journal.seq == journal_last_seq(j)
- ? BCH_WATERMARK_reclaim
- : 0)|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
commit_flags);
bch2_fs_fatal_err_on(ret &&
@@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_journal_pin_drop(j, &ck->journal);
- BUG_ON(!btree_node_locked(c_iter.path, 0));
+ struct btree_path *path = btree_iter_path(trans, &c_iter);
+ BUG_ON(!btree_node_locked(path, 0));
if (!evict) {
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
}
} else {
struct btree_path *path2;
+ unsigned i;
evict:
- trans_for_each_path(trans, path2)
- if (path2 != c_iter.path)
+ trans_for_each_path(trans, path2, i)
+ if (path2 != path)
__bch2_btree_path_unlock(trans, path2);
- bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+ bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
- mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
}
@@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
- ret = commit_do(trans, NULL, NULL, 0,
+ ret = lockrestart_do(trans,
btree_key_cache_flush_pos(trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false));
+ BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -742,28 +751,12 @@ unlock:
return ret;
}
-/*
- * Flush and evict a key from the key cache:
- */
-int bch2_btree_key_cache_flush(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached_key key = { id, pos };
-
- /* Fastpath - assume it won't be found: */
- if (!bch2_btree_key_cache_find(c, id, pos))
- return 0;
-
- return btree_key_cache_flush_pos(trans, key, 0, 0, true);
-}
-
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
unsigned flags,
struct btree_insert_entry *insert_entry)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
struct bkey_i *insert = insert_entry->k;
bool kick_reclaim = false;
@@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_inc(&c->btree_key_cache.nr_dirty);
@@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_WAS_RW, &c->flags))
+ test_bit(BCH_FS_was_rw, &c->flags))
panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
atomic_long_read(&bc->nr_dirty));
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index be3acde2caa0..e6b2cd0dd2c1 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
struct btree_insert_entry *);
-int bch2_btree_key_cache_flush(struct btree_trans *,
- enum btree_id, struct bpos);
void bch2_btree_key_cache_drop(struct btree_trans *,
struct btree_path *);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 3d48834d091f..684397442338 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
{
struct btree_path *path;
struct six_lock_count ret;
+ unsigned i;
memset(&ret, 0, sizeof(ret));
if (IS_ERR_OR_NULL(b))
return ret;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path != skip && &path->l[level].b->c == b) {
int t = btree_node_locked_type(path, level);
@@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
prt_printf(out, "Found lock cycle (%u entries):", g->nr);
prt_newline(out);
- for (i = g->g; i < g->g + g->nr; i++)
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
+ if (!task)
+ continue;
+
bch2_btree_trans_to_text(out, i->trans);
+ bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
+ }
}
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
@@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
+ struct task_struct *task = i->trans->locking_wait.task;
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ prt_printf(out, "%u ", task ?task->pid : 0);
}
prt_newline(out);
}
@@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g)
return false;
}
+static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ count_event(c, trans_restart_would_deadlock);
+
+ if (trace_trans_restart_would_deadlock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.atomic++;
+ print_cycle(&buf, g);
+
+ trace_trans_restart_would_deadlock(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
- trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ trace_would_deadlock(g, i->trans);
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
@@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
@@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
struct lock_graph g;
struct trans_waiting_for_lock *top;
struct btree_bkey_cached_common *b;
- struct btree_path *path;
- unsigned path_idx;
- int ret;
+ btree_path_idx_t path_idx;
+ int ret = 0;
+
+ g.nr = 0;
if (trans->lock_must_abort) {
if (cycle)
return -1;
- trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ trace_would_deadlock(&g, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
- g.nr = 0;
lock_graph_down(&g, trans);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ if (cycle)
+ cycle->atomic++;
next:
if (!g.nr)
- return 0;
+ goto out;
top = &g.g[g.nr - 1];
- trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+ struct btree_path *paths = rcu_dereference(top->trans->paths);
+ if (!paths)
+ goto up;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
+ path_idx, top->path_idx) {
+ struct btree_path *path = paths + path_idx;
if (!path->nodes_locked)
continue;
@@ -348,18 +386,23 @@ next:
ret = lock_graph_descend(&g, trans, cycle);
if (ret)
- return ret;
+ goto out;
goto next;
}
raw_spin_unlock(&b->lock.wait_lock);
}
}
-
+up:
if (g.nr > 1 && cycle)
print_chain(cycle, &g);
lock_graph_up(&g);
goto next;
+out:
+ if (cycle)
+ --cycle->atomic;
+ rcu_read_unlock();
+ return ret;
}
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
@@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
- unsigned i;
+ unsigned i, iter;
int ret;
/*
@@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
* already taken are no longer needed:
*/
- trans_for_each_path(trans, linked) {
+ trans_for_each_path(trans, linked, iter) {
if (!linked->nodes_locked)
continue;
@@ -588,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@@ -599,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@@ -624,8 +666,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- struct btree_path *linked;
-
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
@@ -648,8 +688,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
- if (!path->cached && !trans->in_traverse_all)
- trans_for_each_path(trans, linked)
+ if (!path->cached && !trans->in_traverse_all) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path(trans, linked, i)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
@@ -657,6 +700,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true, NULL);
}
+ }
return false;
}
@@ -665,7 +709,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- unsigned l;
+ unsigned l, old_locks_want = path->locks_want;
if (trans->restarted)
return;
@@ -689,8 +733,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
bch2_btree_path_verify_locks(path);
- path->downgrade_seq++;
- trace_path_downgrade(trans, _RET_IP_, path);
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
/* Btree transaction locking: */
@@ -698,40 +741,70 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (trans->restarted)
return;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
- trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ !btree_path_get_locks(trans, path, false, &f)) {
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=",
+ f.l, path->l[f.l].lock_seq);
+ if (IS_ERR_OR_NULL(f.b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+ } else {
+ prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f.b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
+ }
+
return 0;
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ !bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
@@ -740,16 +813,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
@@ -762,8 +837,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->nodes_locked)
return true;
return false;
@@ -809,8 +885,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_verify_locks(path);
}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 11b0a2c8cd69..4bd72c855da1 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- if (s)
- __bch2_time_stats_update(&s->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
+ __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
#endif
}
@@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
struct btree *b)
{
struct btree_path *linked;
+ unsigned i;
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
@@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
linked->l[b->c.level].lock_seq++;
six_unlock_write(&b->c.lock);
@@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
enum btree_node_locked_type want)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->lock, (enum six_lock_type) want);
@@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -314,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
- struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@@ -355,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 12907beda98c..30d69a6d133e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -12,6 +12,7 @@
#include "errcode.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "snapshot.h"
@@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#endif
}
-static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
{
- return i->path->l + i->level;
+ return (trans->paths + i->path)->l + i->level;
}
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
- insert_l(&i[0])->b == insert_l(&i[-1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
}
static inline bool same_leaf_as_next(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(&i[0])->b == insert_l(&i[1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
}
inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
@@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
if (same_leaf_as_prev(trans, i))
continue;
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
@@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
EBUG_ON(trans->write_locked);
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trans->write_locked = true;
@@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_btree_node_unlock_write_inlined(trans,
+ trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
}
}
@@ -142,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -163,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
+ push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -287,7 +283,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_btree_node_dirty_acct(c, b);
}
@@ -311,10 +307,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
- BUG_ON(i->cached != i->path->cached);
- BUG_ON(i->level != i->path->level);
- BUG_ON(i->btree_id != i->path->btree_id);
+ struct btree_path *path = trans->paths + i->path;
+
+ BUG_ON(!bpos_eq(i->k->k.p, path->pos));
+ BUG_ON(i->cached != path->cached);
+ BUG_ON(i->level != path->level);
+ BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
@@ -349,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
+ if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@@ -361,8 +357,6 @@ noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k;
int ret;
@@ -372,7 +366,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
@@ -401,7 +395,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) path->l[0].b;
- struct btree_insert_entry *i;
unsigned new_u64s;
struct bkey_i *new_k;
@@ -409,7 +402,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
- !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ !(flags & BCH_TRANS_COMMIT_journal_reclaim))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -422,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@@ -452,25 +445,15 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
- return 0;
-
- if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- old, bkey_i_to_s_c(new),
+ if (old_ops->trigger == new_ops->trigger) {
+ ret = bch2_key_trigger(trans, i->btree_id, i->level,
+ old, bkey_i_to_s(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
-
- _deleted.p = i->path->pos;
-
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, i->btree_id, i->level,
- old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
+ bkey_i_to_s(new), flags) ?:
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
+ old, flags);
}
return ret;
@@ -488,6 +471,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
verify_update_old_key(trans, i);
@@ -497,19 +481,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger) {
+ old_ops->trigger == new_ops->trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
- return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|
- i->flags) ?: 1;
+ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
} else if (overwrite && !i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
- return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
} else if (!overwrite && !i->insert_trigger_run) {
i->insert_trigger_run = true;
- return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
return 0;
}
@@ -551,7 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ struct btree_insert_entry *btree_id_start = trans->updates;
unsigned btree_id = 0;
int ret = 0;
@@ -597,10 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0;
-
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@@ -608,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+ gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ return 0;
}
static inline int
@@ -624,8 +604,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
int ret;
@@ -650,23 +628,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
u64s += i->k->k.u64s;
ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+ ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
}
- }
- if (trans->nr_wb_updates &&
- trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
- return -BCH_ERR_btree_insert_need_flush_buffer;
+ i->k->k.needs_whiteout = false;
+ }
/*
* Don't get journal reservation until after we know insert will
* succeed:
*/
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
ret = bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
@@ -675,8 +651,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (unlikely(trans->journal_transaction_names))
journal_transaction_name(trans);
- } else {
- trans->journal_res.seq = c->journal.replay_journal_seq;
}
/*
@@ -685,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*/
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@@ -698,13 +672,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
- if (trans->nr_wb_updates) {
- EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
-
- ret = bch2_btree_insert_keys_write_buffer(trans);
- if (ret)
- goto revert_fs_usage;
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
h = trans->hooks;
while (h) {
@@ -715,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
+ if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@@ -727,16 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
- if (unlikely(trans->extra_journal_entries.nr)) {
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries.data,
- trans->extra_journal_entries.nr);
-
- trans->journal_res.offset += trans->extra_journal_entries.nr;
- trans->journal_res.u64s -= trans->extra_journal_entries.nr;
- }
-
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -765,33 +725,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bkey_copy((struct bkey_i *) entry->start, i->k);
}
- trans_for_each_wb_update(trans, wb) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- wb->btree, 0,
- wb->k.k.u64s);
- bkey_copy((struct bkey_i *) entry->start, &wb->k);
- }
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->journal_entries,
+ trans->journal_entries_u64s);
+
+ trans->journal_res.offset += trans->journal_entries_u64s;
+ trans->journal_res.u64s -= trans->journal_entries_u64s;
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
trans_for_each_update(trans, i) {
- i->k->k.needs_whiteout = false;
+ struct btree_path *path = trans->paths + i->path;
if (!i->cached) {
- u64 seq = trans->journal_res.seq;
-
- if (i->flags & BTREE_UPDATE_PREJOURNAL)
- seq = i->seq;
-
- bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
} else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
else {
- bch2_btree_key_cache_drop(trans, i->path);
- btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_key_cache_drop(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
}
@@ -806,14 +760,8 @@ revert_fs_usage:
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-
- trans_for_each_wb_update(trans, wb)
- bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
@@ -841,6 +789,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
return -EINVAL;
}
+static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
+ struct jset_entry *i)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_journal_entry_to_text(&buf, c, i);
+ prt_newline(&buf);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+
+ return -EINVAL;
+}
+
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -849,7 +824,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0, u64s_delta = 0;
trans_for_each_update(trans, i) {
@@ -884,13 +858,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin, NULL);
+ trans->journal_pin,
+ bch2_trans_commit_journal_pin_flush);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&c->journal, &trans->journal_res);
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
@@ -916,7 +892,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
case -BCH_ERR_btree_insert_btree_node_full:
ret = bch2_btree_split_leaf(trans, i->path, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
@@ -927,7 +904,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
- if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
@@ -950,30 +927,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
ret = bch2_trans_relock(trans);
break;
- case -BCH_ERR_btree_insert_need_flush_buffer: {
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- ret = 0;
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_unlock(trans);
- mutex_lock(&wb->flush_lock);
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_begin(trans);
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- } else {
- mutex_unlock(&wb->flush_lock);
- ret = bch2_trans_relock(trans);
- }
- }
- break;
- }
default:
BUG_ON(ret >= 0);
break;
@@ -982,8 +935,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- !(flags & BTREE_INSERT_NOWAIT) &&
- (flags & BTREE_INSERT_NOFAIL), c,
+ (flags & BCH_TRANS_COMMIT_no_enospc), c,
"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
@@ -995,8 +947,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
- test_bit(BCH_FS_STARTED, &c->flags))
+ if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
+ test_bit(BCH_FS_started, &c->flags))
return -BCH_ERR_erofs_trans_commit;
ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
@@ -1016,7 +968,6 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0;
trans_for_each_update(trans, i) {
@@ -1030,18 +981,15 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
+ struct btree_insert_entry *errored_at = NULL;
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i = NULL;
- struct btree_write_buffered_key *wb;
int ret = 0;
if (!trans->nr_updates &&
- !trans->nr_wb_updates &&
- !trans->extra_journal_entries.nr)
+ !trans->journal_entries_u64s)
goto out_reset;
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
@@ -1051,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct printbuf buf = PRINTBUF;
enum bkey_invalid_flags invalid_flags = 0;
- if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1064,47 +1012,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
return ret;
}
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bkey_invalid_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+ if (unlikely(bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags)))
+ ret = bch2_trans_commit_journal_entry_invalid(trans, i);
+
+ if (ret)
+ return ret;
+ }
+
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
}
- if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
goto out_reset;
}
- if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
- mutex_trylock(&c->btree_write_buffer.flush_lock)) {
- bch2_trans_begin(trans);
- bch2_trans_unlock(trans);
-
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- goto out;
- }
-
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_u64s = trans->journal_entries_u64s;
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
- EBUG_ON(!i->path->should_be_locked);
+ struct btree_path *path = trans->paths + i->path;
+
+ EBUG_ON(!path->should_be_locked);
- ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
if (unlikely(ret))
goto out;
- EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+ EBUG_ON(!btree_node_intent_locked(path, i->level));
if (i->key_cache_already_flushed)
continue;
@@ -1120,22 +1073,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
- trans_for_each_wb_update(trans, wb)
- trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-
- if (trans->extra_journal_res) {
+ if (trans->extra_disk_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_journal_res,
- (flags & BTREE_INSERT_NOFAIL)
+ trans->extra_disk_res,
+ (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
+ errored_at = NULL;
bch2_trans_verify_not_in_restart(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+ ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
bch2_trans_verify_locks(trans);
@@ -1145,7 +1097,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
@@ -1154,9 +1106,21 @@ out_reset:
return ret;
err:
- ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+ ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
if (ret)
goto out;
+ /*
+ * We might have done another transaction commit in the error path -
+ * i.e. btree write buffer flush - which will have made use of
+ * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+ * how the journal sequence number to pin is passed in - so we must
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
goto retry;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 60453ba86c4b..4a5a64499eb7 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -185,33 +185,32 @@ struct btree_node_iter {
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
-#define __BTREE_ITER_FLAGS_END 16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
+#define __BTREE_ITER_FLAGS_END 15
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -223,13 +222,12 @@ enum btree_path_uptodate {
#define TRACK_PATH_ALLOCATED
#endif
+typedef u16 btree_path_idx_t;
+
struct btree_path {
- u8 idx;
- u8 sorted_idx;
+ btree_path_idx_t sorted_idx;
u8 ref;
u8 intent_ref;
- u32 alloc_seq;
- u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
@@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
*/
struct btree_iter {
struct btree_trans *trans;
- struct btree_path *path;
- struct btree_path *update_path;
- struct btree_path *key_cache_path;
+ btree_path_idx_t path;
+ btree_path_idx_t update_path;
+ btree_path_idx_t key_cache_path;
enum btree_id btree_id:8;
- unsigned min_depth:3;
- unsigned advanced:1;
+ u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
@@ -306,7 +303,6 @@ struct btree_iter {
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
- struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
@@ -354,16 +350,16 @@ struct btree_insert_entry {
* to the size of the key being overwritten in the btree:
*/
u8 old_btree_u64s;
+ btree_path_idx_t path;
struct bkey_i *k;
- struct btree_path *path;
- u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
unsigned long ip_allocated;
};
-#define BTREE_ITER_MAX 64
+#define BTREE_ITER_INITIAL 64
+#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -377,25 +373,30 @@ struct btree_trans_commit_hook {
#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+struct btree_trans_paths {
+ unsigned long nr_paths;
+ struct btree_path paths[];
+};
+
struct btree_trans {
struct bch_fs *c;
- const char *fn;
- struct closure ref;
- struct list_head list;
- u64 last_begin_time;
- u8 lock_may_not_fail;
- u8 lock_must_abort;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
+ unsigned long *paths_allocated;
+ struct btree_path *paths;
+ btree_path_idx_t *sorted;
+ struct btree_insert_entry *updates;
- int srcu_idx;
+ void *mem;
+ unsigned mem_top;
+ unsigned mem_bytes;
+ btree_path_idx_t nr_sorted;
+ btree_path_idx_t nr_paths;
+ btree_path_idx_t nr_paths_max;
u8 fn_idx;
- u8 nr_sorted;
u8 nr_updates;
- u8 nr_wb_updates;
- u8 wb_updates_size;
+ u8 lock_must_abort;
+ bool lock_may_not_fail:1;
bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
@@ -407,41 +408,59 @@ struct btree_trans {
bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
+
+ u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
unsigned long srcu_lock_time;
- /*
- * For when bch2_trans_update notices we'll be splitting a compressed
- * extent:
- */
- unsigned extra_journal_res;
- unsigned nr_max_paths;
-
- u64 paths_allocated;
-
- unsigned mem_top;
- unsigned mem_max;
- unsigned mem_bytes;
- void *mem;
-
- u8 sorted[BTREE_ITER_MAX + 8];
- struct btree_path paths[BTREE_ITER_MAX];
- struct btree_insert_entry updates[BTREE_ITER_MAX];
- struct btree_write_buffered_key *wb_updates;
+ const char *fn;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+ int srcu_idx;
/* update path: */
+ u16 journal_entries_u64s;
+ u16 journal_entries_size;
+ struct jset_entry *journal_entries;
+
struct btree_trans_commit_hook *hooks;
- darray_u64 extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
+
+ struct bch_fs_usage_base fs_usage_delta;
+
unsigned journal_u64s;
+ unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
+
+ /* Entries before this are zeroed out on every bch2_trans_get() call */
+
+ struct list_head list;
+ struct closure ref;
+
+ unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
+ struct btree_trans_paths trans_paths;
+ struct btree_path _paths[BTREE_ITER_INITIAL];
+ btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
+ struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
};
+static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return trans->paths + iter->path;
+}
+
+static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return iter->key_cache_path
+ ? trans->paths + iter->key_cache_path
+ : NULL;
+}
+
#define BCH_BTREE_WRITE_TYPES() \
x(initial, 0) \
x(init_next_bset, 1) \
@@ -637,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@@ -645,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -722,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 2fd3c8cc6f51..c3ff365acce9 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
struct bkey_i *, enum btree_update_flags,
unsigned long ip);
@@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
*/
if (nr_splits > 1 &&
(compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_journal_res += compressed_sectors * (nr_splits - 1);
+ trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
if (front_split) {
update = bch2_bkey_make_mut_noupdate(trans, old);
@@ -339,21 +339,22 @@ err:
}
static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_path *path,
struct btree_insert_entry *i,
enum btree_update_flags flags,
unsigned long ip)
{
- struct btree_path *btree_path;
struct bkey k;
int ret;
- btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ btree_path_idx_t path_idx =
+ bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
+ struct btree_path *btree_path = trans->paths + path_idx;
+
/*
* The old key in the insert entry might actually refer to an existing
* key in the btree that has been deleted from cache and not yet
@@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->flags |= BTREE_TRIGGER_NORUN;
btree_path_set_should_be_locked(btree_path);
- ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+ ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
- bch2_path_put(trans, btree_path, true);
+ bch2_path_put(trans, path_idx, true);
return ret;
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
struct bkey_i *k, enum btree_update_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
- u64 seq = 0;
int cmp;
+ struct btree_path *path = trans->paths + path_idx;
EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(trans->nr_updates >= trans->nr_paths);
EBUG_ON(!bpos_eq(k->k.p, path->pos));
- /*
- * The transaction journal res hasn't been allocated at this point.
- * That occurs at commit time. Reuse the seq field to pass in the seq
- * of a prejournaled key.
- */
- if (flags & BTREE_UPDATE_PREJOURNAL)
- seq = trans->journal_res.seq;
-
n = (struct btree_insert_entry) {
.flags = flags,
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
.cached = path->cached,
- .path = path,
+ .path = path_idx,
.k = k,
- .seq = seq,
.ip_allocated = ip,
};
@@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
*/
- trans_for_each_update(trans, i) {
+ for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
cmp = btree_insert_entry_cmp(&n, i);
if (cmp <= 0)
break;
@@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
i->cached = n.cached;
i->k = n.k;
i->path = n.path;
- i->seq = n.seq;
i->ip_allocated = n.ip_allocated;
} else {
array_insert_item(trans->updates, trans->nr_updates,
@@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
}
}
- __btree_path_get(i->path, true);
+ __btree_path_get(trans->paths + i->path, true);
/*
* If a key is present in the key cache, it must also exist in the
@@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* work:
*/
if (path->cached && bkey_deleted(&i->old_k))
- return flush_new_cached_update(trans, path, i, flags, ip);
+ return flush_new_cached_update(trans, i, flags, ip);
return 0;
}
@@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_path *path)
{
- if (!iter->key_cache_path ||
- !iter->key_cache_path->should_be_locked ||
- !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+ struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
+
+ if (!key_cache_path ||
+ !key_cache_path->should_be_locked ||
+ !bpos_eq(key_cache_path->pos, iter->pos)) {
struct bkey_cached *ck;
int ret;
@@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
iter->flags & BTREE_ITER_INTENT,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
if (unlikely(ret))
return ret;
- ck = (void *) iter->key_cache_path->l[0].b;
+ ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
}
return 0;
@@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
- struct btree_path *path = iter->update_path ?: iter->path;
+ btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
if (iter->flags & BTREE_ITER_IS_EXTENTS)
@@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
/*
* Ensure that updates to cached btrees go to the key cache:
*/
+ struct btree_path *path = trans->paths + path_idx;
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
!path->cached &&
!path->level &&
@@ -537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
if (ret)
return ret;
- path = iter->key_cache_path;
+ path_idx = iter->key_cache_path;
}
- return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
}
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
- struct btree_iter *iter, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- trans->journal_res.seq = seq;
- return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
- BTREE_UPDATE_PREJOURNAL);
-}
-
-static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
{
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
int ret = PTR_ERR_OR_ZERO(n);
@@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
return bch2_btree_insert_trans(trans, btree, n, 0);
}
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
{
- struct btree_write_buffered_key *i;
- int ret;
-
- EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- if (unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- trans_for_each_wb_update(trans, i) {
- if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
- bkey_copy(&i->k, k);
- return 0;
- }
- }
+ unsigned new_top = trans->journal_entries_u64s + u64s;
+ unsigned old_size = trans->journal_entries_size;
- if (!trans->wb_updates ||
- trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_write_buffered_key *u;
+ if (new_top > trans->journal_entries_size) {
+ trans->journal_entries_size = roundup_pow_of_two(new_top);
- if (trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- BUG_ON(trans->wb_updates_size > U8_MAX / 2);
- trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
- if (s)
- s->wb_updates_size = trans->wb_updates_size;
- }
-
- u = bch2_trans_kmalloc_nomemzero(trans,
- trans->wb_updates_size *
- sizeof(struct btree_write_buffered_key));
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- if (trans->nr_wb_updates)
- memcpy(u, trans->wb_updates, trans->nr_wb_updates *
- sizeof(struct btree_write_buffered_key));
- trans->wb_updates = u;
+ btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
}
- trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
- .btree = btree,
- };
+ struct jset_entry *n =
+ bch2_trans_kmalloc_nomemzero(trans,
+ trans->journal_entries_size * sizeof(u64));
+ if (IS_ERR(n))
+ return ERR_CAST(n);
- bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
- trans->nr_wb_updates++;
+ if (trans->journal_entries)
+ memcpy(n, trans->journal_entries, old_size * sizeof(u64));
+ trans->journal_entries = n;
- return 0;
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s = new_top;
+ return e;
}
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
@@ -733,20 +684,6 @@ int bch2_btree_delete_at(struct btree_trans *trans,
return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
}
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.p = pos;
- return bch2_trans_update_buffered(trans, btree, k);
-}
-
int bch2_btree_delete(struct btree_trans *trans,
enum btree_id btree, struct bpos pos,
unsigned update_flags)
@@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(trans->c, &disk_res);
err:
/*
@@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
- struct bkey_i *k;
- int ret = 0;
+ struct bkey_i k;
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- return ret;
+ bkey_init(&k.k);
+ k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k.k.p = pos;
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, k);
+ return bch2_trans_update_buffered(trans, btree, &k);
}
-__printf(2, 0)
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
{
- struct printbuf buf = PRINTBUF;
- struct jset_entry_log *l;
- unsigned u64s;
- int ret;
-
- prt_vprintf(&buf, fmt, args);
- ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- ret = darray_make_room(entries, jset_u64s(u64s));
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
- goto err;
+ return ret;
- l = (void *) &darray_top(*entries);
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 1;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- memcpy(l->d, buf.buf, buf.pos);
- while (buf.pos & 7)
- l->d[buf.pos++] = '\0';
-
- entries->nr += jset_u64s(u64s);
-err:
- printbuf_exit(&buf);
- return ret;
+ struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
+ journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf->buf, buf->pos);
+ return 0;
}
__printf(3, 0)
@@ -908,16 +815,32 @@ static int
__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
va_list args)
{
- int ret;
+ struct printbuf buf = PRINTBUF;
+ prt_vprintf(&buf, fmt, args);
+
+ unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+ prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
+
+ int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
+ journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf.buf, buf.pos);
+ c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|commit_flags,
- __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+ BCH_TRANS_COMMIT_lazy_rw|commit_flags,
+ __bch2_trans_log_msg(trans, &buf, u64s));
}
-
+err:
+ printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 9816d2286540..b9382b7b288b 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error" \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
- __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
- __BTREE_INSERT_NOCHECK_RW,
- __BTREE_INSERT_LAZY_RW,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RECLAIM,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
@@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ return bch2_btree_bit_mod(trans, btree, pos, false);
+}
+
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
struct bpos, struct bpos);
@@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_buffered(struct btree_trans *,
- enum btree_id, struct bkey_i *);
+
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+
+static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
+{
+ return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+}
+
+static inline struct jset_entry *
+bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+{
+ if (!trans->journal_entries ||
+ trans->journal_entries_u64s + u64s > trans->journal_entries_size)
+ return __bch2_trans_jset_entry_alloc(trans, u64s);
+
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s += u64s;
+ return e;
+}
+
+int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+
+static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+
+ journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
+ bkey_copy(e->start, k);
+ return 0;
+}
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
@@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
- for ((_i) = (_trans)->updates; \
+ for (struct btree_insert_entry *_i = (_trans)->updates; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-#define trans_for_each_wb_update(_trans, _i) \
- for ((_i) = (_trans)->wb_updates; \
- (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
- (_i)++)
-
static inline void bch2_trans_reset_updates(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
bch2_path_put(trans, i->path, true);
- trans->extra_journal_res = 0;
trans->nr_updates = 0;
- trans->nr_wb_updates = 0;
- trans->wb_updates = NULL;
+ trans->journal_entries_u64s = 0;
trans->hooks = NULL;
- trans->extra_journal_entries.nr = 0;
+ trans->extra_disk_res = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 239fcc3c7c99..17a5938aa71a 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -25,24 +25,24 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_path *, struct btree *,
+ btree_path_idx_t, struct btree *,
struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
+static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
{
- struct btree_path *path;
-
- path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
BTREE_ITER_NOPRESERVE|
BTREE_ITER_INTENT, _RET_IP_);
- path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
bch2_btree_path_downgrade(trans, path);
__bch2_btree_path_unlock(trans, path);
- return path;
+ return path_idx;
}
/* Debug code: */
@@ -159,14 +159,16 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
-static void __btree_node_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
- trace_and_count(c, btree_node_free, c, b);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, btree_node_free, trans, b);
BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
@@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
struct btree_path *path;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -363,7 +365,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as,
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_and_count(c, btree_node_alloc, c, b);
+ trace_and_count(c, btree_node_alloc, trans, b);
bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
@@ -453,7 +455,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -466,7 +468,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
unsigned flags,
struct closure *cl)
{
- struct bch_fs *c = as->c;
struct btree *b;
unsigned interior;
int ret = 0;
@@ -476,11 +477,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
- *
- * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
- * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, cl);
if (ret)
return ret;
@@ -488,9 +486,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT ? NULL : cl,
- interior, flags);
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -500,7 +497,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
}
}
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -559,24 +556,20 @@ static void btree_update_add_key(struct btree_update *as,
static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as)
{
- struct bkey_i *k;
- int ret;
-
- ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- memcpy(&darray_top(trans->extra_journal_entries),
- as->journal_entries,
- as->journal_u64s * sizeof(u64));
- trans->extra_journal_entries.nr += as->journal_u64s;
+ memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->old_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+ ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -584,7 +577,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
for_each_keylist_key(&as->new_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+ ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -645,9 +639,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
@@ -655,10 +649,11 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s(): error %s", __func__, bch2_err_str(ret));
err:
if (as->b) {
- struct btree_path *path;
b = as->b;
- path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+ btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
/*
* @b is the node we did the final insert into:
*
@@ -728,7 +723,7 @@ err:
btree_node_write_if_need(c, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path, true);
+ bch2_path_put(trans, path_idx, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -815,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
@@ -825,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -934,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr];
}
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
@@ -985,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1039,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
- int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
@@ -1057,7 +1067,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < c->journal.watermark) {
struct journal_res res = { 0 };
@@ -1087,16 +1097,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock)) {
+ if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
@@ -1110,7 +1118,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
@@ -1153,7 +1161,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
@@ -1183,6 +1191,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
return as;
err:
bch2_btree_update_free(as, trans);
+ if (!bch2_err_matches(ret, ENOSPC) &&
+ !bch2_err_matches(ret, EROFS))
+ bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1214,7 +1225,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_and_count(c, btree_node_set_root, c, b);
+ trace_and_count(c, btree_node_set_root, trans, b);
old = btree_node_root(c, b);
@@ -1390,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@@ -1445,10 +1456,12 @@ static void __btree_split_node(struct btree_update *as,
*/
static void btree_split_insert_keys(struct btree_update *as,
struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
struct btree *b,
struct keylist *keys)
{
+ struct btree_path *path = trans->paths + path_idx;
+
if (!bch2_keylist_empty(keys) &&
bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
struct btree_node_iter node_iter;
@@ -1462,25 +1475,25 @@ static void btree_split_insert_keys(struct btree_update *as,
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(path, b);
+ struct btree *parent = btree_node_parent(trans->paths + path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
- struct btree_path *path1 = NULL, *path2 = NULL;
+ btree_path_idx_t path1 = 0, path2 = 0;
u64 start_time = local_clock();
int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+ BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
- trace_and_count(c, btree_node_split, c, b);
+ trace_and_count(c, btree_node_split, trans, b);
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -1501,15 +1514,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n2);
+ mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1526,11 +1539,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n3);
six_unlock_write(&n3->c.lock);
- path2->locks_want++;
- BUG_ON(btree_node_locked(path2, n3->c.level));
+ trans->paths[path2].locks_want++;
+ BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n3);
+ mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@@ -1538,7 +1551,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
- trace_and_count(c, btree_node_compact, c, b);
+ trace_and_count(c, btree_node_compact, trans, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
@@ -1551,10 +1564,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1568,10 +1581,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (ret)
goto err;
} else if (n3) {
- bch2_btree_set_root(as, trans, path, n3);
+ bch2_btree_set_root(as, trans, trans->paths + path, n3);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, path, n1);
+ bch2_btree_set_root(as, trans, trans->paths + path, n1);
}
if (n3) {
@@ -1591,13 +1604,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
if (n3)
- bch2_trans_node_add(trans, n3);
+ bch2_trans_node_add(trans, trans->paths + path, n3);
if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
+ bch2_trans_node_add(trans, trans->paths + path2, n2);
+ bch2_trans_node_add(trans, trans->paths + path1, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
@@ -1606,11 +1619,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_intent(&n1->c.lock);
out:
if (path2) {
- __bch2_btree_path_unlock(trans, path2);
+ __bch2_btree_path_unlock(trans, trans->paths + path2);
bch2_path_put(trans, path2, true);
}
if (path1) {
- __bch2_btree_path_unlock(trans, path1);
+ __bch2_btree_path_unlock(trans, trans->paths + path1);
bch2_path_put(trans, path1, true);
}
@@ -1638,13 +1651,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
struct keylist *keys)
{
struct btree_path *linked;
+ unsigned i;
__bch2_btree_insert_keys_interior(as, trans, path, b,
path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_trans_verify_paths(trans);
@@ -1655,7 +1669,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*
* @as: btree_update object
* @trans: btree_trans object
- * @path: path that points to current node
+ * @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
* @flags: transaction commit flags
@@ -1667,10 +1681,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path_idx, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1688,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
@@ -1723,19 +1738,22 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned flags)
{
- struct btree *b = path_l(path)->b;
+ /* btree_split & merge may both cause paths array to be reallocated */
+
+ struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
- as = bch2_btree_update_start(trans, path, path->level,
+ as = bch2_btree_update_start(trans, trans->paths + path,
+ trans->paths[path].level,
true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1748,20 +1766,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
bch2_btree_update_done(as, trans);
- for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
+ for (l = trans->paths[path].level + 1;
+ btree_node_intent_locked(&trans->paths[path], l) && !ret;
+ l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_path *sib_path = NULL, *new_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
@@ -1769,13 +1788,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s;
+ enum btree_id btree = trans->paths[path].btree_id;
+ btree_path_idx_t sib_path = 0, new_path = 0;
u64 start_time = local_clock();
int ret = 0;
- BUG_ON(!path->should_be_locked);
- BUG_ON(!btree_node_locked(path, level));
+ BUG_ON(!trans->paths[path].should_be_locked);
+ BUG_ON(!btree_node_locked(&trans->paths[path], level));
- b = path->l[level].b;
+ b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
(sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
@@ -1787,18 +1808,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ sib_path = bch2_path_get(trans, btree, sib_pos,
U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- btree_path_set_should_be_locked(sib_path);
+ btree_path_set_should_be_locked(trans->paths + sib_path);
- m = sib_path->l[level].b;
+ m = trans->paths[sib_path].l[level].b;
- if (btree_node_parent(path, b) !=
- btree_node_parent(sib_path, m)) {
+ if (btree_node_parent(trans->paths + path, b) !=
+ btree_node_parent(trans->paths + sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1851,14 +1872,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level, false,
- BTREE_INSERT_NOFAIL|flags);
+ parent = btree_node_parent(trans->paths + path, b);
+ as = bch2_btree_update_start(trans, trans->paths + path, level, false,
+ BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
- trace_and_count(c, btree_node_merge, c, b);
+ trace_and_count(c, btree_node_merge, trans, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
@@ -1882,10 +1903,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@@ -1903,10 +1924,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, path, b);
- bch2_btree_node_free_inmem(trans, sib_path, m);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + path, n);
bch2_trans_verify_paths(trans);
@@ -1934,16 +1955,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_path *new_path = NULL;
struct btree *n, *parent;
struct btree_update *as;
+ btree_path_idx_t new_path = 0;
int ret;
- flags |= BTREE_INSERT_NOFAIL;
+ flags |= BCH_TRANS_COMMIT_no_enospc;
- parent = btree_node_parent(iter->path, b);
- as = bch2_btree_update_start(trans, iter->path, b->c.level,
- false, flags);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto out;
@@ -1958,27 +1979,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
- trace_and_count(c, btree_node_rewrite, c, b);
+ trace_and_count(c, btree_node_rewrite, trans, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent,
- &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path,
+ parent, &as->parent_keys, flags);
if (ret)
goto err;
} else {
- bch2_btree_set_root(as, trans, iter->path, n);
+ bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
}
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, iter->path, b);
+ bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + iter->path, n);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as, trans);
@@ -2047,8 +2068,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2071,7 +2091,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
mutex_lock(&c->pending_node_rewrites_lock);
list_add(&a->list, &c->pending_node_rewrites);
mutex_unlock(&c->pending_node_rewrites_lock);
@@ -2079,15 +2099,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
}
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (test_bit(BCH_FS_started, &c->flags)) {
bch_err(c, "%s: error getting c->writes ref", __func__);
kfree(a);
return;
}
ret = bch2_fs_read_write_early(c);
+ bch_err_msg(c, ret, "going read-write");
if (ret) {
- bch_err_msg(c, ret, "going read-write");
kfree(a);
return;
}
@@ -2138,13 +2158,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key), 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
- new_key, 0);
+ ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key),
+ BTREE_TRIGGER_TRANSACTIONAL) ?:
+ bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s(new_key),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -2156,7 +2175,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
BUG_ON(ret);
}
- parent = btree_node_parent(iter->path, b);
+ parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
bch2_trans_copy_iter(&iter2, iter);
@@ -2164,10 +2183,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
iter2.flags & BTREE_ITER_INTENT,
_THIS_IP_);
- BUG_ON(iter2.path->level != b->c.level);
- BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+ struct btree_path *path2 = btree_iter_path(trans, &iter2);
+ BUG_ON(path2->level != b->c.level);
+ BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
- btree_path_set_level_up(trans, iter2.path);
+ btree_path_set_level_up(trans, path2);
trans->paths_sorted = false;
@@ -2178,23 +2198,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
} else {
BUG_ON(btree_node_root(c, b) != b);
- ret = darray_make_room(&trans->extra_journal_entries,
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
jset_u64s(new_key->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ journal_entry_set(e,
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
new_key, new_key->k.u64s);
- trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
if (ret)
goto err;
- bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+ bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -2209,7 +2229,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bkey_copy(&b->key, new_key);
}
- bch2_btree_node_unlock_write(trans, iter->path, b);
+ bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
bch2_trans_iter_exit(trans, &iter2);
return ret;
@@ -2228,7 +2248,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
{
struct bch_fs *c = trans->c;
struct btree *new_hash = NULL;
- struct btree_path *path = iter->path;
+ struct btree_path *path = btree_iter_path(trans, iter);
struct closure cl;
int ret = 0;
@@ -2243,7 +2263,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
if (ret) {
ret = drop_locks_do(trans, (closure_sync(&cl), 0));
if (ret)
@@ -2267,7 +2287,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
six_unlock_intent(&new_hash->c.lock);
}
closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -2286,7 +2306,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
goto out;
/* has node been freed? */
- if (iter.path->l[b->c.level].b != b) {
+ if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
@@ -2328,12 +2348,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index a6668992a272..c593c925d1e3 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
+ struct btree_path *path = trans->paths + path_idx;
struct btree *b;
EBUG_ON(!btree_node_locked(path, level));
@@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
- return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags)
{
@@ -183,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -220,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size >> 3;
+ ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@@ -234,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -259,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, bne->keys.start);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -280,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
- BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@@ -298,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 4e6241db518b..ac7844861966 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -7,45 +7,143 @@
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
-#include <linux/sort.h>
+#include <linux/prefetch.h>
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
+
+static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ return (cmp_int(l->hi, r->hi) ?:
+ cmp_int(l->mi, r->mi) ?:
+ cmp_int(l->lo, r->lo)) >= 0;
+}
- return cmp_int(l->btree, r->btree) ?:
- bpos_cmp(l->k.k.p, r->k.k.p) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+#ifdef CONFIG_X86_64
+ int cmp;
+
+ asm("mov (%[l]), %%rax;"
+ "sub (%[r]), %%rax;"
+ "mov 8(%[l]), %%rax;"
+ "sbb 8(%[r]), %%rax;"
+ "mov 16(%[l]), %%rax;"
+ "sbb 16(%[r]), %%rax;"
+ : "=@ccae" (cmp)
+ : [l] "r" (l), [r] "r" (r)
+ : "rax", "cc");
+
+ EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
+ return cmp;
+#else
+ return __wb_key_ref_cmp(l, r);
+#endif
}
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ const struct wb_key_ref *l = _l;
+ const struct wb_key_ref *r = _r;
- return cmp_int(l->journal_seq, r->journal_seq);
+ return !((l->hi ^ r->hi)|
+ (l->mi ^ r->mi)|
+ ((l->lo >> 24) ^ (r->lo >> 24)));
}
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- unsigned commit_flags,
- bool *write_locked,
- size_t *fast)
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+ size_t n = num, a = num / 2;
+
+ if (!a) /* num < 2 || size == 0 */
+ return;
+
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ --a;
+ else if (--n) /* Sorting: Extract root to --n */
+ swap(base[0], base[n]);
+ else /* Sort complete */
+ break;
+
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+ b = wb_key_ref_cmp(base + c, base + d) ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && wb_key_ref_cmp(base + a, base + b))
+ b = (b - 1) / 2;
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = (b - 1) / 2;
+ swap(base[b], base[c]);
+ }
+ }
+}
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked, size_t *fast)
{
- struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
+ EBUG_ON(!wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
- path = iter->path;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (btree_iter_path(trans, iter)->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
+ path = btree_iter_path(trans, iter);
if (!*write_locked) {
ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
@@ -56,52 +154,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
*write_locked = true;
}
- if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
- goto trans_commit;
+ return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
return 0;
-trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
- union btree_write_buffer_state old, new;
- u64 v = READ_ONCE(wb->state.v);
-
- do {
- old.v = new.v = v;
-
- new.nr = 0;
- new.idx++;
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
- while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
- cpu_relax();
-
- smp_mb();
-
- return old;
}
/*
@@ -124,41 +184,87 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
- bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+ struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+ struct journal *j = &c->journal;
+
+ if (!wb->inc.keys.nr)
+ return;
+
+ bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+ darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+ swap(wb->flushing.keys, wb->inc.keys);
+ goto out;
+ }
+
+ size_t nr = min(darray_room(wb->flushing.keys),
+ wb->sorted.size - wb->flushing.keys.nr);
+ nr = min(nr, wb->inc.keys.nr);
+
+ memcpy(&darray_top(wb->flushing.keys),
+ wb->inc.keys.data,
+ sizeof(wb->inc.keys.data[0]) * nr);
+
+ memmove(wb->inc.keys.data,
+ wb->inc.keys.data + nr,
+ sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+ wb->flushing.keys.nr += nr;
+ wb->inc.keys.nr -= nr;
+out:
+ if (!wb->inc.keys.nr)
+ bch2_journal_pin_drop(j, &wb->inc.pin);
+ else
+ bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ if (j->watermark) {
+ spin_lock(&j->lock);
+ bch2_journal_set_watermark(j);
+ spin_unlock(&j->lock);
+ }
+
+ BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct journal_entry_pin pin;
- struct btree_write_buffered_key *i, *keys;
struct btree_iter iter = { NULL };
- size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
- union btree_write_buffer_state s;
int ret = 0;
- memset(&pin, 0, sizeof(pin));
-
- if (!locked && !mutex_trylock(&wb->flush_lock))
- return 0;
-
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
- bch2_journal_pin_drop(j, &wb->journal_pin);
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
- s = btree_write_buffer_switch(wb);
- keys = wb->keys[s.idx];
- nr = s.nr;
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+ mutex_unlock(&wb->inc.lock);
- if (race_fault())
- goto slowpath;
+ for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+ wb->sorted.data[i].idx = i;
+ wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+ memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+ }
+ wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@@ -168,208 +274,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_key_cmp, NULL);
+ wb_sort(wb->sorted.data, wb->sorted.nr);
+
+ darray_for_each(wb->sorted, i) {
+ struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+ for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+ prefetch(&wb->flushing.keys.data[n->idx]);
+
+ BUG_ON(!k->journal_seq);
+
+ if (i + 1 < &darray_top(wb->sorted) &&
+ wb_key_eq(i, i + 1)) {
+ struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- for (i = keys; i < keys + nr; i++) {
- if (i + 1 < keys + nr &&
- i[0].btree == i[1].btree &&
- bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
- i->journal_seq = 0;
+ n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+ k->journal_seq = 0;
continue;
}
- if (write_locked &&
- (iter.path->btree_id != i->btree ||
- bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
- write_locked = false;
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ if (path->btree_id != i->btree ||
+ bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ write_locked = false;
+ }
}
- if (!iter.path || iter.path->btree_id != i->btree) {
+ if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
- bch2_btree_iter_set_pos(&iter, i->k.k.p);
- iter.path->preserve = false;
+ bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ btree_iter_path(trans, &iter)->preserve = false;
do {
- ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
- commit_flags, &write_locked, &fast);
+ if (race_fault()) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ if (!ret) {
+ k->journal_seq = 0;
+ } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
- continue;
- }
- if (ret)
+ ret = 0;
+ } else
break;
-
- i->journal_seq = 0;
}
- if (write_locked)
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ }
bch2_trans_iter_exit(trans, &iter);
- trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
-
- if (slowpath)
- goto slowpath;
+ if (ret)
+ goto err;
+ if (slowpath) {
+ /*
+ * Flush in the order they were present in the journal, so that
+ * we can release journal pins:
+ * The fastpath zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+
+ darray_for_each(wb->flushing.keys, i) {
+ if (!i->journal_seq)
+ continue;
+
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ bch2_trans_begin(trans);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_write_buffered_insert(trans, i));
+ if (ret)
+ goto err;
+ }
+ }
+err:
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
- bch2_journal_pin_drop(j, &pin);
- mutex_unlock(&wb->flush_lock);
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
return ret;
-slowpath:
- trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+}
- /*
- * Now sort the rest by journal seq and bump the journal pin as we go.
- * The slowpath zapped the seq of keys that were successfully flushed so
- * we can skip those here.
- */
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_journal_cmp,
- NULL);
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_buf *buf;
+ int ret = 0;
- commit_flags &= ~BCH_WATERMARK_MASK;
- commit_flags |= BCH_WATERMARK_reclaim;
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
+ ret = bch2_journal_keys_to_write_buffer(c, buf);
+ mutex_unlock(&j->buf_lock);
+ }
- for (i = keys; i < keys + nr; i++) {
- if (!i->journal_seq)
- continue;
+ return ret;
+}
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0, fetch_from_journal_err;
- memset(&pin2, 0, sizeof(pin2));
+ do {
+ bch2_trans_unlock(trans);
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
- ret = commit_do(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
- btree_write_buffered_insert(trans, i));
- if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
- break;
- }
+ /*
+ * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+ * is not guaranteed to empty wb->inc:
+ */
+ mutex_lock(&wb->flushing.lock);
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ } while (!ret &&
+ (fetch_from_journal_err ||
+ (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
- goto out;
+ return ret;
}
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
{
- bch2_trans_unlock(trans);
- mutex_lock(&trans->c->btree_write_buffer.flush_lock);
- return __bch2_btree_write_buffer_flush(trans, 0, true);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
}
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
- return __bch2_btree_write_buffer_flush(trans, 0, false);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
+
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
}
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0;
- mutex_lock(&wb->flush_lock);
+ if (mutex_trylock(&wb->flushing.lock)) {
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ }
- return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+ return ret;
}
-static inline u64 btree_write_buffer_ref(int idx)
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
- return ((union btree_write_buffer_state) {
- .ref0 = idx == 0,
- .ref1 = idx == 1,
- }).v;
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+ return -BCH_ERR_erofs_no_writes;
+
+ int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ return ret;
}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key *i;
- union btree_write_buffer_state old, new;
- int ret = 0;
- u64 v;
+ int ret;
+
+ mutex_lock(&wb->flushing.lock);
+ do {
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ } while (!ret && bch2_btree_write_buffer_should_flush(c));
+ mutex_unlock(&wb->flushing.lock);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+}
- trans_for_each_wb_update(trans, i) {
- EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+retry:
+ ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+ if (!ret && dst->wb == &wb->flushing)
+ ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (unlikely(ret)) {
+ if (dst->wb == &c->btree_write_buffer.flushing) {
+ mutex_unlock(&dst->wb->lock);
+ dst->wb = &c->btree_write_buffer.inc;
+ bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+ goto retry;
+ }
- i->journal_seq = trans->journal_res.seq;
- i->journal_offset = trans->journal_res.offset;
+ return ret;
}
- preempt_disable();
- v = READ_ONCE(wb->state.v);
- do {
- old.v = new.v = v;
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ BUG_ON(!dst->room);
+ BUG_ON(!dst->seq);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
- new.v += btree_write_buffer_ref(new.idx);
- new.nr += trans->nr_wb_updates;
- if (new.nr > wb->size) {
- ret = -BCH_ERR_btree_insert_need_flush_buffer;
- goto out;
+ /*
+ * Attempt to skip wb->inc, and add keys directly to
+ * wb->flushing, saving us a copy later:
+ */
+
+ if (!wb->inc.keys.nr) {
+ dst->wb = &wb->flushing;
+ } else {
+ mutex_unlock(&wb->flushing.lock);
+ dst->wb = &wb->inc;
}
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+ } else {
+ mutex_lock(&wb->inc.lock);
+ dst->wb = &wb->inc;
+ }
- memcpy(wb->keys[new.idx] + old.nr,
- trans->wb_updates,
- sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ dst->seq = seq;
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+}
- atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (!dst->wb->keys.nr)
+ bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+ if (bch2_btree_write_buffer_should_flush(c) &&
+ __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+ if (dst->wb == &wb->flushing)
+ mutex_unlock(&wb->flushing.lock);
+ mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+
+ buf->need_flush_to_write_buffer = false;
out:
- preempt_enable();
+ bch2_journal_keys_to_write_buffer_end(c, &dst);
+ return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+ if (wb->keys.size >= new_size)
+ return 0;
+
+ if (!mutex_trylock(&wb->lock))
+ return -EINTR;
+
+ int ret = darray_resize(&wb->keys, new_size);
+ mutex_unlock(&wb->lock);
return ret;
}
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb_keys_resize(&wb->flushing, new_size) ?:
+ wb_keys_resize(&wb->inc, new_size);
+}
+
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+ BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+ !bch2_journal_error(&c->journal));
- kvfree(wb->keys[1]);
- kvfree(wb->keys[0]);
+ darray_exit(&wb->sorted);
+ darray_exit(&wb->flushing.keys);
+ darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- mutex_init(&wb->flush_lock);
- wb->size = c->opts.btree_write_buffer_size;
+ mutex_init(&wb->inc.lock);
+ mutex_init(&wb->flushing.lock);
+ INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
- wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
- wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
- if (!wb->keys[0] || !wb->keys[1])
- return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+ /* Will be resized by journal as needed: */
+ unsigned initial_size = 1 << 16;
- return 0;
+ return darray_make_room(&wb->inc.keys, initial_size) ?:
+ darray_make_room(&wb->flushing.keys, initial_size) ?:
+ darray_make_room(&wb->sorted, initial_size);
}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 322df1c8304e..eebcd2b15249 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -2,12 +2,59 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
+
+struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+struct journal_keys_to_wb {
+ struct btree_write_buffer_keys *wb;
+ size_t room;
+ u64 seq;
+};
+
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
+ struct journal_keys_to_wb *,
+ enum btree_id, struct bkey_i *);
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ if (unlikely(!dst->room))
+ return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index 99993ba77aea..9b9433de9c36 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -2,43 +2,56 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-struct btree_write_buffered_key {
- u64 journal_seq;
- unsigned journal_offset;
- enum btree_id btree;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-union btree_write_buffer_state {
+struct wb_key_ref {
+union {
struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned idx:24;
+ u8 pos[sizeof(struct bpos)];
+ enum btree_id btree:8;
+#else
+ enum btree_id btree:8;
+ u8 pos[sizeof(struct bpos)];
+ unsigned idx:24;
+#endif
+ } __packed;
struct {
- u64 nr:23;
- u64 idx:1;
- u64 ref0:20;
- u64 ref1:20;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ u64 lo;
+ u64 mi;
+ u64 hi;
+#else
+ u64 hi;
+ u64 mi;
+ u64 lo;
+#endif
};
};
+};
-struct btree_write_buffer {
- struct mutex flush_lock;
- struct journal_entry_pin journal_pin;
+struct btree_write_buffered_key {
+ enum btree_id btree:8;
+ u64 journal_seq:56;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
- union btree_write_buffer_state state;
- size_t size;
+struct btree_write_buffer_keys {
+ DARRAY(struct btree_write_buffered_key) keys;
+ struct journal_entry_pin pin;
+ struct mutex lock;
+};
- struct btree_write_buffered_key *keys[2];
+struct btree_write_buffer {
+ DARRAY(struct wb_key_ref) sorted;
+ struct btree_write_buffer_keys inc;
+ struct btree_write_buffer_keys flushing;
+ struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5a91d3189fcf..54f7826ac498 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -25,7 +25,7 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@@ -47,31 +47,27 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
void bch2_fs_usage_initialize(struct bch_fs *c)
{
- struct bch_fs_usage *usage;
- struct bch_dev *ca;
- unsigned i;
-
percpu_down_write(&c->mark_lock);
- usage = c->usage_base;
+ struct bch_fs_usage *usage = c->usage_base;
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->reserved += usage->persistent_reserved[i];
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
+ usage->b.reserved += usage->persistent_reserved[i];
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
+ usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@@ -158,8 +154,7 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- struct bch_dev *ca;
- unsigned i, u64s = fs_usage_u64s(c);
+ unsigned u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
+ for_each_member_device_rcu(c, ca, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
@@ -193,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.hidden);
+ fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.data);
+ fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.cached);
+ fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.reserved);
+ fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.nr_inodes);
+ fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
@@ -230,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->u.hidden +
- fs_usage->u.btree +
- fs_usage->u.data +
- reserve_factor(fs_usage->u.reserved +
+ return min(fs_usage->u.b.hidden +
+ fs_usage->u.b.btree +
+ fs_usage->u.b.data +
+ reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -245,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
- bch2_fs_usage_read_one(c, &c->usage_base->btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca)
ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}
-static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
+void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
- return a.dirty_sectors
- ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
- : 0;
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
+ bch2_prt_data_type(out, i);
+ prt_tab(out);
+ prt_u64(out, usage->d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
}
-static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bch_alloc_v4 old,
- struct bch_alloc_v4 new,
- u64 journal_seq, bool gc)
+void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
@@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- if (data_type_is_hidden(old.data_type))
- fs_usage->hidden -= ca->mi.bucket_size;
- if (data_type_is_hidden(new.data_type))
- fs_usage->hidden += ca->mi.bucket_size;
+ if (data_type_is_hidden(old->data_type))
+ fs_usage->b.hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new->data_type))
+ fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
- u->d[old.data_type].buckets--;
- u->d[new.data_type].buckets++;
-
- u->buckets_ec -= (int) !!old.stripe;
- u->buckets_ec += (int) !!new.stripe;
+ u->d[old->data_type].buckets--;
+ u->d[new->data_type].buckets++;
- u->d[old.data_type].sectors -= old.dirty_sectors;
- u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
+ u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
- u->d[BCH_DATA_cached].sectors += new.cached_sectors;
- u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+ u->d[BCH_DATA_cached].sectors += new->cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
- u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
- u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+ u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
+ u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
preempt_enable();
}
-static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket old, struct bucket new,
- u64 journal_seq, bool gc)
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
- struct bch_alloc_v4 old_a = {
- .gen = old.gen,
- .data_type = old.data_type,
- .dirty_sectors = old.dirty_sectors,
- .cached_sectors = old.cached_sectors,
- .stripe = old.stripe,
- };
- struct bch_alloc_v4 new_a = {
- .gen = new.gen,
- .data_type = new.data_type,
- .dirty_sectors = new.dirty_sectors,
- .cached_sectors = new.cached_sectors,
- .stripe = new.stripe,
+ return (struct bch_alloc_v4) {
+ .gen = b.gen,
+ .data_type = b.data_type,
+ .dirty_sectors = b.dirty_sectors,
+ .cached_sectors = b.cached_sectors,
+ .stripe = b.stripe,
};
+}
- bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *old, struct bucket *new)
+{
+ struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
+ struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
+
+ bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
}
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
@@ -353,14 +359,14 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
-static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry *r, s64 sectors,
- unsigned journal_seq, bool gc)
+int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_replicas_entry_v1 *r, s64 sectors,
+ unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
int idx, ret = 0;
@@ -388,7 +394,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c,
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+ return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}
static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
@@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
__replicas_deltas_realloc(trans, more, _gfp));
}
-static inline int update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry *r,
- s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry_v1 *r,
+ s64 sectors)
{
struct replicas_delta_list *d;
struct replicas_delta *n;
@@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
return 0;
}
-static inline int update_cached_sectors_list(struct btree_trans *trans,
- unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas_list(trans, &r.e, sectors);
-}
-
-int bch2_mark_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- u64 bucket_journal_seq;
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, new_a_convert;
- const struct bch_alloc_v4 *old_a, *new_a;
- struct bch_dev *ca;
- int ret = 0;
-
- /*
- * alloc btree is read in by bch2_alloc_read, not gc:
- */
- if ((flags & BTREE_TRIGGER_GC) &&
- !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
- return 0;
-
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
- return -EIO;
-
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = bch2_alloc_to_v4(new, &new_a_convert);
-
- bucket_journal_seq = new_a->journal_seq;
-
- if ((flags & BTREE_TRIGGER_INSERT) &&
- data_type_is_empty(old_a->data_type) !=
- data_type_is_empty(new_a->data_type) &&
- new.k->type == KEY_TYPE_alloc_v4) {
- struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
-
- EBUG_ON(!journal_seq);
-
- /*
- * If the btree updates referring to a bucket weren't flushed
- * before the bucket became empty again, then the we don't have
- * to wait on a journal flush before we can reuse the bucket:
- */
- v->journal_seq = bucket_journal_seq =
- data_type_is_empty(new_a->data_type) &&
- (journal_seq == v->journal_seq ||
- bch2_journal_noflush_seq(&c->journal, v->journal_seq))
- ? 0 : journal_seq;
- }
-
- if (!data_type_is_empty(old_a->data_type) &&
- data_type_is_empty(new_a->data_type) &&
- bucket_journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- bucket_journal_seq);
- if (ret) {
- bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
- return ret;
- }
- }
-
- percpu_down_read(&c->mark_lock);
- if (!gc && new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
-
- bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
-
- if (gc) {
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
- bucket_lock(g);
-
- g->gen_valid = 1;
- g->gen = new_a->gen;
- g->data_type = new_a->data_type;
- g->stripe = new_a->stripe;
- g->stripe_redundancy = new_a->stripe_redundancy;
- g->dirty_sectors = new_a->dirty_sectors;
- g->cached_sectors = new_a->cached_sectors;
-
- bucket_unlock(g);
- }
- percpu_up_read(&c->mark_lock);
-
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_a->cached_sectors) {
- ret = update_cached_sectors(c, new, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- journal_seq, gc);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
-
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
- closure_wake_up(&c->freelist_wait);
-
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
-
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
-
- if (new_a->data_type == BCH_DATA_need_gc_gens)
- bch2_do_gc_gens(c);
-
- return 0;
+ return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -643,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
- bch2_data_types[g->data_type],
- bch2_data_types[data_type])) {
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@@ -652,37 +532,33 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
- bch2_data_types[g->data_type ?: data_type],
+ bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
}
-
g->data_type = data_type;
g->dirty_sectors += sectors;
new = *g;
err:
bucket_unlock(g);
if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
percpu_up_read(&c->mark_lock);
return ret;
}
-static int check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 dirty_sectors, u32 cached_sectors)
+int bch2_check_bucket_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 bucket_sectors)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- u32 bucket_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -699,7 +575,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -712,7 +588,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -727,7 +603,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -748,8 +624,8 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type],
- bch2_data_types[ptr_data_type],
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -762,7 +638,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -777,508 +653,6 @@ err:
goto out;
}
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- u64 journal_seq = trans->journal_res.seq;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- /* * XXX doesn't handle deletion */
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
-
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
- goto err;
- }
-
- bucket_lock(g);
- old = *g;
-
- ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors, g->cached_sectors);
- if (ret)
- goto err;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
-
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
-err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
-{
- u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type,
- *dirty_sectors, *cached_sectors);
-
- if (ret)
- return ret;
-
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
- return 0;
-}
-
-static int bch2_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- s64 sectors,
- unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket old, new, *g;
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
- u8 bucket_data_type;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, &p.ptr);
- bucket_lock(g);
- old = *g;
-
- bucket_data_type = g->data_type;
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (!ret)
- g->data_type = bucket_data_type;
-
- new = *g;
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-static int bch2_mark_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bch_extent_stripe_ptr p,
- enum bch_data_type data_type,
- s64 sectors,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_replicas_padded r;
- struct gc_stripe *m;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
- }
-
- mutex_lock(&c->ec_stripes_heap_lock);
-
- if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- bch2_inconsistent_error(c);
- return -EIO;
- }
-
- m->block_sectors[p.block] += sectors;
-
- r = m->r;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- r.e.data_type = data_type;
- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
-
- return 0;
-}
-
-static int __mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_replicas_padded r;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
- s64 dirty_sectors = 0;
- bool stale;
- int ret;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- r.e.data_type = data_type;
- r.e.nr_devs = 0;
- r.e.nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- stale = ret > 0;
-
- if (p.ptr.cached) {
- if (!stale) {
- ret = update_cached_sectors(c, k, p.ptr.dev,
- disk_sectors, journal_seq, true);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
- } else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
- r.e.devs[r.e.nr_devs++] = p.ptr.dev;
- } else {
- ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
- disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- r.e.nr_required = 0;
- }
- }
-
- if (r.e.nr_devs) {
- ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-int bch2_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
- unsigned i;
- int ret;
-
- BUG_ON(gc && old_s);
-
- if (!gc) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- } else {
- struct gc_stripe *m =
- genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- */
- m->alive = true;
- m->sectors = le16_to_cpu(new_s->sectors);
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->ptrs[i] = new_s->ptrs[i];
-
- bch2_bkey_to_replicas(&m->r.e, new);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
-
- ret = update_replicas(c, new, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- journal_seq, gc);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-static int __mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
- sectors *= replicas;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
-
- fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(fs_usage->persistent_reserved));
-
- fs_usage->reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- return 0;
-}
-
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- u64 *idx, unsigned flags, size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- u64 next_idx = end;
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i_error *new;
-
- new = bch2_trans_kmalloc(trans, sizeof(*new));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_init(&new->k);
- new->k.type = KEY_TYPE_error;
- new->k.p = bkey_start_pos(p.k);
- new->k.p.offset += *idx - start;
- bch2_key_resize(&new->k, next_idx - *idx);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
- BTREE_TRIGGER_NORUN);
- }
-
- *idx = next_idx;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- struct reflink_gc *ref;
- size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx), start = idx;
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
- idx -= le32_to_cpu(p.v->front_pad);
- end += le32_to_cpu(p.v->back_pad);
- }
-
- l = 0;
- r = c->reflink_gc_nr;
- while (l < r) {
- m = l + (r - l) / 2;
-
- ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(trans, p, start, end,
- &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1303,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
- dst->nr_inodes -= deltas->nr_inodes;
+ dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
- dst->reserved -= deltas->persistent_reserved[i];
+ dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@@ -1320,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- s64 added = 0, should_not_have_added;
- unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+ struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
-
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
- }
-
- dst->nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added += deltas->persistent_reserved[i];
- dst->reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
+ s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
- should_not_have_added = added - (s64) disk_res_sectors;
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@@ -1380,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
+ dst->hidden += src->hidden;
+ dst->btree += src->btree;
+ dst->data += src->data;
+ dst->cached += src->cached;
+ dst->reserved += src->reserved;
+ dst->nr_inodes += src->nr_inodes;
+
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -1387,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d))
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+
+ dst->b.nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ dst->b.reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@@ -1398,92 +784,184 @@ need_mark:
return -1;
}
-/* trans_mark: */
+/* KEY_TYPE_extent: */
+
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u32 *dirty_sectors, u32 *cached_sectors)
+{
+ u32 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type, *dst_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
+
+ if (!*dirty_sectors && !*cached_sectors)
+ *bucket_data_type = 0;
+ else if (*bucket_data_type != BCH_DATA_stripe)
+ *bucket_data_type = ptr_data_type;
+
+ return 0;
+}
-static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- unsigned flags)
+static int bch2_trigger_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ s64 *sectors,
+ unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
struct bpos bucket;
struct bch_backpointer bp;
- s64 sectors;
- int ret;
bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
- sectors = bp.bucket_len;
- if (!insert)
- sectors = -sectors;
+ *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
- a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
+ ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+ bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- return ret;
-
- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
if (ret)
return ret;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ bucket_lock(g);
+ struct bucket old = *g;
+
+ u8 bucket_data_type = g->data_type;
+ int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (ret) {
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+ }
+
+ g->data_type = bucket_data_type;
+ struct bucket new = *g;
+ bucket_unlock(g);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
+ percpu_up_read(&c->mark_lock);
}
return 0;
}
-static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
- struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ enum bch_data_type data_type,
+ s64 sectors, unsigned flags)
{
- struct btree_iter iter;
- struct bkey_i_stripe *s;
- struct bch_replicas_padded r;
- int ret = 0;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_WITH_UPDATES, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
- s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
- ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto err;
+ }
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = -EIO;
- goto err;
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ ret = bch2_update_replicas_list(trans, &r.e, sectors);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
- r.e.data_type = data_type;
- ret = update_replicas_list(trans, &r.e, sectors);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.ec.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
+ (u64) p.ec.idx, buf.buf);
+ printbuf_exit(&buf);
+ bch2_inconsistent_error(c);
+ return -EIO;
+ }
+
+ m->block_sectors[p.ec.block] += sectors;
+
+ struct bch_replicas_padded r = m->r;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+ }
+
+ return 0;
}
-static int __trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+static int __trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1492,11 +970,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
s64 dirty_sectors = 0;
- bool stale;
int ret = 0;
r.e.data_type = data_type;
@@ -1504,21 +978,20 @@ static int __trans_mark_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+ s64 disk_sectors;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
if (ret < 0)
return ret;
- stale = ret > 0;
+ bool stale = ret > 0;
if (p.ptr.cached) {
if (!stale) {
- ret = update_cached_sectors_list(trans, p.ptr.dev,
- disk_sectors);
+ ret = !gc
+ ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
+ : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
if (ret)
return ret;
}
@@ -1526,324 +999,122 @@ static int __trans_mark_extent(struct btree_trans *trans,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_trans_mark_stripe_ptr(trans, p,
- disk_sectors, data_type);
+ ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
return ret;
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
r.e.nr_required = 0;
}
}
- if (r.e.nr_devs)
- ret = update_replicas_list(trans, &r.e, dirty_sectors);
-
- return ret;
-}
-
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (r.e.nr_devs) {
+ ret = !gc
+ ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
+ : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
+ if (unlikely(ret && gc)) {
+ struct printbuf buf = PRINTBUF;
- if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
if (ret)
return ret;
}
- return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
- int ret = 0;
-
- if (deleting)
- sectors = -sectors;
-
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors, a->v.cached_sectors);
- if (ret)
- goto err;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
- } else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
- }
-
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-int bch2_trans_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bch_stripe *old_s = NULL;
- struct bch_stripe *new_s = NULL;
- struct bch_replicas_padded r;
- unsigned i, nr_blocks;
- int ret = 0;
-
- if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old).v;
- if (new->k.type == KEY_TYPE_stripe)
- new_s = &bkey_i_to_stripe(new)->v;
-
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
+ struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
+ unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
+ unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+
+ /* if pointers aren't changing - nothing to do: */
+ if (new_ptrs_bytes == old_ptrs_bytes &&
+ !memcmp(new_ptrs.start,
+ old_ptrs.start,
+ new_ptrs_bytes))
return 0;
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
+ (int) bch2_bkey_needs_rebalance(c, old);
- nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- if (new_s) {
- s64 sectors = le16_to_cpu(new_s->sectors);
-
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
- ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
-
- bch2_bkey_to_replicas(&r.e, old);
- ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_i_to_s_c_stripe(new), i, false);
- if (ret)
- break;
- }
-
- if (old_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
+ return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+
+ return 0;
}
-static int __trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+/* KEY_TYPE_reservation */
+
+static int __trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ struct bch_fs *c = trans->c;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
- struct replicas_delta_list *d;
- int ret;
+ s64 sectors = (s64) k.k->size * replicas;
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
- sectors *= replicas;
-
- ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
- d = trans->fs_usage_deltas;
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->persistent_reserved));
-
- d->persistent_reserved[replicas - 1] += sectors;
- return 0;
-}
-
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static int trans_mark_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- struct printbuf buf = PRINTBUF;
- int ret;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+ replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
- refcount = bkey_refcount(k);
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ d->persistent_reserved[replicas - 1] += sectors;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
- }
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
+ struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
+ replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
+ fs_usage->b.reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
}
- le64_add_cpu(refcount, add);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
- if (ret)
- goto err;
-
- *idx = k->k.p.offset;
-err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
+ return 0;
}
-static int __trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+int bch2_trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx, end_idx;
- int ret = 0;
-
- idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- end_idx = le64_to_cpu(p.v->idx) + p.k->size +
- le32_to_cpu(p.v->back_pad);
-
- while (idx < end_idx && !ret)
- ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
- return ret;
+ return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
+/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
@@ -1871,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- bch2_data_types[type],
- bch2_data_types[type]);
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
ret = -EIO;
goto err;
}
@@ -1974,17 +1245,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
int ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 21f6cb356921..6387e039f789 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
}
void bch2_dev_usage_init(struct bch_dev *);
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
+void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, u64, bool);
+void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
+ struct bucket *, struct bucket *);
+
/* key/bucket marking: */
static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
@@ -315,43 +322,41 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
+int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
+ struct bch_replicas_entry_v1 *, s64,
+ unsigned, bool);
+int bch2_update_replicas_list(struct btree_trans *,
+ struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
+int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
+ const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32);
+
int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-
-int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-
-#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
ret; \
})
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
- mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@@ -382,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+ return type < BCH_DATA_NR
+ ? __bch2_data_types[type]
+ : "(invalid data type)";
+}
+
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+ if (type < BCH_DATA_NR)
+ prt_str(out, __bch2_data_types[type]);
+ else
+ prt_printf(out, "(invalid data type %u)", type);
+}
+
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 2a9dab9006ef..6a31740222a7 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -33,8 +33,6 @@ struct bucket_gens {
};
struct bch_dev_usage {
- u64 buckets_ec;
-
struct {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
@@ -47,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
+};
- /* XXX: add stats for compression ratio */
-#if 0
- u64 uncompressed;
- u64 compressed;
-#endif
-
- /* broken out: */
-
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4bb88aefed12..226b39c17667 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,22 +7,27 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
+#include "recovery.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
+#include "thread_with_file.h"
-#include <linux/anon_inodes.h>
#include <linux/cdev.h>
#include <linux/device.h>
-#include <linux/file.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
-#include <linux/kthread.h>
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+__must_check
+static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
+struct fsck_thread {
+ struct thread_with_stdio thr;
+ struct bch_fs *c;
+ char **devs;
+ size_t nr_devs;
+ struct bch_opts opts;
+};
+
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
+ if (thr->devs)
+ for (size_t i = 0; i < thr->nr_devs; i++)
+ kfree(thr->devs[i]);
+ kfree(thr->devs);
+ kfree(thr);
+}
+
+static int bch2_fsck_offline_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+
+ thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
+ if (!thr->thr.thr.ret)
+ bch2_fs_stop(c);
+
+ thread_with_stdio_done(&thr->thr);
+ return 0;
+}
+
+static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+ struct bch_ioctl_fsck_offline arg;
+ struct fsck_thread *thr = NULL;
+ u64 *devs = NULL;
+ long ret = 0;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
+ !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
+ !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->opts = bch2_opts_empty();
+ thr->nr_devs = arg.nr_devs;
+
+ if (copy_from_user(devs, &user_arg->devs[0],
+ array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(thr->devs[i]);
+ if (ret)
+ goto err;
+ }
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(NULL, &thr->opts, optstr);
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+
+ ret = bch2_run_thread_with_stdio(&thr->thr,
+ bch2_fsck_thread_exit,
+ bch2_fsck_offline_thread_fn);
+err:
+ if (ret < 0) {
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ }
+ kfree(devs);
+ return ret;
+}
+
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
@@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
+ case BCH_IOCTL_FSCK_OFFLINE: {
+ ret = bch2_ioctl_fsck_offline(arg);
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
}
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
- if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid)))
- return -EFAULT;
- return 0;
+ return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid));
}
#if 0
@@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
}
struct bch_data_ctx {
+ struct thread_with_file thr;
+
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
-
- int ret;
-
- struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
- struct bch_data_ctx *ctx = arg;
-
- ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+ struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
+ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- kthread_stop(ctx->thread);
- put_task_struct(ctx->thread);
+ bch2_thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
@@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (len < sizeof(e))
return -EINVAL;
- if (copy_to_user(buf, &e, sizeof(e)))
- return -EFAULT;
-
- return sizeof(e);
+ return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = {
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
- struct bch_data_ctx *ctx = NULL;
- struct file *file = NULL;
- unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
- int ret, fd = -1;
+ struct bch_data_ctx *ctx;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -374,36 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx,
- "bch-data/%s", c->name);
- if (IS_ERR(ctx->thread)) {
- ret = PTR_ERR(ctx->thread);
- goto err;
- }
-
- ret = get_unused_fd_flags(flags);
+ ret = bch2_run_thread_with_file(&ctx->thr,
+ &bcachefs_data_ops,
+ bch2_data_thread);
if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- fd_install(fd, file);
-
- get_task_struct(ctx->thread);
- wake_up_process(ctx->thread);
-
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (!IS_ERR_OR_NULL(ctx->thread))
- kthread_stop(ctx->thread);
- kfree(ctx);
+ kfree(ctx);
return ret;
}
@@ -417,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
unsigned i;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
@@ -444,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *src_e =
+ struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */
@@ -474,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (ret)
goto err;
- if (copy_to_user(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes))
- ret = -EFAULT;
+
+ ret = copy_to_user_errcode(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes);
err:
kfree(arg);
return ret;
}
+/* obsolete, didn't allow for new data types: */
static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
@@ -490,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_dev *ca;
unsigned i;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -511,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) {
arg.d[i].buckets = src.d[i].buckets;
@@ -521,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
percpu_ref_put(&ca->ref);
- if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+}
+
+static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
+ struct bch_ioctl_dev_usage_v2 __user *user_arg)
+{
+ struct bch_ioctl_dev_usage_v2 arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- return 0;
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < arg.nr_data_types; i++) {
+ struct bch_ioctl_dev_usage_type t = {
+ .buckets = src.d[i].buckets,
+ .sectors = src.d[i].sectors,
+ .fragmented = src.d[i].fragmented,
+ };
+
+ ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
+ if (ret)
+ goto err;
+ }
+err:
+ percpu_ref_put(&ca->ref);
+ return ret;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -561,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
goto err;
}
- if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb)))
- ret = -EFAULT;
+ ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb));
err:
if (!IS_ERR_OR_NULL(ca))
percpu_ref_put(&ca->ref);
@@ -575,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_ioctl_disk_get_idx arg)
{
dev_t dev = huge_decode_dev(arg.dev);
- struct bch_dev *ca;
- unsigned i;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -584,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
- return i;
+ return ca->dev_idx;
}
return -BCH_ERR_ENOENT_dev_idx_not_found;
@@ -642,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
+static int bch2_fsck_online_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
+
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
+
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ thread_with_stdio_done(&thr->thr);
+
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ return 0;
+}
+
+static long bch2_ioctl_fsck_online(struct bch_fs *c,
+ struct bch_ioctl_fsck_online arg)
+{
+ struct fsck_thread *thr = NULL;
+ long ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bch2_ro_ref_tryget(c))
+ return -EROFS;
+
+ if (down_trylock(&c->online_fsck_mutex)) {
+ bch2_ro_ref_put(c);
+ return -EAGAIN;
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->c = c;
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(c, &thr->opts, optstr);
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_run_thread_with_stdio(&thr->thr,
+ bch2_fsck_thread_exit,
+ bch2_fsck_online_thread_fn);
+err:
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ }
+ return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -663,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return bch2_ioctl_fs_usage(c, arg);
case BCH_IOCTL_DEV_USAGE:
return bch2_ioctl_dev_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE_V2:
+ return bch2_ioctl_dev_usage_v2(c, arg);
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
@@ -675,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
switch (cmd) {
@@ -695,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-
+ case BCH_IOCTL_FSCK_ONLINE:
+ BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 13998388c545..1b8c2c1016dc 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
+static inline void bch2_csum_to_text(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum csum)
+{
+ const u8 *p = (u8 *) &csum;
+ unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
+
+ for (unsigned i = 0; i < bytes; i++)
+ prt_hex_byte(out, p[i]);
+}
+
+static inline void bch2_csum_err_msg(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum expected,
+ struct bch_csum got)
+{
+ prt_printf(out, "checksum error: got ");
+ bch2_csum_to_text(out, type, got);
+ prt_str(out, " should be ");
+ bch2_csum_to_text(out, type, expected);
+ prt_printf(out, " type %s", bch2_csum_types[type]);
+}
+
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f41889093a2c..363644451106 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
- }
+ } while (0);
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 51af8ea230ed..33df8cf86bd8 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
- /*
- * ZSTD is lying: if we allocate the size of the workspace it says it
- * requires, it returns memory allocation errors
- */
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
struct {
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232c9..58c2eb45570f 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+ if (type < BCH_COMPRESSION_TYPE_NR)
+ prt_str(out, __bch2_compression_types[type]);
+ else
+ prt_printf(out, "(invalid compression type %u)", type);
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index e367c625f057..4b340d13caac 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -20,6 +20,7 @@ struct { \
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
+typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
@@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+#define __darray_for_each(_d, _i) \
+ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
#define darray_for_each(_d, _i) \
- for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each_reverse(_d, _i) \
- for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
#define darray_init(_d) \
do { \
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 37d6ecae8c30..4150feca42a2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -267,19 +267,31 @@ restart_drop_extra_replicas:
goto out;
}
+ if (trace_data_update_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
@@ -300,14 +312,14 @@ next:
}
continue;
nowork:
- if (m->stats && m->stats) {
+ if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->stats->sectors_raced);
}
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+ count_event(c, move_extent_fail);
bch2_btree_iter_advance(&iter);
goto next;
@@ -342,7 +354,6 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr) {
if (c->opts.nocow_enabled)
@@ -363,7 +374,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
@@ -404,6 +414,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
continue;
}
+ bch_err_fn_ratelimited(c, ret);
+
if (ret)
return;
@@ -476,7 +488,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_data_update_init(struct btree_trans *trans,
@@ -493,7 +505,6 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
int ret = 0;
@@ -516,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
@@ -639,7 +650,6 @@ done:
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 57c5128db173..7bdba8507fc9 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_bytes(c)),
+ buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_bytes(c));
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_bytes(c)),
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_bytes(c));
+ kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}
@@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations btree_debug_ops = {
@@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
-
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- struct btree_path_level *l = &iter.path->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct btree_path_level *l =
+ &btree_iter_path(trans, &iter)->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations bfloat_failed_debug_ops = {
@@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -650,11 +627,11 @@ restart:
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
.release = bch2_dump_release,
.read = bch2_btree_transactions_read,
};
-#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
@@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
-static int lock_held_stats_open(struct inode *inode, struct file *file)
+static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
@@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
if (!i)
return -ENOMEM;
- i->iter = 0;
+ i->iter = 1;
i->c = c;
i->buf = PRINTBUF;
file->private_data = i;
@@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
return 0;
}
-static int lock_held_stats_release(struct inode *inode, struct file *file)
+static int btree_transaction_stats_release(struct inode *inode, struct file *file)
{
struct dump_iter *i = file->private_data;
@@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
@@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
prt_newline(&i->buf);
+ prt_printf(&i->buf, "Transaction duration:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->duration);
+ printbuf_indent_sub(&i->buf, 2);
+
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
prt_printf(&i->buf, "Lock hold times:");
prt_newline(&i->buf);
@@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
return i->ret;
}
-static const struct file_operations lock_held_stats_op = {
- .owner = THIS_MODULE,
- .open = lock_held_stats_open,
- .release = lock_held_stats_release,
- .read = lock_held_stats_read,
+static const struct file_operations btree_transaction_stats_op = {
+ .owner = THIS_MODULE,
+ .open = btree_transaction_stats_open,
+ .release = btree_transaction_stats_release,
+ .read = btree_transaction_stats_read,
};
static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
@@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -850,7 +835,7 @@ restart:
bch2_check_for_deadlock(trans, &i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
c->btree_debug, &cached_btree_nodes_ops);
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
c->btree_debug, &btree_transactions_ops);
-#endif
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &lock_held_stats_op);
+ c, &btree_transaction_stats_op);
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
c->btree_debug, &btree_deadlock_ops);
@@ -947,8 +930,6 @@ void bch2_debug_exit(void)
int __init bch2_debug_init(void)
{
- int ret = 0;
-
bch_debug = debugfs_create_dir("bcachefs", NULL);
- return ret;
+ return 0;
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 2bfff0da7000..4ae1e9f002a0 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr *r_name = _r;
- return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
+ return !qstr_eq(l_name, *r_name);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr r_name = bch2_dirent_get_name(r);
- return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
+ return !qstr_eq(l_name, r_name);
}
static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
@@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
+int bch2_dirent_create_snapshot(struct btree_trans *trans,
+ u64 dir, u32 snapshot,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
+{
+ subvol_inum zero_inum = { 0 };
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
+
+ dirent->k.p.inode = dir;
+ dirent->k.p.snapshot = snapshot;
+
+ ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ zero_inum, snapshot,
+ &dirent->k_i, str_hash_flags,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
+}
+
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset, int flags)
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
+ dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-retry:
- bch2_trans_begin(trans);
+ struct btree_iter iter = { NULL };
- ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
- name, inum, 0);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
+ int ret = lockrestart_do(trans,
+ __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 1e3431990abd..21ffeb78f02e 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, int);
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
new file mode 100644
index 000000000000..5e116b88e814
--- /dev/null
+++ b/fs/bcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 4d0cb0ccff32..06a7df529b40 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -89,19 +89,14 @@ err:
void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_disk_groups_cpu *g;
- struct bch_dev *ca;
- int i;
- unsigned iter;
-
out->atomic++;
rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
if (!g)
goto out;
- for (i = 0; i < g->nr; i++) {
+ for (unsigned i = 0; i < g->nr; i++) {
if (i)
prt_printf(out, " ");
@@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
}
prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
prt_printf(out, " %s", ca->name);
prt_printf(out, "]");
}
@@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ prt_printf(out, "/dev/%s", ca->name);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 2a77de18c004..d503af270024 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -3,6 +3,7 @@
/* erasure coding */
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
@@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+/* Triggers: */
+
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors);
+ if (ret)
+ goto err;
+
+ if (!deleting) {
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
+ } else {
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ }
+
+ a->v.dirty_sectors += sectors;
+ if (data_type)
+ a->v.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ /* * XXX doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, ptr);
+
+ if (g->dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ bucket_lock(g);
+ old = *g;
+
+ ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors);
+ if (ret)
+ goto err;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, &old, &new);
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_trigger_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s _new,
+ unsigned flags)
+{
+ struct bkey_s_c new = _new.s_c;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, new);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, old);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_ATOMIC) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->ptrs[i] = new_s->ptrs[i];
+
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++) {
+ int ret = mark_stripe_bucket(trans, new, i, flags);
+ if (ret)
+ return ret;
+ }
+
+ int ret = bch2_update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ 0, true);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
struct bkey_s_c k, unsigned *block)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
bkey_for_each_ptr(ptrs, ptr)
@@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret;
- u64 idx;
while (1) {
mutex_lock(&c->ec_stripes_heap_lock);
- idx = stripe_idx_to_delete(c);
+ u64 idx = stripe_idx_to_delete(c);
mutex_unlock(&c->ec_stripes_heap_lock);
if (!idx)
break;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
- ec_stripe_delete(trans, idx));
- if (ret) {
- bch_err_fn(c, ret);
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_delete(trans, idx));
+ bch_err_fn(c, ret);
+ if (ret)
break;
- }
}
- bch2_trans_put(trans);
-
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
@@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
@@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = bch2_trans_do(c, &s->res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
+ bch_err_msg(c, ret, "creating stripe key");
if (ret) {
- bch_err(c, "error creating stripe: error creating stripe key");
goto err;
}
ret = ec_stripe_update_extents(c, &s->new_stripe);
- if (ret) {
- bch_err_msg(c, ret, "creating stripe: error updating pointers");
+ bch_err_msg(c, ret, "error updating extents");
+ if (ret)
goto err;
- }
err:
bch2_disk_reservation_put(c, &s->res);
@@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r)
static unsigned pick_blocksize(struct bch_fs *c,
struct bch_devs_mask *devs)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
struct {
unsigned nr, size;
} cur = { 0, 0 }, best = { 0, 0 };
- for_each_member_device_rcu(ca, c, i, devs)
+ for_each_member_device_rcu(c, ca, devs)
sizes[nr++] = ca->mi.bucket_size;
sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
- for (i = 0; i < nr; i++) {
+ for (unsigned i = 0; i < nr; i++) {
if (sizes[i] != cur.size) {
if (cur.nr > best.nr)
best = cur;
@@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
enum bch_watermark watermark)
{
struct ec_stripe_head *h;
- struct bch_dev *ca;
- unsigned i;
h = kzalloc(sizeof(*h), GFP_KERNEL);
if (!h)
@@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
- __clear_bit(i, h->devs.d);
+ __clear_bit(ca->dev_idx, h->devs.d);
h->blocksize = pick_blocksize(c, &h->devs);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
goto found;
}
@@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_stripe *s;
- struct stripe *m;
- unsigned i;
- int ret;
-
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- s = bkey_s_c_to_stripe(k).v;
-
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- }
- bch2_trans_iter_exit(trans, &iter);
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- bch2_trans_put(trans);
+ struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- if (ret)
- bch_err_fn(c, ret);
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 7d0237c9819f..f4369b02e805 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_stripe, \
- .atomic_trigger = bch2_mark_stripe, \
+ .trigger = bch2_trigger_stripe, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
new file mode 100644
index 000000000000..44ce88ba08d7
--- /dev/null
+++ b/fs/bcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index e2b02a82de32..976426da3a12 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -5,7 +5,7 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
- struct bch_replicas_entry e;
+ struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 9ce29681eec9..8c40c2067a04 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \
@@ -152,7 +151,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
@@ -172,10 +170,12 @@
x(EINVAL, device_size_too_small) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
+ x(EINVAL, opt_parse_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -224,6 +224,8 @@
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
+ x(EIO, sb_not_downgraded) \
+ x(EIO, btree_write_all_failed) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -235,6 +237,7 @@
x(BCH_ERR_nopromote, nopromote_unwritten) \
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem)
enum bch_errcode {
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 25cf78a7b946..d32c8bebe46c 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -2,12 +2,13 @@
#include "bcachefs.h"
#include "error.h"
#include "super.h"
+#include "thread_with_file.h"
#define FSCK_ERR_RATELIMIT_NR 10
bool bch2_inconsistent_error(struct bch_fs *c)
{
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
@@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
void bch2_topology_error(struct bch_fs *c)
{
- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ set_bit(BCH_FS_topology_error, &c->flags);
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
bch2_inconsistent_error(c);
}
@@ -69,40 +70,66 @@ enum ask_yn {
YN_ALLYES,
};
+static enum ask_yn parse_yn_response(char *buf)
+{
+ buf = strim(buf);
+
+ if (strlen(buf) == 1)
+ switch (buf[0]) {
+ case 'n':
+ return YN_NO;
+ case 'y':
+ return YN_YES;
+ case 'N':
+ return YN_ALLNO;
+ case 'Y':
+ return YN_ALLYES;
+ }
+ return -1;
+}
+
#ifdef __KERNEL__
-#define bch2_fsck_ask_yn() YN_NO
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+
+ if (!stdio)
+ return YN_NO;
+
+ char buf[100];
+ int ret;
+
+ do {
+ bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+
+ int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
+ if (r < 0)
+ return YN_NO;
+ buf[r] = '\0';
+ } while ((ret = parse_yn_response(buf)) < 0);
+
+ return ret;
+}
#else
#include "tools-util.h"
-enum ask_yn bch2_fsck_ask_yn(void)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
{
char *buf = NULL;
size_t buflen = 0;
- bool ret;
+ int ret;
- while (true) {
+ do {
fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
fflush(stdout);
if (getline(&buf, &buflen, stdin) < 0)
die("error reading from standard input");
-
- strim(buf);
- if (strlen(buf) != 1)
- continue;
-
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- }
+ } while ((ret = parse_yn_response(buf)) < 0);
free(buf);
return ret;
@@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
struct fsck_err_state *s;
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_error_msgs, list)
@@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c,
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
- if (test_bit(err, c->sb.errors_silent))
+ if ((flags & FSCK_CAN_FIX) &&
+ test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
bch2_sb_error_count(c, err);
@@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
@@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c,
int ask;
prt_str(out, ": fix?");
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn();
+ ask = bch2_fsck_ask_yn(c);
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
@@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
- if (print)
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (print) {
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s\n", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ }
- if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
(ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore))
bch_err(c, "Unable to continue, halting");
@@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_inconsistent_error(c);
if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
} else {
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
return ret;
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 21af6fb8cecf..b9033bb4f11c 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 9d8afcb5979a..61395b113df9 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -843,7 +844,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->dev == dev)
@@ -855,7 +855,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
@@ -1020,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type],
- bch2_compression_types[crc.compression_type]);
+ bch2_csum_types[crc.csum_type]);
+ bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1065,7 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr2;
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
@@ -1307,7 +1305,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
}
incompressible:
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -1338,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- unsigned target, unsigned compression)
+ struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
+ unsigned target = opts->background_target;
+ unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index a2ce8a3be13c..6bf839d69e84 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
bkey_extent_entry_for_each_from(_p, _entry, _p.start)
#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for ((_ptr) = (_start); \
+ for (typeof(_start) (_ptr) = (_start); \
((_ptr) = __bkey_ptr_next(_ptr, _end)); \
(_ptr)++)
@@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
@@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
.min_val_size = 40, \
})
@@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
/* KEY_TYPE_reservation: */
@@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
- .trans_trigger = bch2_trans_mark_reservation, \
- .atomic_trigger = bch2_mark_reservation, \
+ .trigger = bch2_trigger_reservation, \
.min_val_size = 8, \
})
@@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->unwritten)
@@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (!ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -716,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- unsigned, unsigned);
+ struct bch_io_opts *);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
new file mode 100644
index 000000000000..3bd2fdbb0817
--- /dev/null
+++ b/fs/bcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* * Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 05429c9631cd..b04750dbf870 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
@@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
- void *_base = (base); \
- void *_search = (search); \
- size_t _nr = (nr); \
- size_t _size = (size); \
- size_t _i = 0; \
+ void *_base = (base); \
+ const void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
int _res; \
\
while (_i < _nr && \
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 4496cf91a4c1..1c1ea0f0c692 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
}
inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
@@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
- }
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
@@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
}
if (mode == BCH_RENAME_OVERWRITE) {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 52f0e7acda3d..73c12e565af5 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -52,26 +52,20 @@ struct readpages_iter {
static int readpages_iter_init(struct readpages_iter *iter,
struct readahead_control *ractl)
{
- struct folio **fi;
- int ret;
-
- memset(iter, 0, sizeof(*iter));
+ struct folio *folio;
- iter->mapping = ractl->mapping;
+ *iter = (struct readpages_iter) { ractl->mapping };
- ret = bch2_filemap_get_contig_folios_d(iter->mapping,
- ractl->_index << PAGE_SHIFT,
- (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
- 0, mapping_gfp_mask(iter->mapping),
- &iter->folios);
- if (ret)
- return ret;
+ while ((folio = __readahead_folio(ractl))) {
+ if (!bch2_folio_create(folio, GFP_KERNEL) ||
+ darray_push(&iter->folios, folio)) {
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+ return iter->folios.nr ? 0 : -ENOMEM;
+ }
- darray_for_each(iter->folios, fi) {
- ractl->_nr_pages -= 1U << folio_order(*fi);
- __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
- folio_put(*fi);
- folio_put(*fi);
+ folio_put(folio);
}
return 0;
@@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl)
struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
- int ret;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- ret = readpages_iter_init(&readpages_iter, ractl);
- BUG_ON(ret);
+ int ret = readpages_iter_init(&readpages_iter, ractl);
+ if (ret)
+ return;
bch2_pagecache_add_get(inode);
@@ -638,7 +632,7 @@ do_io:
/* Check for writing past i_size: */
WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ !test_bit(BCH_FS_emergency_ro, &c->flags),
"writing past i_size: %llu > %llu (unrounded %llu)\n",
bio_end_sector(&w->io->op.wbio.bio) << 9,
round_up(i_size, block_bytes(c)),
@@ -826,7 +820,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
folios fs;
- struct folio **fi, *f;
+ struct folio *f;
unsigned copied = 0, f_offset, f_copied;
u64 end = pos + len, f_pos, f_len;
loff_t last_folio_pos = inode->v.i_size;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 84e20c3ada6c..e3b219e19e10 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -77,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- if ((offset|iter->count) & (block_bytes(c) - 1))
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
return -EINVAL;
ret = min_t(loff_t, iter->count,
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index ff664fd0d8ef..d359aa9b33b8 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
- unsigned i, j;
+ int ret = 0;
- if (end <= start)
- return;
+ if (end <= *start)
+ return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
BUG_ON(end <= folio_start);
- folio_lock(folio);
- s = bch2_folio(folio);
+ *start = min(end, folio_end);
+ struct bch_folio *s = bch2_folio(folio);
if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 27f712ae37a6..8cbaba6565b4 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index b0e8144ec550..8c70123b6a0c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
continue;
bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_FLUSH,
+ REQ_OP_WRITE|REQ_PREFLUSH,
GFP_KERNEL,
&c->nocow_flush_bioset),
struct nocow_flush, bio);
@@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, ret2, ret3;
+ int ret;
ret = file_write_and_wait_range(file, start, end);
- ret2 = sync_inode_metadata(&inode->v, 1);
- ret3 = bch2_flush_inode(c, inode);
-
- return bch2_err_class(ret ?: ret2 ?: ret3);
+ if (ret)
+ goto out;
+ ret = sync_inode_metadata(&inode->v, 1);
+ if (ret)
+ goto out;
+ ret = bch2_flush_inode(c, inode);
+out:
+ return bch2_err_class(ret);
}
/* truncate: */
@@ -671,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(trans,
- (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true))
+ drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -861,7 +868,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ lock_two_nondirectories(&src->v, &dst->v);
+ bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@@ -914,7 +922,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+ unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 94e5a567fa44..3a4c24c28e7f 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
bch_notice(c, "shutdown by ioctl type %u", flags);
- down_write(&c->vfs_sb->s_umount);
-
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
ret = bdev_freeze(c->vfs_sb->s_bdev);
if (ret)
- goto err;
-
+ break;
bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
bdev_thaw(c->vfs_sb->s_bdev);
break;
-
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
-
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
break;
default:
ret = -EINVAL;
break;
}
-err:
- up_write(&c->vfs_sb->s_umount);
+
return ret;
}
@@ -345,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
create_flags |= BCH_CREATE_SNAPSHOT_RO;
- /* why do we need this lock? */
- down_read(&c->vfs_sb->s_umount);
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+ /* sync_inodes_sb enforce s_umount is locked */
+ down_read(&c->vfs_sb->s_umount);
sync_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
+ }
retry:
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
@@ -433,8 +426,6 @@ err2:
goto retry;
}
err1:
- up_read(&c->vfs_sb->s_umount);
-
return error;
}
@@ -451,33 +442,36 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
+ const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
struct path path;
struct inode *dir;
+ struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
- ret = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- LOOKUP_FOLLOW, &path);
- if (ret)
- return ret;
+ victim = user_path_locked_at(arg.dirfd, name, &path);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
- if (path.dentry->d_sb->s_fs_info != c) {
+ if (victim->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
-
- dir = path.dentry->d_parent->d_inode;
-
- ret = __bch2_unlink(dir, path.dentry, true);
- if (ret)
+ if (!d_is_positive(victim)) {
+ ret = -ENOENT;
goto err;
-
- fsnotify_rmdir(dir, path.dentry);
- d_delete(path.dentry);
+ }
+ dir = d_inode(path.dentry);
+ ret = __bch2_unlink(dir, victim, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, victim);
+ d_delete(victim);
+ }
+ inode_unlock(dir);
err:
+ dput(victim);
path_put(&path);
return ret;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index c1895df1bffe..ec419b8e2c43 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
@@ -729,7 +729,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1012,15 +1012,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
if (!dir_emit_dots(file, ctx))
return 0;
- ret = bch2_readdir(c, inode_inum(inode), ctx);
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ bch_err_fn(c, ret);
return bch2_err_class(ret);
}
@@ -1500,7 +1498,7 @@ static void bch2_evict_inode(struct inode *vinode)
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
- struct bch_inode_info *inode, **i;
+ struct bch_inode_info *inode;
DARRAY(struct bch_inode_info *) grabbed;
bool clean_pass = false, this_pass_clean;
@@ -1626,43 +1624,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static char **split_devs(const char *_dev_name, unsigned *nr)
-{
- char *dev_name = NULL, **devs = NULL, *s;
- size_t i = 0, nr_devs = 0;
-
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return NULL;
-
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
-
- devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
- if (!devs) {
- kfree(dev_name);
- return NULL;
- }
-
- while ((s = strsep(&dev_name, ":")))
- devs[i++] = s;
-
- *nr = nr_devs;
- return devs;
-}
-
static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_opts opts = bch2_opts_empty();
int ret;
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
goto err;
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
@@ -1696,11 +1669,9 @@ err:
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- struct bch_dev *ca;
- unsigned i;
bool first = true;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
@@ -1770,7 +1741,7 @@ static int bch2_unfreeze(struct super_block *sb)
struct bch_fs *c = sb->s_fs_info;
int ret;
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ if (test_bit(BCH_FS_emergency_ro, &c->flags))
return 0;
down_write(&c->state_lock);
@@ -1805,17 +1776,18 @@ static int bch2_noset_super(struct super_block *s, void *data)
return -EBUSY;
}
+typedef DARRAY(struct bch_fs *) darray_fs;
+
static int bch2_test_super(struct super_block *s, void *data)
{
struct bch_fs *c = s->s_fs_info;
- struct bch_fs **devs = data;
- unsigned i;
+ darray_fs *d = data;
if (!c)
return false;
- for (i = 0; devs[i]; i++)
- if (c != devs[i])
+ darray_for_each(*d, i)
+ if (c != *i)
return false;
return true;
}
@@ -1824,13 +1796,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
struct bch_fs *c;
- struct bch_dev *ca;
struct super_block *sb;
struct inode *vinode;
struct bch_opts opts = bch2_opts_empty();
- char **devs;
- struct bch_fs **devs_to_fs = NULL;
- unsigned i, nr_devs;
int ret;
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
@@ -1842,25 +1810,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
- devs = split_devs(dev_name, &nr_devs);
- if (!devs)
- return ERR_PTR(-ENOMEM);
+ darray_str devs;
+ ret = bch2_split_devs(dev_name, &devs);
+ if (ret)
+ return ERR_PTR(ret);
- devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
- if (!devs_to_fs) {
- sb = ERR_PTR(-ENOMEM);
- goto got_sb;
+ darray_fs devs_to_fs = {};
+ darray_for_each(devs, i) {
+ ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
+ if (ret) {
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
}
- for (i = 0; i < nr_devs; i++)
- devs_to_fs[i] = bch2_path_to_fs(devs[i]);
-
- sb = sget(fs_type, bch2_test_super, bch2_noset_super,
- flags|SB_NOSEC, devs_to_fs);
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;
- c = bch2_fs_open(devs, nr_devs, opts);
+ c = bch2_fs_open(devs.data, devs.nr, opts);
if (IS_ERR(c)) {
sb = ERR_CAST(c);
goto got_sb;
@@ -1880,9 +1848,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (IS_ERR(sb))
bch2_fs_stop(c);
got_sb:
- kfree(devs_to_fs);
- kfree(devs[0]);
- kfree(devs);
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
@@ -1923,7 +1890,7 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct block_device *bdev = ca->disk_sb.bdev;
/* XXX: create an anonymous device for multi device filesystems */
@@ -1944,10 +1911,9 @@ got_sb:
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
- if (ret) {
- bch_err_msg(c, ret, "mounting: error getting root inode");
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ if (ret)
goto err_put_super;
- }
sb->s_root = d_make_root(vinode);
if (!sb->s_root) {
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 5edf1d4b9e6b..c3af7225ff69 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
}
enum bch_inode_lock_op {
- INODE_LOCK = (1U << 0),
- INODE_PAGECACHE_BLOCK = (1U << 1),
- INODE_UPDATE_LOCK = (1U << 2),
+ INODE_PAGECACHE_BLOCK = (1U << 0),
+ INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
@@ -91,8 +90,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
@@ -109,8 +106,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index e0c5cd119acc..6a760777bafb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -20,8 +20,6 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -29,19 +27,16 @@
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 sectors = 0;
- int ret;
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
- 0, k, ret)
+ 0, k, ({
if (bkey_extent_is_allocation(k.k))
sectors += k.k->size;
-
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: sectors;
}
@@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
u64 subdirs = 0;
- int ret;
-
- for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
- d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_DIR)
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ({
+ if (k.k->type == KEY_TYPE_dirent &&
+ bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
subdirs++;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: subdirs;
}
-static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
- u32 *subvol)
-{
- struct bch_snapshot s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
- POS(0, snapshot), 0,
- snapshot, &s);
- if (!ret)
- *subvol = le32_to_cpu(s.subvol);
- else if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", snapshot);
- return ret;
-
-}
-
-static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
int ret;
@@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
-}
-
static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
@@ -132,7 +99,7 @@ err:
return ret;
}
-static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
@@ -152,29 +119,19 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
-}
-
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type)
+ u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c_dirent d;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0);
+ int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
if (ret)
return ret;
@@ -207,12 +164,9 @@ static int fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __write_inode(trans, inode, snapshot));
- if (ret)
- bch_err_fn(trans->c, ret);
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
return ret;
}
@@ -242,35 +196,44 @@ err:
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root;
- struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- subvol_inum root_inum = { .subvol = subvol };
u64 inum = 0;
unsigned d_type = 0;
- u32 snapshot;
int ret;
- ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ struct bch_snapshot_tree st;
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, snapshot), &st);
+ if (ret)
+ return ret;
+
+ subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+ u32 subvol_snapshot;
+
+ ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
+ &subvol_snapshot, &root_inum.inum);
+ bch_err_msg(c, ret, "looking up root subvol");
if (ret)
return ret;
- ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ struct bch_inode_unpacked root_inode;
+ struct bch_hash_info root_hash_info;
+ u32 root_inode_snapshot = snapshot;
+ ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
+ bch_err_msg(c, ret, "looking up root inode");
if (ret)
return ret;
- root_hash_info = bch2_hash_info_init(c, &root);
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
- ret = __lookup_dirent(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type);
- if (bch2_err_matches(ret, ENOENT)) {
- bch_notice(c, "creating lost+found");
+ ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type, snapshot);
+ if (bch2_err_matches(ret, ENOENT))
goto create_lostfound;
- }
bch_err_fn(c, ret);
if (ret)
@@ -285,20 +248,53 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- return __lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+ inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+ return ret;
create_lostfound:
+ /*
+ * XXX: we could have a nicer log message here if we had a nice way to
+ * walk backpointers to print a path
+ */
+ bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+
+ u64 now = bch2_current_time(c);
+ struct btree_iter lostfound_iter = { NULL };
+ u64 cpu = raw_smp_processor_id();
+
bch2_inode_init_early(c, lostfound);
+ bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ lostfound->bi_dir = root_inode.bi_inum;
+
+ root_inode.bi_nlink++;
+
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
+ if (ret)
+ goto err;
- ret = bch2_create_trans(trans, root_inum, &root,
- lostfound, &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL,
- (subvol_inum) { }, 0);
+ bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(&lostfound_iter);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create_snapshot(trans,
+ root_inode.bi_inum, snapshot, &root_hash_info,
+ mode_to_type(lostfound->bi_mode),
+ &lostfound_str,
+ lostfound->bi_inum,
+ &lostfound->bi_dir_offset,
+ BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
bch_err_msg(c, ret, "creating lost+found");
+ bch2_trans_iter_exit(trans, &lostfound_iter);
return ret;
}
-static int __reattach_inode(struct btree_trans *trans,
+static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
@@ -307,14 +303,9 @@ static int __reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
- u32 subvol;
int ret;
- ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
- if (ret)
- return ret;
-
- ret = lookup_lostfound(trans, subvol, &lostfound);
+ ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
if (ret)
return ret;
@@ -331,15 +322,12 @@ static int __reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
- ret = bch2_dirent_create(trans,
- (subvol_inum) {
- .subvol = subvol,
- .inum = lostfound.bi_inum,
- },
- &dir_hash,
- inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ ret = bch2_dirent_create_snapshot(trans,
+ lostfound.bi_inum, inode_snapshot,
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -349,18 +337,6 @@ static int __reattach_inode(struct btree_trans *trans,
return __write_inode(trans, inode, inode_snapshot);
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 inode_snapshot)
-{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- __reattach_inode(trans, inode, inode_snapshot));
- bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
- return ret;
-}
-
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
@@ -405,7 +381,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
};
int ret = 0;
- darray_for_each(s->ids, i) {
+ __darray_for_each(s->ids, i) {
if (i->id == id)
return 0;
if (i->id > id)
@@ -422,7 +398,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry *i, n = {
+ struct snapshots_seen_entry n = {
.id = pos.snapshot,
.equiv = bch2_snapshot_equiv(c, pos.snapshot),
};
@@ -448,7 +424,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
@@ -593,14 +569,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- u32 restart_count = trans->restart_count;
int ret;
w->recalculate_sums = false;
w->inodes.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -613,8 +588,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
-
- return trans_was_restarted(trans, restart_count);
+ return 0;
}
static struct inode_walker_entry *
@@ -625,7 +599,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
snapshot = bch2_snapshot_equiv(c, snapshot);
- darray_for_each(w->inodes, i)
+ __darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -667,11 +641,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
} else if (bkey_cmp(w->last_pos, pos)) {
- struct inode_walker_entry *i;
-
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
-
}
w->last_pos = pos;
@@ -756,9 +727,7 @@ static int hash_redo_key(struct btree_trans *trans,
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static int hash_check_key(struct btree_trans *trans,
@@ -826,6 +795,18 @@ fsck_err:
goto out;
}
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return k.k->type == KEY_TYPE_set;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -867,7 +848,7 @@ static int check_inode(struct btree_trans *trans,
c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ return -BCH_ERR_fsck_repair_unimplemented;
}
if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
@@ -890,14 +871,22 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
+ if (u.bi_flags & BCH_INODE_unlinked) {
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
+ ret = 0;
+ }
+
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode");
return ret;
@@ -910,9 +899,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
@@ -976,27 +962,22 @@ fsck_err:
return ret;
}
-noinline_for_stack
int bch2_check_inodes(struct bch_fs *c)
{
bool full = c->opts.fsck;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
struct snapshots_seen s;
- struct bkey_s_c k;
- int ret;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_inode(trans, &iter, k, &prev, &s, full));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_inode(trans, &iter, k, &prev, &s, full)));
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
@@ -1023,29 +1004,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
: le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
-
- d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- ret = bkey_err(d);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- ret = dirent_points_to_inode(d, inode);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1094,11 +1055,8 @@ struct extent_ends {
static void extent_ends_reset(struct extent_ends *extent_ends)
{
- struct extent_end *i;
-
darray_for_each(extent_ends->e, i)
snapshots_seen_exit(&i->seen);
-
extent_ends->e.nr = 0;
}
@@ -1130,7 +1088,7 @@ static int extent_ends_at(struct bch_fs *c,
if (!n.seen.ids.data)
return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
- darray_for_each(extent_ends->e, i) {
+ __darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
snapshots_seen_exit(&i->seen);
*i = n;
@@ -1220,13 +1178,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
swap(k1, k2);
}
- trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+ trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
@@ -1270,7 +1227,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
bool *fixed)
{
struct bch_fs *c = trans->c;
- struct extent_end *i;
int ret = 0;
/* transaction restart, running again */
@@ -1451,32 +1407,28 @@ int bch2_check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct snapshots_seen s;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct extent_ends extent_ends;
struct disk_reservation res = { 0 };
- int ret = 0;
snapshots_seen_init(&s);
extent_ends_init(&extent_ends);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors(trans, &w);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
inode_walker_exit(&w);
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
@@ -1484,24 +1436,19 @@ int bch2_check_extents(struct bch_fs *c)
int bch2_check_indirect_extents(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct disk_reservation res = { 0 };
- int ret = 0;
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_PREFETCH, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- }));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ })));
bch2_disk_reservation_put(c, &res);
- bch2_trans_put(trans);
-
bch_err_fn(c, ret);
return ret;
}
@@ -1509,7 +1456,6 @@ int bch2_check_indirect_extents(struct bch_fs *c)
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1553,8 +1499,8 @@ static int check_dirent_target(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
- bool backpointer_exists = true;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (!target->bi_dir &&
@@ -1568,25 +1514,37 @@ static int check_dirent_target(struct btree_trans *trans,
}
if (!inode_points_to_dirent(target, d)) {
- ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
- if (ret < 0)
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- backpointer_exists = ret;
+ bool backpointer_exists = !ret;
ret = 0;
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
c, inode_dir_multiple_links,
- "directory %llu with multiple links",
- target->bi_inum)) {
+ "directory %llu:%u with multiple links\n%s",
+ target->bi_inum, target_snapshot, buf.buf)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
c, inode_multiple_links_but_nlink_0,
- "inode %llu type %s has multiple links but i_nlink 0",
- target->bi_inum, bch2_d_types[d.v->d_type])) {
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_unlinked;
@@ -1636,13 +1594,12 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
- if (d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
- (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
- fsck_err(c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol))) {
+ if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
+ c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol)) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -1660,6 +1617,7 @@ static int check_dirent_target(struct btree_trans *trans,
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -1701,7 +1659,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- BUG_ON(!iter->path->should_be_locked);
+ BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
ret = PTR_ERR_OR_ZERO(i);
@@ -1754,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
u32 target_snapshot;
u64 target_inum;
- ret = __subvol_lookup(trans, target_subvol,
+ ret = subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1766,7 +1724,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- ret = __lookup_inode(trans, target_inum,
+ ret = lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1842,22 +1800,18 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
@@ -1908,8 +1862,6 @@ int bch2_check_xattrs(struct bch_fs *c)
{
struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
- struct btree_iter iter;
- struct bkey_s_c k;
int ret = 0;
ret = bch2_trans_run(c,
@@ -1918,7 +1870,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret);
return ret;
@@ -1932,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans)
u64 inum;
int ret;
- ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1948,18 +1900,13 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
- &root_subvol.k_i, 0));
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
-
}
- ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1983,11 +1930,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret;
-
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2002,13 +1945,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf;
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
- struct pathbuf_entry *i;
-
darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
-
return false;
}
@@ -2057,10 +1997,10 @@ static int check_path(struct btree_trans *trans,
break;
}
- ret = lockrestart_do(trans,
- PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot))).k));
+ d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot));
+ ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
@@ -2097,13 +2037,12 @@ static int check_path(struct btree_trans *trans,
ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
if (ret) {
/* Should have been caught in dirents pass */
- bch_err(c, "error looking up parent directory: %i", ret);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error looking up parent directory: %i", ret);
break;
}
if (path_is_dup(p, inode->bi_inum, snapshot)) {
- struct pathbuf_entry *i;
-
/* XXX print path */
bch_err(c, "directory structure loop");
@@ -2111,20 +2050,19 @@ static int check_path(struct btree_trans *trans,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
- if (!fsck_err(c, dir_loop,
- "directory structure loop"))
+ if (!fsck_err(c, dir_loop, "directory structure loop"))
return 0;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- remove_backpointer(trans, inode));
- if (ret) {
- bch_err(c, "error removing dirent: %i", ret);
+ ret = remove_backpointer(trans, inode);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
break;
- }
ret = reattach_inode(trans, inode, snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ break;
}
}
fsck_err:
@@ -2139,37 +2077,28 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- ret = bch2_inode_unpack(k, &u);
- if (ret) {
- /* Should have been caught earlier in fsck: */
- bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
- break;
- }
+ BUG_ON(bch2_inode_unpack(k, &u));
- if (u.bi_flags & BCH_INODE_unlinked)
- continue;
+ if (u.bi_flags & BCH_INODE_unlinked)
+ continue;
- ret = check_path(trans, &path, &u, iter.pos.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ check_path(trans, &path, &u, iter.pos.snapshot);
+ })));
darray_exit(&path);
+
bch_err_fn(c, ret);
return ret;
}
@@ -2255,47 +2184,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
struct nlink_table *t,
u64 start, u64 *end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked u;
- int ret = 0;
-
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
-
- /* Should never fail, checked by bch2_inode_invalid: */
- BUG_ON(bch2_inode_unpack(k, &u));
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- if (!u.bi_nlink)
- continue;
+ /* Should never fail, checked by bch2_inode_invalid: */
+ struct bch_inode_unpacked u;
+ BUG_ON(bch2_inode_unpack(k, &u));
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ if (!u.bi_nlink)
+ continue;
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
@@ -2303,42 +2224,34 @@ noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct snapshots_seen s;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- int ret;
snapshots_seen_init(&s);
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- switch (k.k->type) {
- case KEY_TYPE_dirent:
- d = bkey_s_c_to_dirent(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
+ if (ret)
+ break;
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
+ if (k.k->type == KEY_TYPE_dirent) {
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
+ }
+ 0;
+ })));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
+
+ bch_err_fn(c, ret);
return ret;
}
@@ -2389,19 +2302,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
return ret;
}
@@ -2447,7 +2357,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
{
struct bkey_s_c_reflink_p p;
struct bkey_i_reflink_p *u;
- int ret;
if (k.k->type != KEY_TYPE_reflink_p)
return 0;
@@ -2458,7 +2367,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
return 0;
u = bch2_trans_kmalloc(trans, sizeof(*u));
- ret = PTR_ERR_OR_ZERO(u);
+ int ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
@@ -2471,19 +2380,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
int bch2_fix_reflink_p(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
return 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 9309cfeecd8d..086f0090b03a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode=%o ", inode->bi_mode);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o", inode->bi_mode);
+ prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
+ prt_newline(out);
- prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
- inode->bi_journal_seq,
- inode->bi_size,
- inode->bi_sectors,
- inode->bi_version);
+ prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+ prt_newline(out);
+
+ prt_printf(out, "bi_size=%llu", inode->bi_size);
+ prt_newline(out);
+
+ prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "bi_version=%llu", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ prt_printf(out, #_name "=%llu", (u64) inode->_name); \
+ prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
+ printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -561,64 +572,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
return bkey_inode_flags(k) & BCH_INODE_unlinked;
}
-int bch2_trans_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
{
- int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
- bool old_deleted = bkey_is_deleted_inode(old);
- bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+ s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if (nr) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- struct replicas_delta_list *d = trans->fs_usage_deltas;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (nr) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- if (ret)
- return ret;
+ trans->fs_usage_deltas->nr_inodes += nr;
+ }
- d->nr_inodes += nr;
+ bool old_deleted = bkey_is_deleted_inode(old);
+ bool new_deleted = bkey_is_deleted_inode(new.s_c);
+ if (old_deleted != new_deleted) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ if (ret)
+ return ret;
+ }
}
- if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
- if (ret)
- return ret;
- }
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ BUG_ON(!trans->journal_res.seq);
- return 0;
-}
-
-int bch2_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- u64 journal_seq = trans->journal_res.seq;
-
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
-
- BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_inode_v3);
-
- v->bi_journal_seq = cpu_to_le64(journal_seq);
+ bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
if (flags & BTREE_TRIGGER_GC) {
- percpu_down_read(&c->mark_lock);
- preempt_disable();
+ struct bch_fs *c = trans->c;
- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
- fs_usage->nr_inodes += bkey_is_inode(new.k);
- fs_usage->nr_inodes -= bkey_is_inode(old.k);
-
- preempt_enable();
+ percpu_down_read(&c->mark_lock);
+ this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}
+
return 0;
}
@@ -831,7 +824,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
@@ -894,7 +887,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1058,7 +1051,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1155,51 +1148,48 @@ delete:
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
bool need_another_pass;
int ret;
again:
need_another_pass = false;
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
* but we can't retry because the btree write buffer won't have been
* flushed and we'd spin:
*/
- for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
- if (ret < 0)
- break;
-
- if (ret) {
- if (!test_bit(BCH_FS_RW, &c->flags)) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
- }
-
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ if (ret > 0) {
bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
+ /*
+ * We don't want to loop here: a transaction restart
+ * error here means we handled a transaction restart and
+ * we're actually done, but if we loop we'll retry the
+ * same key because the write buffer hasn't been flushed
+ * yet
+ */
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
}
- }
- bch2_trans_iter_exit(trans, &iter);
- if (!ret && need_another_pass)
+ ret;
+ }));
+
+ if (!ret && need_another_pass) {
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
goto again;
+ }
err:
bch2_trans_put(trans);
-
return ret;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 88818a332b1e..b63f312581cf 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 32, \
})
#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v3_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 48, \
})
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
new file mode 100644
index 000000000000..83d107331edf
--- /dev/null
+++ b/fs/bcachefs/inode_format.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bebc11444ef5..1baf78594cca 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct open_buckets open_buckets = { 0 };
struct bkey_s_c k;
struct bkey_buf old, new;
- unsigned sectors_allocated = 0;
- bool have_reservation = false;
+ unsigned sectors_allocated = 0, new_replicas;
bool unwritten = opts.nocow &&
c->sb.version >= bcachefs_metadata_version_unwritten_extents;
int ret;
@@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans,
return ret;
sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+ new_replicas = max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- if (!have_reservation) {
- unsigned new_replicas =
- max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err;
-
- bch2_bkey_buf_reassemble(&old, c, k);
- }
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err_noprint;
- if (have_reservation) {
- if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
- goto err;
+ bch2_bkey_buf_reassemble(&old, c, k);
- bch2_key_resize(&new.k->k, sectors);
- } else if (!unwritten) {
+ if (!unwritten) {
struct bkey_i_reservation *reservation;
bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
@@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct bkey_i_extent *e;
struct bch_devs_list devs_have;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
devs_have.nr = 0;
@@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans,
ptr->unwritten = true;
}
- have_reservation = true;
-
ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
0, i_sectors_delta, true);
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
-
+ if (should_print_err(ret))
+ bch_err_inum_offset_ratelimited(c,
+ inum.inum,
+ iter->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
+err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c);
@@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size));
if (ret)
goto err;
@@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
@@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i));
}
@@ -449,13 +442,11 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_disk_reservation_put(c, &disk_res);
@@ -470,12 +461,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i));
}
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 36763865facd..3c574d8873a1 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -80,7 +80,7 @@ struct promote_op {
struct bpos pos;
struct data_update write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
+ struct bio_vec bi_inline_vecs[]; /* must be last */
};
static const struct rhashtable_params bch_promote_params = {
@@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
int ret;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return NULL;
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
- if (!op)
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!op) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
op->start_time = local_clock();
op->pos = pos;
@@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
*/
*rbio = kzalloc(sizeof(struct bch_read_bio) +
sizeof(struct bio_vec) * pages,
- GFP_NOFS);
- if (!*rbio)
+ GFP_KERNEL);
+ if (!*rbio) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
rbio_init(&(*rbio)->bio, opts);
bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
- GFP_NOFS))
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
(*rbio)->bounce = true;
(*rbio)->split = true;
(*rbio)->kmalloc = true;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params))
+ bch_promote_params)) {
+ ret = -BCH_ERR_nopromote_in_flight;
goto err;
+ }
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
@@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret) {
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
goto err;
}
@@ -239,7 +245,7 @@ err:
*rbio = NULL;
kfree(op);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- return NULL;
+ return ERR_PTR(ret);
}
noinline
@@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, opts, sectors, rbio);
- if (!promote) {
- ret = -BCH_ERR_nopromote_enomem;
+ ret = PTR_ERR_OR_ZERO(promote);
+ if (ret)
goto nopromote;
- }
*bounce = true;
*read_full = promote_full;
@@ -526,7 +531,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio));
}
@@ -637,12 +642,17 @@ csum_err:
goto out;
}
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
bch_err_inum_offset_ratelimited(ca,
rbio->read_pos.inode,
rbio->read_pos.offset << 9,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ "data %s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 8c8cb1541ac9..ef3a53f9045a 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -396,17 +394,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
bool nocow)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
- struct bch_dev *ca;
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
+ BUG_ON(!bch2_dev_exists2(c, ptr->dev));
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
@@ -1109,16 +1104,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- const struct bch_extent_ptr *ptr;
- struct bkey_i *k;
for_each_keylist_key(&op->insert_keys, k) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
bkey_for_each_ptr(ptrs, ptr)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
}
}
@@ -1128,25 +1121,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct bkey_s_c k,
u64 new_i_size)
{
- struct bkey_i *new;
- struct bkey_ptrs ptrs;
- struct bch_extent_ptr *ptr;
- int ret;
-
if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
/* trace this */
return 0;
}
- new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
bch2_cut_front(bkey_start_pos(&orig->k), new);
bch2_cut_back(orig->k.p, new);
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr)
ptr->unwritten = 0;
@@ -1167,16 +1155,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i *orig;
- struct bkey_s_c k;
- int ret;
for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1228,10 +1212,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- struct bucket_to_lock *i;
u32 snapshot;
struct bucket_to_lock *stale_at;
int ret;
@@ -1273,7 +1254,7 @@ retry:
break;
/* Get iorefs before dropping btree locks: */
- ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
struct bpos b = PTR_BUCKET_POS(c, ptr);
struct nocow_lock_bucket *l =
@@ -1464,6 +1445,11 @@ err:
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8cf238be6213..bc890776eb57 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -10,6 +10,7 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
@@ -26,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "size:");
+ prt_tab(out);
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+}
+
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -155,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -184,6 +226,18 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
+
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
+
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
@@ -222,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@@ -236,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@@ -330,6 +384,7 @@ static int journal_entry_open(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
+ buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -363,11 +418,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -407,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@@ -464,18 +514,23 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
- if (ret == JOURNAL_ERR_max_in_flight)
- trace_and_count(c, journal_entry_full, c);
-unlock:
- if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
- !j->res_get_blocked_start) {
- j->res_get_blocked_start = local_clock() ?: 1;
- trace_and_count(c, journal_full, c);
- }
+ if (ret == JOURNAL_ERR_max_in_flight) {
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, true);
+ if (trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ count_event(c, journal_entry_full);
+ }
+unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -553,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -610,7 +665,7 @@ recheck_need_open:
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@@ -774,6 +829,48 @@ void bch2_journal_block(struct journal *j)
journal_quiesce(j);
}
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret = NULL;
+
+ mutex_lock(&j->buf_lock);
+ spin_lock(&j->lock);
+ max_seq = min(max_seq, journal_cur_seq(j));
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= max_seq;
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + idx;
+
+ if (buf->need_flush_to_write_buffer) {
+ if (seq == journal_cur_seq(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+
+ union journal_res_state s;
+ s.v = atomic64_read_acquire(&j->reservations.counter);
+
+ ret = journal_state_count(s, idx)
+ ? ERR_PTR(-EAGAIN)
+ : buf;
+ break;
+ }
+ }
+
+ spin_unlock(&j->lock);
+ if (IS_ERR_OR_NULL(ret))
+ mutex_unlock(&j->buf_lock);
+ return ret;
+}
+
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
+ return ret;
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -955,8 +1052,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
unlock:
up_write(&c->state_lock);
return ret;
@@ -986,17 +1082,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
err:
- if (ret)
- bch_err_fn(ca, ret);
+ bch_err_fn(ca, ret);
return ret;
}
int bch2_fs_journal_alloc(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->journal.nr)
continue;
@@ -1225,6 +1317,7 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
unsigned i;
+ mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
@@ -1260,10 +1353,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
- struct bch_dev *ca;
unsigned long now = jiffies;
- u64 seq;
- unsigned i;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@@ -1275,20 +1366,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
@@ -1304,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
-
- for (seq = journal_cur_seq(j);
- seq >= journal_last_unwritten_seq(j);
- --seq) {
- i = seq & JOURNAL_BUF_MASK;
-
- prt_printf(out, "unwritten entry:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
-
- prt_printf(out, "sectors:");
- prt_tab(out);
- prt_printf(out, "%u", j->buf[i].sectors);
- prt_newline(out);
-
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
+ prt_printf(out, "unwritten entries:");
+ prt_newline(out);
+ bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",
@@ -1352,8 +1420,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
@@ -1362,7 +1429,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
prt_printf(out, "\tnr\t\t%u\n", ja->nr);
prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 2f768e11aec9..4544ce24bb8a 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
- closure_wake_up(&j->preres_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
@@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res,
static inline bool journal_entry_empty(struct jset *j)
{
- struct jset_entry *i;
-
if (j->seq != j->last_seq)
return false;
@@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 3eb6c3f62a81..bfd6585e746d 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset)
}};
}
-static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
- return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
- !bch2_crc_cmp(j->csum,
- csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+ if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
+ *csum = (struct bch_csum) {};
+ return false;
+ }
+
+ *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ return !bch2_crc_cmp(j->csum, *csum);
}
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
@@ -678,17 +683,12 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
- if (i < BCH_DATA_NR)
- prt_printf(out, " %s", bch2_data_types[i]);
- else
- prt_printf(out, " (unknown data type %u)", i);
+ bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
-
- prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
@@ -725,6 +725,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -768,7 +784,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
enum bkey_invalid_flags flags)
{
- struct jset_entry *entry;
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
@@ -920,6 +935,7 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
+ struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -952,7 +968,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
- return 0;
+ goto out;
}
j = buf->data;
@@ -969,12 +985,12 @@ reread:
ret = journal_read_buf_realloc(buf,
vstruct_bytes(j));
if (ret)
- return ret;
+ goto err;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- return 0;
+ goto out;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -983,7 +999,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- return ret;
+ goto err;
}
/*
@@ -993,20 +1009,28 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
+ goto out;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- csum_good = jset_csum_good(c, j);
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
+ struct bch_csum csum;
+ csum_good = jset_csum_good(c, j, &csum);
+
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "journal checksum error"))
+ "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf)))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret);
+ "error decrypting journal entry: %s",
+ bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1025,7 +1049,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- return ret;
+ goto err;
}
next_block:
pr_debug("next");
@@ -1034,7 +1058,11 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
- return 0;
+out:
+ ret = 0;
+err:
+ printbuf_exit(&err);
+ return ret;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1156,8 +1184,6 @@ int bch2_journal_read(struct bch_fs *c,
struct journal_list jlist;
struct journal_replay *i, **_i, *prev = NULL;
struct genradix_iter radix_iter;
- struct bch_dev *ca;
- unsigned iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
u64 seq;
@@ -1168,7 +1194,7 @@ int bch2_journal_read(struct bch_fs *c,
jlist.last_seq = 0;
jlist.ret = 0;
- for_each_member_device(ca, c, iter) {
+ for_each_member_device(c, ca) {
if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
@@ -1334,7 +1360,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1505,6 +1531,8 @@ done:
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@@ -1512,6 +1540,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1604,6 +1637,9 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, false);
+
closure_wake_up(&w->wait);
journal_wake(j);
@@ -1656,7 +1692,6 @@ static CLOSURE_CALLBACK(do_journal_write)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_extent_ptr *ptr;
struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
@@ -1700,11 +1735,13 @@ static CLOSURE_CALLBACK(do_journal_write)
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset_entry *start, *end;
struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
- bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@@ -1715,7 +1752,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* If we wanted to be really fancy here, we could sort all the keys in
* the jset and drop keys that were overwritten - probably not worth it:
*/
- vstruct_for_each_safe(jset, i, next) {
+ vstruct_for_each(jset, i) {
unsigned u64s = le16_to_cpu(i->u64s);
/* Empty entry: */
@@ -1732,40 +1769,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
- if (i->type == BCH_JSET_ENTRY_btree_root) {
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ struct bkey_i *k;
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
}
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
}
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+ if (wb.wb)
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ w->need_flush_to_write_buffer = false;
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
- bch2_journal_super_entries_add_common(c, &end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1788,7 +1825,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = le64_to_cpu(jset->seq);
+ j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@@ -1847,7 +1884,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
- w->noflush = true;
+ w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
@@ -1866,12 +1903,11 @@ CLOSURE_CALLBACK(bch2_journal_write)
{
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
- unsigned i, nr_rw_members = 0;
+ unsigned nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1884,12 +1920,16 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
+ mutex_unlock(&j->buf_lock);
if (ret)
goto err;
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
@@ -1927,7 +1967,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
nr_rw_members++;
if (nr_rw_members > 1)
@@ -1944,11 +1984,12 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ec712104addb..820d25e19e5f 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
@@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static inline void journal_set_watermark(struct journal *j, bool low_on_space)
+void bch2_journal_set_watermark(struct journal *j)
{
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (low_on_space)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+ unsigned watermark = low_on_space || low_on_pin || low_on_wb
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+ &j->low_on_space_start, low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+ &j->low_on_pin_start, low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
+ &j->write_buffer_full_start, low_on_wb))
+ trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
if (watermark > j->watermark)
@@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned i, pos, nr_devs = 0;
+ unsigned pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr)
continue;
@@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
- unsigned i, nr_online = 0, nr_devs_want;
+ unsigned nr_online = 0, nr_devs_want;
bool can_discard = false;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
@@ -208,7 +212,7 @@ void bch2_journal_space_available(struct journal *j)
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
- for (i = 0; i < journal_space_nr; i++)
+ for (unsigned i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
clean_ondisk = j->space[journal_space_clean_ondisk].total;
@@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- journal_set_watermark(j, clean * 4 <= total);
+ bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
@@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
void bch2_journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned iter;
mutex_lock(&j->discard_lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
@@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
+ j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
@@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other;
}
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
+ u64 seq = READ_ONCE(src->seq);
+
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return;
}
- pin_list = journal_seq_pin(j, seq);
+ reclaim = __journal_pin_drop(j, dst);
- reclaim = __journal_pin_drop(j, pin);
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
- else
- list_add(&pin->list, &pin_list->flushed);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j,
static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
u64 seq_to_flush = 0;
- unsigned iter;
spin_lock(&j->lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j)
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating journal reclaim thread");
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
j->reclaim_thread = p;
@@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
+ /* time_stats this */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 494d1a6eddb0..ec84c3345281 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
@@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-static inline void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- /* Guard against racing with journal_pin_drop(src): */
- u64 seq = READ_ONCE(src->seq);
-
- if (seq)
- bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index f9d9aa95bf3a..0200e299cfbb 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -267,7 +267,7 @@ retry:
while (!(ret = PTR_ERR_OR_ZERO(b)) &&
b &&
- !test_bit(BCH_FS_STOPPING, &c->flags))
+ !test_bit(BCH_FS_stopping, &c->flags))
b = bch2_btree_iter_next_node(&iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index a756b69582e3..38817c7a0851 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -36,6 +36,7 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
+ bool need_flush_to_write_buffer;
};
/*
@@ -182,6 +183,12 @@ struct journal {
darray_u64 early_journal_entries;
/*
+ * Protects journal_buf->data, when accessing without a jorunal
+ * reservation: for synchronization between the btree write buffer code
+ * and the journal write path:
+ */
+ struct mutex buf_lock;
+ /*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
@@ -195,7 +202,6 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure_waitlist preres_wait;
struct closure io;
struct delayed_work write_work;
@@ -262,15 +268,19 @@ struct journal {
unsigned long last_flush_write;
- u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
+ u64 low_on_space_start;
+ u64 low_on_pin_start;
+ u64 max_in_flight_start;
+ u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 5699cd4873c8..1b828bddd11b 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
- struct bkey_i *k;
-
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bpos_ge(k->k.p, bkey_next(k)->k.p));
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index fe759c7031e0..e687e0e9aede 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
}
#define for_each_keylist_key(_keylist, _k) \
- for (_k = (_keylist)->keys; \
+ for (struct bkey_i *_k = (_keylist)->keys; \
_k != (_keylist)->top; \
_k = bkey_next(_k))
static inline u64 keylist_sectors(struct keylist *keys)
{
- struct bkey_i *k;
u64 ret = 0;
for_each_keylist_key(keys, k)
ret += k->k.size;
-
return ret;
}
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 8640f7dee0de..ad598105c587 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
int bch2_resume_logged_ops(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
resume_logged_op(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
- return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k));
}
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
- int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/*
* This needs to be a fatal error because we've left an unfinished
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
new file mode 100644
index 000000000000..6a4bf7129dba
--- /dev/null
+++ b/fs/bcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index a5cc0ed195d6..7a4ca5a28b3e 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -147,18 +147,13 @@ fsck_err:
int bch2_check_lrus(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = POS_MIN;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index 1f0801e2e565..bf0ef668fd38 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div);
/**
* mean_and_variance_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_get_mean(struct mean_and_variance s)
{
@@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
/**
* mean_and_variance_get_variance() - get variance from @s1
+ * @s1: mean and variance number of samples and sums
*
* see linked pdf equation 12.
*/
@@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
/**
* mean_and_variance_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_get_stddev(struct mean_and_variance s)
{
@@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
/**
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s1: ..
- * @s2: ..
+ * @s: mean and variance number of samples and their sums
+ * @x: new value to include in the &mean_and_variance_weighted
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
@@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
{
@@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
+ * @s: mean and variance number of samples and their sums
*/
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
{
@@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
{
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 647505010b39..b2be565bb8f2 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -12,9 +12,12 @@
/*
* u128_u: u128 user mode, because not all architectures support a real int128
* type
+ *
+ * We don't use this version in userspace, because in userspace we link with
+ * Rust and rustc has issues with u128.
*/
-#ifdef __SIZEOF_INT128__
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
typedef struct {
unsigned __int128 v;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e3a51f6d6c9b..5623cee3ef86 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
int ret = 0;
@@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
@@ -145,10 +143,9 @@ retry:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "updating btree node key");
+ bch_err_msg(c, ret, "updating btree node key");
+ if (ret)
break;
- }
next:
bch2_btree_iter_next_node(&iter);
}
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 54830ee0ed88..bf68ea49447b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -27,12 +29,53 @@
#include <linux/ioprio.h>
#include <linux/kthread.h>
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_OPS()
+#undef x
+ NULL
+};
+
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs: ");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target: ");
+ prt_tab(out);
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression: ");
+ prt_tab(out);
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas: ");
+ prt_tab(out);
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@@ -63,7 +106,7 @@ struct moving_io {
struct data_update write;
/* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
+ struct bio_vec bi_inline_vecs[];
};
static void move_free(struct moving_io *io)
@@ -104,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@@ -152,7 +204,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
-static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
bch2_trans_unlock_long(ctxt->trans);
@@ -211,7 +263,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
trace_move_data(c, stats);
}
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
@@ -234,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ trace_move_extent2(c, k, &io_opts, &data_opts);
+
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
- trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -342,7 +395,8 @@ err:
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
+ count_event(c, move_extent_start_fail);
+
if (trace_move_extent_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
@@ -364,13 +418,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
int ret = 0;
if (io_opts->cur_inum != extent_k.k->p.inode) {
- struct btree_iter iter;
- struct bkey_s_c k;
-
io_opts->d.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
if (k.k->p.offset != extent_k.k->p.inode)
break;
@@ -383,11 +434,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
- ret = darray_push(&io_opts->d, e);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
+ darray_push(&io_opts->d, e);
+ }));
io_opts->cur_inum = extent_k.k->p.inode;
}
@@ -395,12 +443,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (extent_k.k->p.snapshot) {
- struct snapshot_io_opts_entry *i;
+ if (extent_k.k->p.snapshot)
darray_for_each(io_opts->d, i)
if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
return &i->io_opts;
- }
return &io_opts->fs_io_opts;
}
@@ -628,7 +674,7 @@ int bch2_move_data(struct bch_fs *c,
return ret;
}
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
@@ -664,21 +710,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
bch2_trans_iter_exit(trans, &iter);
- if (ret) {
- bch_err_msg(c, ret, "looking up alloc key");
+ bch_err_msg(c, ret, "looking up alloc key");
+ if (ret)
goto err;
- }
a = bch2_alloc_to_v4(k, &a_convert);
- dirty_sectors = a->dirty_sectors;
+ dirty_sectors = bch2_bucket_sectors_dirty(*a);
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
- ret = bch2_btree_write_buffer_flush(trans);
- if (ret) {
- bch_err_msg(c, ret, "flushing btree write buffer");
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ if (ret)
goto err;
- }
while (!(ret = bch2_move_ratelimit(ctxt))) {
if (is_kthread && kthread_should_stop())
@@ -697,9 +741,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
break;
if (!bp.level) {
- const struct bch_extent_ptr *ptr;
- unsigned i = 0;
-
k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -722,6 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
data_opts.target = io_opts.background_target;
data_opts.rewrite_ptrs = 0;
+ unsigned i = 0;
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
if (ptr->dev == bucket.inode) {
data_opts.rewrite_ptrs |= 1U << i;
@@ -763,6 +805,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
+ unsigned sectors = btree_ptr_sectors_written(&b->key);
+
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -772,11 +816,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate,
- c->opts.btree_node_size >> 9);
+ bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@@ -789,31 +832,13 @@ err:
return ret;
}
-int bch2_evacuate_bucket(struct bch_fs *c,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- struct moving_context ctxt;
- int ret;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
+ struct bbpos start,
+ struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
@@ -823,7 +848,7 @@ static int bch2_move_btree(struct bch_fs *c,
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
- enum btree_id id;
+ enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
@@ -834,15 +859,15 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
- id++) {
- stats->pos = BBPOS(id, POS_MIN);
+ for (btree = start.btree;
+ btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ btree ++) {
+ stats->pos = BBPOS(btree, POS_MIN);
- if (!bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, btree)->b)
continue;
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
@@ -852,8 +877,8 @@ retry:
if (kthread && kthread_should_stop())
break;
- if ((cmp_int(id, end_btree_id) ?:
- bpos_cmp(b->key.k.p, end_pos)) > 0)
+ if ((cmp_int(btree, end.btree) ?:
+ bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
@@ -910,7 +935,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
unsigned i = 0;
@@ -990,8 +1014,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
int ret;
ret = bch2_move_btree(c,
- 0, POS_MIN,
- BTREE_ID_NR, SPOS_MAX,
+ BBPOS_MIN,
+ BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@@ -1006,79 +1030,109 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
return ret;
}
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= replicas) {
+ data_opts->kill_ptrs |= BIT(i);
+ durability -= d;
+ }
+
+ i++;
+ }
+
+ return data_opts->kill_ptrs != 0;
+}
+
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
+ struct bbpos start = BBPOS(op.start_btree, op.start_pos);
+ struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
+ if (op.op >= BCH_DATA_OP_NR)
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
+
switch (op.op) {
- case BCH_DATA_OP_REREPLICATE:
- bch2_move_stats_init(stats, "rereplicate");
+ case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_MIGRATE:
+ case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch2_move_stats_init(stats, "rewrite_old_nodes");
+ case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
- bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_drop_extra_replicas:
+ ret = bch2_move_btree(c, start, end,
+ drop_extra_replicas_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
+ bch2_move_stats_exit(stats, c);
return ret;
}
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- prt_printf(out, "%s: data type=%s pos=",
- stats->name,
- bch2_data_types[stats->data_type]);
+ prt_printf(out, "%s: data type==", stats->name);
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 0906aa2d1de2..9baf3093a678 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -75,12 +75,15 @@ do { \
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
+extern const char * const bch2_data_ops_strs[];
+
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_moving_ctxt_flush_all(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
int bch2_move_ratelimit(struct moving_context *);
@@ -133,23 +136,17 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct moving_context *,
+int bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
-int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
- struct data_update_opts,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, char *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index a84e79f79e5e..69e06a84dad4 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
- b->sectors = a->dirty_sectors;
+ b->sectors = bch2_bucket_sectors_dirty(*a);
ret = data_type_movable(a->data_type) &&
a->fragmentation_lru &&
@@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
move_buckets_wait(ctxt, buckets_in_flight, false);
- ret = bch2_btree_write_buffer_flush(trans);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ return ret;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
@@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
saw++;
- if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
+ if (ret2 < 0)
+ goto err;
+
+ if (!ret2)
not_movable++;
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
- if (ret2 >= 0)
- sectors += b.sectors;
+ ret2 = darray_push(buckets, b);
+ if (ret2)
+ goto err;
+ sectors += b.sectors;
}
+
+ ret2 = buckets->nr >= nr_to_get;
+err:
ret2;
}));
@@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
- struct move_bucket *i;
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
@@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt,
break;
}
- ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@@ -259,19 +267,16 @@ err:
*/
unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned dev_idx;
s64 wait = S64_MAX, fragmented_allowed, fragmented;
- unsigned i;
- for_each_rw_member(ca, c, dev_idx) {
+ for_each_rw_member(c, ca) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
ca->mi.bucket_size) >> 1);
fragmented = 0;
- for (i = 0; i < BCH_DATA_NR; i++)
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
fragmented += usage.d[i].fragmented;
@@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg)
if (!buckets)
return -ENOMEM;
ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
if (ret) {
kfree(buckets);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
@@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg)
if (!c->copy_gc_enabled) {
move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->copy_gc_enabled);
+ kthread_wait_freezable(c->copy_gc_enabled ||
+ kthread_should_stop());
}
if (unlikely(freezing(current))) {
@@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c)
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- bch_err_msg(c, ret, "creating copygc thread");
+ bch_err_msg(c, ret, "creating copygc thread");
+ if (ret)
return ret;
- }
get_task_struct(t);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 8dd4046cca41..b1ed0b9a20d3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};
@@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: not a multiple of 512",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
if (err)
prt_printf(err, "%s: must be a power of two",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if (opt->fn.validate)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 8526f177450a..9a4b7faa3765 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@@ -233,11 +233,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(btree_write_buffer_size, u32, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(16, (1U << 20) - 1), \
- BCH2_NO_SB_OPT, 1U << 13, \
- NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
@@ -394,7 +389,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@@ -419,6 +414,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allocate the buckets_nouse bitmap") \
+ x(stdio, u64, \
+ 0, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Pointer to a struct stdio_redirect") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
@@ -458,7 +458,13 @@ enum fsck_err_opts {
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
- "to have already been replicated n times")
+ "to have already been replicated n times") \
+ x(btree_node_prefetch, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\
+ " prefetched sequentially")
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
@@ -558,6 +564,11 @@ struct bch_io_opts {
#undef x
};
+static inline unsigned background_compression(struct bch_io_opts opts)
+{
+ return opts.background_compression ?: opts.compression;
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index a54647c36b85..e68b34eab90a 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -599,14 +599,9 @@ advance:
int bch2_fs_quota_read(struct bch_fs *c)
{
- struct bch_sb_field_quota *sb_quota;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
return -BCH_ERR_ENOSPC_sb_quota;
@@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c)
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
- POS_MIN, BTREE_ITER_PREFETCH, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_inodes,
- POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- bch2_fs_quota_read_inode(trans, &iter, k));
-
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
new file mode 100644
index 000000000000..dc34347ef6c7
--- /dev/null
+++ b/fs/bcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 3319190b8d9c..22d1017aa49b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
@@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
return bkey_s_c_null;
}
+ if (trace_rebalance_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "target=");
+ bch2_target_to_text(&buf, c, r->target);
+ prt_str(&buf, " compression=");
+ bch2_compression_opt_to_text(&buf, r->compression);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ trace_rebalance_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
return k;
}
@@ -239,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
- compression = io_opts->background_compression ?: io_opts->compression;
+ compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression :
- (io_opts->background_compression ?: io_opts->compression);
+ compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -273,7 +286,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
@@ -317,8 +330,16 @@ static int do_rebalance(struct moving_context *ctxt)
BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS);
- while (!bch2_move_ratelimit(ctxt) &&
- !kthread_wait_freezable(r->enabled)) {
+ while (!bch2_move_ratelimit(ctxt)) {
+ if (!r->enabled) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ kthread_wait_freezable(r->enabled ||
+ kthread_should_stop());
+ }
+
+ if (kthread_should_stop())
+ break;
+
bch2_trans_begin(trans);
ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
@@ -348,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
@@ -362,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
- int ret;
set_freezable();
@@ -370,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
- while (!kthread_should_stop() &&
- !(ret = do_rebalance(&ctxt)))
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);
@@ -447,10 +467,9 @@ int bch2_rebalance_start(struct bch_fs *c)
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating rebalance thread");
+ bch_err_msg(c, ret, "creating rebalance thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 5cf7d0532002..9127d0e3ca2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
+ if (k->overwritten)
+ return 0;
+
+ trans->journal_res.seq = k->journal_seq;
+
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
@@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key **keys_sorted, *k;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- size_t i;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
- if (!keys_sorted)
- return -BCH_ERR_ENOMEM_journal_replay;
-
- for (i = 0; i < keys->nr; i++)
- keys_sorted[i] = &keys->d[i];
-
- sort(keys_sorted, keys->nr,
- sizeof(keys_sorted[0]),
- journal_sort_seq_cmp, NULL);
-
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
@@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient - better locality of btree access - but some might fail if
+ * that would cause a journal deadlock.
+ */
+ for (size_t i = 0; i < keys->nr; i++) {
+ cond_resched();
+
+ struct journal_key *k = keys->d + i;
+
+ /* Skip fastpath if we're low on space in the journal */
+ ret = c->journal.watermark ? -1 :
+ commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
cond_resched();
+ struct journal_key *k = *kp;
+
replay_now_at(j, k->journal_seq);
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- (!k->allocated
- ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
- : 0),
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
bch2_journal_replay_key(trans, k));
- if (ret) {
- bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+ bch2_btree_id_str(k->btree_id), k->level);
+ if (ret)
goto err;
- }
+
+ BUG_ON(!k->overwritten);
}
+ /*
+ * We need to put our btree_trans before calling flush_all_pins(), since
+ * that will use a btree_trans internally
+ */
+ bch2_trans_put(trans);
+ trans = NULL;
+
if (!c->opts.keep_journal)
bch2_journal_keys_put_initial(c);
@@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c)
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
- if (keys->nr && !ret)
+ if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
- kvfree(keys_sorted);
-
- if (ret)
- bch_err_fn(c, ret);
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
@@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
- c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
@@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
-
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
@@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean)
{
- struct jset_entry *entry;
- int ret;
-
if (clean) {
- for (entry = clean->start;
+ for (struct jset_entry *entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c,
continue;
vstruct_for_each(&i->j, entry) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -474,10 +497,9 @@ err:
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c)
{
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean)
+ return bch2_fs_read_write_early(c);
return 0;
}
@@ -589,17 +624,15 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_version_to_text(&buf, new_version);
prt_newline(&buf);
- u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
- if (recovery_passes) {
- if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
- prt_str(&buf, "fsck required");
- else {
- prt_str(&buf, "running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
- }
-
- c->recovery_passes_explicit |= recovery_passes;
- c->opts.fix_errors = FSCK_FIX_yes;
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_upgrade(c, old_version, new_version);
+ passes = ext->recovery_passes_required[0] & ~passes;
+
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
bch_info(c, "%s", buf.buf);
@@ -625,7 +658,7 @@ u64 bch2_fsck_recovery_passes(void)
static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
return false;
@@ -642,39 +675,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
int ret;
- c->curr_recovery_pass = pass;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
- if (should_run_recovery_pass(c, pass)) {
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ return 0;
+}
- if (!(p->when & PASS_SILENT))
- printk(KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- printk(KERN_CONT " done\n");
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
- c->recovery_passes_complete |= BIT_ULL(pass);
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
}
- return 0;
+ return ret;
}
-static int bch2_run_recovery_passes(struct bch_fs *c)
+int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
continue;
+ }
if (ret)
break;
- c->curr_recovery_pass++;
}
return ret;
@@ -779,6 +835,9 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
@@ -919,13 +978,17 @@ use_clean:
if (ret)
goto err;
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
+ test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags)) {
+ bch2_flush_fsck_errs(c);
+
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ clear_bit(BCH_FS_errors_fixed, &c->flags);
c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
@@ -933,13 +996,13 @@ use_clean:
if (ret)
goto err;
- if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
- test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
}
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
}
if (enabled_qtypes(c)) {
@@ -958,13 +1021,13 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags) &&
+ if (!test_bit(BCH_FS_error, &c->flags) &&
!(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ if (!test_bit(BCH_FS_error, &c->flags)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (ext &&
(!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
@@ -976,8 +1039,8 @@ use_clean:
}
if (c->opts.fsck &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
write_sb = true;
@@ -993,8 +1056,12 @@ use_clean:
bch2_move_stats_init(&stats, "recovery");
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c) ?:
+ struct printbuf buf = PRINTBUF;
+ bch2_version_to_text(&buf, c->sb.version_min);
+ bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+ printbuf_exit(&buf);
+
+ ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
@@ -1007,7 +1074,6 @@ use_clean:
ret = 0;
out:
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
@@ -1015,13 +1081,14 @@ out:
bch2_journal_keys_put_initial(c);
kfree(clean);
- if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
+ if (!ret &&
+ test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
+ !c->opts.nochanges) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
err:
fsck_err:
@@ -1034,8 +1101,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
- struct bch_dev *ca;
- unsigned i;
int ret;
bch_notice(c, "initializing new filesystem");
@@ -1054,13 +1119,12 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ set_bit(BCH_FS_may_go_rw, &c->flags);
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
ret = bch2_fs_journal_alloc(c);
@@ -1088,7 +1152,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
ca->new_fs_bucket_idx = 0;
ret = bch2_fs_freespace_init(c);
@@ -1112,10 +1176,9 @@ int bch2_fs_initialize(struct bch_fs *c)
packed_inode.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "creating root directory");
+ bch_err_msg(c, ret, "creating root directory");
+ if (ret)
goto err;
- }
bch2_inode_init_early(c, &lostfound_inode);
@@ -1126,10 +1189,11 @@ int bch2_fs_initialize(struct bch_fs *c)
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL, (subvol_inum) { 0 }, 0));
- if (ret) {
- bch_err_msg(c, ret, "creating lost+found");
+ bch_err_msg(c, ret, "creating lost+found");
+ if (ret)
goto err;
- }
+
+ c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1138,10 +1202,9 @@ int bch2_fs_initialize(struct bch_fs *c)
}
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err_msg(c, ret, "writing first journal entry");
+ bch_err_msg(c, ret, "writing first journal entry");
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
@@ -1152,6 +1215,6 @@ int bch2_fs_initialize(struct bch_fs *c)
return 0;
err:
- bch_err_fn(ca, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 3a554b0751d0..4e9d24719b2e 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
}
}
+int bch2_run_online_recovery_passes(struct bch_fs *);
u64 bch2_fsck_recovery_passes(void);
int bch2_fs_recovery(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index d37c6fd30e38..fa0c8efd2a1b 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -6,6 +6,7 @@
#define PASS_FSCK BIT(1)
#define PASS_UNCLEAN BIT(2)
#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
/*
* Passes may be reordered, but the second field is a persistent identifier and
@@ -22,18 +23,18 @@
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_FSCK) \
- x(check_lrus, 11, PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_FSCK) \
- x(check_extents_to_backpointers, 14, PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_FSCK) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
- x(check_snapshot_trees, 18, PASS_FSCK) \
- x(check_snapshots, 19, PASS_FSCK) \
- x(check_subvols, 20, PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_FSCK) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
@@ -41,8 +42,8 @@
x(check_indirect_extents, 26, PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_FSCK) \
- x(check_directory_structure, 30, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 37d16e04e671..c47c66c2b394 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io_misc.h"
@@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
- prt_printf(err, "idx < front_pad (%llu < %u)",
- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
- return -EINVAL;
- }
-
- return 0;
+ bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
+ c, err, reflink_p_front_pad_bad,
+ "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+fsck_err:
+ return ret;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
@@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return true;
}
+static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ refcount = bkey_refcount(bkey_i_to_s(k));
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 start = le64_to_cpu(p.v->idx);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ if (next_idx <= start) {
+ bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
+ } else if (*idx >= end) {
+ bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
+ } else {
+ bkey_error_init(update);
+ update->k.p = p.k->p;
+ update->k.p.offset = next_idx;
+ update->k.size = next_idx - *idx;
+ set_bkey_val_u64s(&update->k, 0);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
+
+ u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ while (idx < end && !ret)
+ ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ size_t l = 0, r = c->reflink_gc_nr;
+
+ while (l < r) {
+ size_t m = l + (r - l) / 2;
+ struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
+ }
+
+ return ret;
+}
+
+int bch2_trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
+{
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
-static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
{
if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- set_bkey_val_u64s(&new->k, 0);;
+ new.k->type = KEY_TYPE_deleted;
+ new.k->size = 0;
+ set_bkey_val_u64s(new.k, 0);
*flags &= ~BTREE_TRIGGER_INSERT;
}
}
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- check_indirect_extent_deleting(new, &flags);
-
- if (old.k->type == KEY_TYPE_reflink_v &&
- new->k.type == KEY_TYPE_reflink_v &&
- old.k->u64s == new->k.u64s &&
- !memcmp(bkey_s_c_to_reflink_v(old).v->start,
- bkey_i_to_reflink_v(new)->v.start,
- bkey_val_bytes(&new->k) - 8))
- return 0;
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT))
+ check_indirect_extent_deleting(new, &flags);
- return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+ return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
@@ -152,9 +324,9 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
+ struct bkey_s_c old, struct bkey_s new,
unsigned flags)
{
check_indirect_extent_deleting(new, &flags);
@@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
- refcount = bkey_refcount(r_v);
+ refcount = bkey_refcount(bkey_i_to_s(r_v));
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
@@ -314,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -366,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
@@ -398,7 +575,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
}
bch2_trans_iter_exit(trans, &inode_iter);
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 8ccf3f9c4939..4d8867289717 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
- .trans_trigger = bch2_trans_mark_reflink_p, \
- .atomic_trigger = bch2_mark_reflink_p, \
+ .trigger = bch2_trigger_reflink_p, \
.min_val_size = 16, \
})
@@ -23,15 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_reflink_v, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@@ -39,15 +39,15 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *,
+ struct bkey_s_c, struct bkey_s,
unsigned);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
- .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+ .trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})
@@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
}
}
-static inline __le64 *bkey_refcount(struct bkey_i *k)
+static inline __le64 *bkey_refcount(struct bkey_s k)
{
- switch (k->k.type) {
+ switch (k.k->type) {
case KEY_TYPE_reflink_v:
- return &bkey_i_to_reflink_v(k)->v.refcount;
+ return &bkey_s_to_reflink_v(k).v->refcount;
case KEY_TYPE_indirect_inline_data:
- return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ return &bkey_s_to_indirect_inline_data(k).v->refcount;
default:
return NULL;
}
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
new file mode 100644
index 000000000000..6772eebb1fc6
--- /dev/null
+++ b/fs/bcachefs/reflink_format.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 2008fe8bf706..cc2672c12031 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -9,9 +9,15 @@
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, size_t size)
+{
+ return memcmp(l, r, size);
+}
+
/* Replicas tracking - in memory: */
-static void verify_replicas_entry(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
@@ -26,49 +32,39 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
-void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry *e)
+ struct bch_replicas_entry_v1 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb,
struct printbuf *err)
{
@@ -98,7 +94,7 @@ bad:
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
@@ -111,7 +107,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
}
static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -131,7 +127,7 @@ static void extent_to_replicas(struct bkey_s_c k,
}
static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
@@ -144,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
-void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@@ -169,12 +165,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
bch2_replicas_entry_sort(e);
}
-void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
- unsigned i;
-
BUG_ON(!data_type ||
data_type == BCH_DATA_sb ||
data_type >= BCH_DATA_NR);
@@ -183,8 +177,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
e->nr_devs = 0;
e->nr_required = 1;
- for (i = 0; i < devs.nr; i++)
- e->devs[e->nr_devs++] = devs.devs[i];
+ darray_for_each(devs, i)
+ e->devs[e->nr_devs++] = *i;
bch2_replicas_entry_sort(e);
}
@@ -192,7 +186,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
@@ -225,7 +219,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
@@ -243,7 +237,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
}
int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
@@ -251,13 +245,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bool marked;
@@ -374,7 +368,7 @@ err:
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
@@ -399,7 +393,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
@@ -464,7 +458,7 @@ err:
goto out;
}
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
@@ -515,7 +509,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@@ -590,7 +584,7 @@ retry:
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
@@ -621,7 +615,7 @@ retry:
}
int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
@@ -654,7 +648,7 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_entry *e, *dst;
+ struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@@ -692,7 +686,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
- entry_size += sizeof(struct bch_replicas_entry) -
+ entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@@ -703,7 +697,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry *dst =
+ struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
@@ -747,7 +741,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry *src;
+ struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@@ -785,7 +779,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *dst, *src;
+ struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
@@ -833,10 +827,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
- memcmp, NULL);
+ bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
int ret = bch2_replicas_entry_validate(e, sb, err);
@@ -844,7 +838,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
return ret;
if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry *n =
+ struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@@ -881,7 +875,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@@ -943,7 +937,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
@@ -1003,7 +997,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
- struct bch_replicas_entry *r;
+ struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index f70a642775d1..654a4b26d3a3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -6,28 +6,28 @@
#include "eytzinger.h"
#include "replicas_types.h"
-void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_sb *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-static inline struct bch_replicas_entry *
+static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
-void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
@@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
@@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index 5cfff489bbc3..ac90d142c4e8 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -5,12 +5,12 @@
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
- struct bch_replicas_entry *entries;
+ struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {
@@ -21,7 +21,7 @@ struct replicas_delta_list {
u64 nr_inodes;
u64 persistent_reserved[BCH_REPLICAS_MAX];
struct {} memset_end;
- struct replicas_delta d[0];
+ struct replicas_delta d[];
};
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index c76ad8ea5e4a..b6bf0ebe7e84 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
- struct bch_dev *ca;
- unsigned i, dev;
-
percpu_down_read(&c->mark_lock);
if (!journal_seq) {
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
} else {
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
@@ -210,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{
@@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
@@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
@@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
"embedded variable length struct");
}
- for_each_member_device(ca, c, dev) {
+ for_each_member_device(c, ca) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
@@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(dev);
- u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->dev = cpu_to_le32(ca->dev_idx);
- for (i = 0; i < BCH_DATA_NR; i++) {
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
@@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_up_read(&c->mark_lock);
- for (i = 0; i < 2; i++) {
+ for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c
index 02a996e06a64..7dc898761bb3 100644
--- a/fs/bcachefs/counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
/* BCH_SB_FIELD_counters */
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h
index 4778aa19bf34..81f8aec9fcb1 100644
--- a/fs/bcachefs/counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
-
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
new file mode 100644
index 000000000000..62ea478215d0
--- /dev/null
+++ b/fs/bcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 4919237bbe73..441dcb1bf160 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -12,33 +12,105 @@
#include "sb-errors.h"
#include "super-io.h"
+#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
+
/*
- * Downgrade table:
- * When dowgrading past certain versions, we need to run certain recovery passes
- * and fix certain errors:
+ * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
*
* x(version, recovery_passes, errors...)
*/
+#define UPGRADE_TABLE() \
+ x(backpointers, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_skiplists, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
+ BCH_FSCK_ERR_snapshot_bad_depth, \
+ BCH_FSCK_ERR_snapshot_bad_skiplist) \
+ x(deleted_inodes, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
+ x(rebalance_work, \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
#define DOWNGRADE_TABLE()
-struct downgrade_entry {
+struct upgrade_downgrade_entry {
u64 recovery_passes;
u16 version;
u16 nr_errors;
const u16 *errors;
};
-#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
+UPGRADE_TABLE()
+#undef x
+
+static const struct upgrade_downgrade_entry upgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
+ .errors = upgrade_##ver##_errors, \
+},
+UPGRADE_TABLE()
+#undef x
+};
+
+void bch2_sb_set_upgrade(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for (const struct upgrade_downgrade_entry *i = upgrade_table;
+ i < upgrade_table + ARRAY_SIZE(upgrade_table);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ u64 passes = i->recovery_passes;
+
+ if (passes & RECOVERY_PASS_ALL_FSCK)
+ passes |= bch2_fsck_recovery_passes();
+ passes &= ~RECOVERY_PASS_ALL_FSCK;
+
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(passes));
+
+ for (const u16 *e = i->errors;
+ e < i->errors + i->nr_errors;
+ e++) {
+ __set_bit(*e, c->sb.errors_silent);
+ ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
+ }
+ }
+}
+
+#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x
-static const struct downgrade_entry downgrade_table[] = {
+static const struct upgrade_downgrade_entry downgrade_table[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(ver_##errors), \
- .errors = ver_##errors, \
+ .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
+ .errors = downgrade_##ver##_errors, \
},
DOWNGRADE_TABLE()
#undef x
@@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
darray_char table = {};
int ret = 0;
- for (const struct downgrade_entry *src = downgrade_table;
+ for (const struct upgrade_downgrade_entry *src = downgrade_table;
src < downgrade_table + ARRAY_SIZE(downgrade_table);
src++) {
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
index bc48fd2ca70e..57e6c916fc73 100644
--- a/fs/bcachefs/sb-downgrade.h
+++ b/fs/bcachefs/sb-downgrade.h
@@ -5,6 +5,7 @@
extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 3504c2d09c29..c08aacdfd073 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -248,7 +248,9 @@
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243)
+ x(hash_table_key_wrong_offset, 243) \
+ x(unlinked_inode_not_on_deleted_list, 244) \
+ x(reflink_p_front_pad_bad, 245)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index bed0f857fe5b..a45354d2acde 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "(never)");
prt_newline(out);
+ prt_printf(out, "Last superblock write:");
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.seq));
+ prt_newline(out);
+
prt_printf(out, "State:");
prt_tab(out);
prt_printf(out, "%s",
@@ -246,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -254,11 +259,16 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
+ prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
+ prt_str(out, "Durability:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_newline(out);
+
prt_printf(out, "Discard:");
prt_tab(out);
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
@@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- struct bch_dev *ca;
- unsigned i, e;
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
- for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
rcu_read_unlock();
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 03613e3eb8e3..be0a94183271 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
+#include "darray.h"
+
extern char * const bch2_member_error_strs[];
static inline struct bch_member *
@@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs.nr; i++)
- if (devs.devs[i] == dev)
+ darray_for_each(devs, i)
+ if (*i == dev)
return true;
-
return false;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs->nr; i++)
- if (devs->devs[i] == dev) {
- array_remove_item(devs->devs, devs->nr, i);
+ darray_for_each(*devs, i)
+ if (*i == dev) {
+ darray_remove_item(devs, i);
return;
}
}
@@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
- devs->devs[devs->nr++] = dev;
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
+ devs->data[devs->nr++] = dev;
}
}
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
{
- return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+ return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
}
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
- const struct bch_devs_mask *mask)
+static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
+ const struct bch_devs_mask *mask)
{
struct bch_dev *ca = NULL;
- while ((*iter = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
- : *iter) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
+ while ((idx = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, idx)
+ : idx) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[idx],
lockdep_is_held(&c->state_lock))))
- (*iter)++;
+ idx++;
return ca;
}
-#define for_each_member_device_rcu(ca, c, iter, mask) \
- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_devs_mask *mask)
+{
+ return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
+}
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+#define for_each_member_device_rcu(_c, _ca, _mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->ref);
rcu_read_lock();
- if ((ca = __bch2_next_dev(c, iter, NULL)))
+ if ((ca = __bch2_next_dev(c, ca, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter
/*
* If you break early, you must drop your ref on the current device
*/
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch2_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
+#define __for_each_member_device(_c, _ca) \
+ for (; (_ca = bch2_get_next_dev(_c, _ca));)
+
+#define for_each_member_device(_c, _ca) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_dev(_c, _ca));)
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
+ struct bch_dev *ca,
+ unsigned state_mask)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
rcu_read_lock();
- while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+ while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
+ ;
rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
+#define __for_each_online_member(_c, _ca, state_mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
+#define for_each_online_member(c, ca) \
+ __for_each_online_member(c, ca, ~0)
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+#define for_each_rw_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+#define for_each_readable_member(c, ca) \
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
/*
* If a key exists that references a device, the device won't be going away and
@@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
- struct bch_dev *ca;
- unsigned i;
memset(&devs, 0, sizeof(devs));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
__set_bit(ca->dev_idx, devs.d);
return devs;
}
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 97790445e67a..3a494c5d1247 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-static inline bool six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_owner_running(struct six_lock *lock)
{
- struct task_struct *owner;
- bool ret;
-
- if (need_resched())
- return false;
-
+ /*
+ * When there's no owner, we might have preempted between the owner
+ * acquiring the lock and setting the owner field. If we're an RT task
+ * that will live-lock because we won't let the owner complete.
+ */
rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- ret = !owner || owner_on_cpu(owner);
+ struct task_struct *owner = READ_ONCE(lock->owner);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
rcu_read_unlock();
return ret;
}
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner,
- u64 end_time)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
- bool ret = true;
unsigned loop = 0;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner_on_cpu(owner) || need_resched()) {
- ret = false;
- break;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
u64 end_time;
if (type == SIX_LOCK_write)
return false;
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
+ if (lock->wait_list.next != &wait->list)
+ return false;
- if (!osq_lock(&lock->osq))
- goto fail;
+ if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
+ return false;
+ preempt_disable();
end_time = sched_clock() + 10 * NSEC_PER_USEC;
- while (1) {
- struct task_struct *owner;
-
+ while (!need_resched() && six_owner_running(lock)) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner, end_time))
- break;
-
- if (do_six_trylock(lock, type, false)) {
- osq_unlock(&lock->osq);
+ if (smp_load_acquire(&wait->lock_acquired)) {
preempt_enable();
return true;
}
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
break;
+ }
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
cpu_relax();
}
- osq_unlock(&lock->osq);
-fail:
preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
return false;
}
-#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
return false;
}
@@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
trace_contention_begin(lock, 0);
lock_contended(&lock->dep_map, ip);
- if (six_optimistic_spin(lock, type))
- goto out;
-
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
@@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
ret = 0;
}
+ if (six_optimistic_spin(lock, wait, type))
+ goto out;
+
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index 4c268b0b8316..68d46fd7f391 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -15,7 +15,7 @@
* will have to take write locks for the full duration of the operation.
*
* But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at thte start of the operation,
+ * not with readers, we can take intent locks at the start of the operation,
* and then take write locks only for the actual update to each individual
* nodes, without deadlocking.
*
@@ -65,8 +65,8 @@
*
* Reentrancy:
*
- * Six locks are not by themselves reentrent, but have counters for both the
- * read and intent states that can be used to provide reentrency by an upper
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
* layer that tracks held locks. If a lock is known to already be held in the
* read or intent state, six_lock_increment() can be used to bump the "lock
* held in this state" counter, increasing the number of unlock calls that
@@ -127,10 +127,6 @@
#include <linux/sched.h>
#include <linux/types.h>
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
-#include <linux/osq_lock.h>
-#endif
-
enum six_lock_type {
SIX_LOCK_read,
SIX_LOCK_intent,
@@ -143,9 +139,6 @@ struct six_lock {
unsigned intent_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
- struct optimistic_spin_queue osq;
-#endif
raw_spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 5dac038f0851..45f67e8b29eb 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
struct snapshot_table *t;
bool ret;
- EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+ EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
rcu_read_lock();
t = rcu_dereference(c->snapshots);
@@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
mutex_unlock(&c->snapshot_table_lock);
}
-int bch2_mark_snapshot(struct btree_trans *trans,
+static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans,
__set_is_ancestor_bitmap(c, id);
if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
@@ -330,6 +330,14 @@ err:
return ret;
}
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
+{
+ return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
@@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_subvolume s;
bool found = false;
int ret;
@@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
continue;
- s = bkey_s_c_to_subvolume(k);
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
continue;
if (!BCH_SUBVOLUME_SNAP(s.v)) {
@@ -582,19 +589,13 @@ fsck_err:
*/
int bch2_check_snapshot_trees(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
-
- if (ret)
- bch_err(c, "error %i checking snapshot trees", ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);
- if (le32_to_cpu(s.depth) != real_depth &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n %s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
+ c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (!ret &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -856,22 +854,17 @@ fsck_err:
int bch2_check_snapshots(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
/*
* We iterate backwards as checking/fixing the depth field requires that
* the parent's depth already be correct:
*/
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_snapshot(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -1060,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1067,7 +1062,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
- ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
if (ret)
goto err;
@@ -1315,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
struct bkey_i_snapshot *s;
- u32 *i;
int ret;
if (k.k->type != KEY_TYPE_snapshot)
@@ -1368,23 +1362,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_snapshot snap;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
- u32 *i, id;
+ u32 id;
int ret = 0;
- if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (!test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_fs_read_write_early(c);
- if (ret) {
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ if (ret)
return ret;
- }
}
trans = bch2_trans_get(c);
@@ -1397,37 +1387,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
POS_MIN, 0, k,
NULL, NULL, 0,
bch2_delete_redundant_snapshot(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ if (ret)
goto err;
- }
- ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
bch2_snapshot_set_equiv(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ if (ret)
goto err;
- }
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
if (k.k->type != KEY_TYPE_snapshot)
continue;
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_list_add(c, &deleted, k.k->p.offset);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- bch_err_msg(c, ret, "walking snapshots");
+ BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
+ ? snapshot_list_add(c, &deleted, k.k->p.offset)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
+ if (ret)
goto err;
- }
for (id = 0; id < BTREE_ID_NR; id++) {
struct bpos last_pos = POS_MIN;
@@ -1449,36 +1431,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
- if (ret) {
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
goto err;
- }
}
bch2_trans_unlock(trans);
down_write(&c->snapshot_create_lock);
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
u32 snapshot = k.k->p.offset;
u32 equiv = bch2_snapshot_equiv(c, snapshot);
- if (equiv != snapshot)
- snapshot_list_add(c, &deleted_interior, snapshot);
- }
- bch2_trans_iter_exit(trans, &iter);
+ equiv != snapshot
+ ? snapshot_list_add(c, &deleted_interior, snapshot)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err_create_lock;
@@ -1489,7 +1471,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
goto err_create_lock;
@@ -1497,19 +1479,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
darray_for_each(deleted, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
darray_for_each(deleted_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
err_create_lock:
up_write(&c->snapshot_create_lock);
@@ -1517,8 +1497,7 @@ err:
darray_exit(&deleted_interior);
darray_exit(&deleted);
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1680,7 +1659,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
if (BCH_SNAPSHOT_DELETED(snap.v) ||
bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
(ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return 0;
}
@@ -1689,25 +1668,20 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
int bch2_snapshots_read(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
bch2_snapshot_set_equiv(trans, k) ?:
bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
- kfree(rcu_dereference_protected(c->snapshots, true));
+ kvfree(rcu_dereference_protected(c->snapshots, true));
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index f09a22f44239..7c66ffc06385 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
.val_to_text = bch2_snapshot_to_text, \
- .atomic_trigger = bch2_mark_snapshot, \
+ .trigger = bch2_mark_snapshot, \
.min_val_size = 24, \
})
@@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (*i == id)
return true;
@@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (bch2_snapshot_is_ancestor(c, id, *i))
return true;
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
new file mode 100644
index 000000000000..aabcd3a74cd9
--- /dev/null
+++ b/fs/bcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us persistent indentifier for each tree of
+ * bch_snapshot nodes, and allow us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index ae21a8cca1b4..fcaa5a888744 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,6 +15,16 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -150,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
}
static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ unsigned flags, u32 snapshot)
{
struct bkey_s_c k;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
@@ -185,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
}
static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
@@ -246,7 +264,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- int flags,
+ bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
@@ -269,7 +287,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
}
if (!slot.path &&
- !(flags & BCH_HASH_SET_MUST_REPLACE))
+ !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +305,16 @@ found:
found = true;
not_found:
- if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, 0);
+ ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
@@ -307,7 +325,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
- struct bkey_i *insert, int flags)
+ struct bkey_i *insert,
+ bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
@@ -319,7 +338,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, flags, 0);
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 22b34a8e4d6e..7c67c28d3ef8 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans,
return ret;
if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- bch2_fs_lazy_rw(c);
-
ret = bch2_subvolume_delete(trans, iter->pos.offset);
- if (ret)
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
@@ -82,17 +79,12 @@ fsck_err:
int bch2_check_subvols(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_subvol(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
*/
static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_subvolume s;
return lockrestart_do(trans,
@@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
@@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}
@@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- if (ret) {
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ if (ret)
break;
- }
}
darray_exit(&s);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
new file mode 100644
index 000000000000..af79134b07d6
--- /dev/null
+++ b/fs/bcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 2d2e66a4e468..ae644adfc391 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,7 +20,11 @@ struct snapshot_t {
};
struct snapshot_table {
+#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
};
typedef struct {
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 78013deda9df..d60c7d27a047 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
-#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
struct bch2_metadata_version {
u16 version;
const char *name;
- u64 recovery_passes;
};
static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v, _recovery_passes) { \
+#define x(n, v) { \
.version = v, \
.name = #n, \
- .recovery_passes = _recovery_passes, \
},
BCH_METADATA_VERSIONS()
#undef x
@@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- u64 ret = 0;
-
- for (const struct bch2_metadata_version *i = bch2_metadata_versions;
- i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
- ret |= bch2_fsck_recovery_passes();
- ret |= i->recovery_passes;
- }
-
- return ret &= ~RECOVERY_PASS_ALL_FSCK;
-}
-
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f;
-
/* XXX: need locking around superblock to access optional fields */
vstruct_for_each(sb, f)
@@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- pr_err("%pg: superblock too big: want %zu but have %llu",
- sb->bdev, new_bytes, max_bytes);
+ struct printbuf buf = PRINTBUF;
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
return -BCH_ERR_ENOSPC_sb;
}
}
@@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
if (sb->fs_sb) {
struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
@@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
int rw)
{
struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
@@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
@@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c)
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
ca->mi = bch2_mi_to_cpu(&m);
}
@@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
@@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
- struct bch_csum csum;
size_t bytes;
int ret;
reread:
@@ -650,7 +627,9 @@ reread:
if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
!uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_printf(err, "Not a bcachefs superblock");
+ prt_str(err, "Not a bcachefs superblock (got magic ");
+ pr_uuid(err, sb->sb->magic.b);
+ prt_str(err, ")");
return -BCH_ERR_invalid_sb_magic;
}
@@ -673,17 +652,16 @@ reread:
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
+ if (csum_type >= BCH_CSUM_NR) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- null_nonce(), sb->sb);
-
+ struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- prt_printf(err, "bad checksum");
+ bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
return -BCH_ERR_invalid_sb_csum;
}
@@ -692,12 +670,13 @@ reread:
return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+static int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@@ -761,8 +740,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ printk(KERN_INFO "%s", err2.buf);
+ else
+ printk(KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
printbuf_reset(&err);
/*
@@ -838,6 +823,20 @@ err_no_print:
goto out;
}
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
@@ -906,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
struct printbuf err = PRINTBUF;
- unsigned i, sb = 0, nr_wrote;
+ unsigned sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -930,9 +928,14 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
- if (test_bit(BCH_FS_ERROR, &c->flags))
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ for_each_online_member(c, ca)
+ __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
+ if (test_bit(BCH_FS_error, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ if (test_bit(BCH_FS_topology_error, &c->flags))
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
@@ -943,10 +946,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_sb_from_fs(c, ca);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
printbuf_reset(&err);
ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
@@ -967,16 +970,28 @@ int bch2_write_super(struct bch_fs *c)
if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
goto out;
- for_each_online_member(ca, c, i) {
+ if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
+ bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
+ prt_str(&buf, " > ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, ")");
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_sb_not_downgraded;
+ }
+
+ for_each_online_member(c, ca) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
read_back_super(c, ca);
closure_sync(cl);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
continue;
@@ -1003,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
@@ -1013,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c)
sb++;
} while (wrote);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1025,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c)
can_mount_with_written =
bch2_have_enough_devs(c, sb_written, degraded_flags, false);
- for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
@@ -1074,13 +1089,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
/*
* Downgrade, if superblock is at a higher version than currently
* supported:
+ *
+ * c->sb will be checked before we write the superblock, so update it as
+ * well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
+ c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version > bcachefs_metadata_version_current) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->sb.version = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version_min > bcachefs_metadata_version_current) {
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->sb.version_min = bcachefs_metadata_version_current;
+ }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@@ -1173,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
return ret;
}
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
+void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
unsigned type = le32_to_cpu(f->type);
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
@@ -1182,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
+ if (ops->to_text)
+ ops->to_text(out, sb, f);
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+
if (type < BCH_SB_FIELD_NR)
prt_printf(out, "%s", bch2_sb_fields[type]);
else
@@ -1190,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, " (size %zu):", vstruct_bytes(f));
prt_newline(out);
- if (ops->to_text) {
- printbuf_indent_add(out, 2);
- ops->to_text(out, sb, f);
- printbuf_indent_sub(out, 2);
- }
+ __bch2_sb_field_to_text(out, sb, f);
}
void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
@@ -1223,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
@@ -1243,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
pr_uuid(out, sb->uuid.b);
prt_newline(out);
+ prt_printf(out, "Magic number:");
+ prt_tab(out);
+ pr_uuid(out, sb->magic.b);
+ prt_newline(out);
+
prt_str(out, "Device index:");
prt_tab(out);
prt_printf(out, "%u", sb->dev_idx);
@@ -1281,9 +1314,16 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
+ prt_printf(out, "Time of last write:");
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
prt_printf(out, "Superblock size:");
prt_tab(out);
- prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index e41e5de531a0..95e80e06316b 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, unsigned);
unsigned bch2_latest_compatible_version(unsigned);
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned,
- unsigned);
-
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
return le32_to_cpu(f->u64s) * sizeof(u64);
@@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
@@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
bool bch2_check_version_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);
+void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 818ec467a06b..b9911402b175 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
-#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
+const char * const bch2_fs_flag_strs[] = {
+#define x(n) #n,
+ BCH_FS_FLAGS()
+#undef x
+ NULL
+};
+
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ va_list args;
+ va_start(args, fmt);
+ if (likely(!stdio)) {
+ vprintk(fmt, args);
+ } else {
+ unsigned long flags;
+
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ spin_lock_irqsave(&stdio->output_lock, flags);
+ prt_vprintf(&stdio->output_buf, fmt, args);
+ spin_unlock_irqrestore(&stdio->output_lock, flags);
+
+ wake_up(&stdio->output_wait);
+ }
+ va_end(args);
+}
+
#define KTYPE(type) \
static const struct attribute_group type ## _group = { \
.attrs = type ## _files \
@@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
mutex_lock(&bch_fs_list_lock);
rcu_read_lock();
list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
goto found;
@@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, u64s =
+ unsigned nr = 0, u64s =
((sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
sizeof(u64);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
nr++;
rcu_read_unlock();
@@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
static void __bch2_fs_read_only(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, clean_passes = 0;
+ unsigned clean_passes = 0;
u64 seq = 0;
bch2_fs_ec_stop(c);
@@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
journal_cur_seq(&c->journal));
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ !test_bit(BCH_FS_emergency_ro, &c->flags))
+ set_bit(BCH_FS_clean_shutdown, &c->flags);
bch2_fs_journal_stop(&c->journal);
/*
* After stopping journal:
*/
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_allocator_remove(c, ca);
}
@@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
}
#endif
void bch2_fs_read_only(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags)) {
+ if (!test_bit(BCH_FS_rw, &c->flags)) {
bch2_journal_reclaim_stop(&c->journal);
return;
}
- BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
+
+ bch_verbose(c, "going read-only");
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
*/
- set_bit(BCH_FS_GOING_RO, &c->flags);
+ set_bit(BCH_FS_going_ro, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
#else
@@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c)
* that going RO is complete:
*/
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
- test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags) ||
+ test_bit(BCH_FS_emergency_ro, &c->flags));
+
+ bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
+ if (writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
__bch2_fs_read_only(c);
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags));
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_GOING_RO, &c->flags);
+ if (!writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
+
+ clear_bit(BCH_FS_write_disable_complete, &c->flags);
+ clear_bit(BCH_FS_going_ro, &c->flags);
+ clear_bit(BCH_FS_rw, &c->flags);
if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
- test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags) &&
+ test_bit(BCH_FS_started, &c->flags) &&
+ test_bit(BCH_FS_clean_shutdown, &c->flags) &&
!c->opts.norecovery) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.state.nr);
+ BUG_ON(c->btree_write_buffer.inc.keys.nr);
+ BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
+ } else {
+ bch_verbose(c, "done going read-only, filesystem not clean");
}
-
- clear_bit(BCH_FS_RW, &c->flags);
}
static void bch2_fs_read_only_work(struct work_struct *work)
@@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c)
bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
- bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
bch2_journal_halt(&c->journal);
bch2_fs_read_only_async(c);
@@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
- struct bch_dev *ca;
- unsigned i;
int ret;
- if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+ if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
return -BCH_ERR_erofs_unfixed_errors;
}
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
return 0;
- if (c->opts.norecovery)
- return -BCH_ERR_erofs_norecovery;
-
- /*
- * nochanges is used for fsck -n mode - we have to allow going rw
- * during recovery for that to work:
- */
- if (c->opts.nochanges && (!early || c->opts.read_only))
- return -BCH_ERR_erofs_nochanges;
-
bch_info(c, "going read-write");
ret = bch2_sb_members_v2_init(c);
@@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
@@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
*/
set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- set_bit(BCH_FS_RW, &c->flags);
- set_bit(BCH_FS_WAS_RW, &c->flags);
+ set_bit(BCH_FS_rw, &c->flags);
+ set_bit(BCH_FS_was_rw, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
- for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
@@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_do_pending_node_rewrites(c);
return 0;
err:
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
bch2_fs_read_only(c);
else
__bch2_fs_read_only(c);
@@ -472,6 +497,12 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
+ if (c->opts.norecovery)
+ return -BCH_ERR_erofs_norecovery;
+
+ if (c->opts.nochanges)
+ return -BCH_ERR_erofs_nochanges;
+
return __bch2_fs_read_write(c, false);
}
@@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj)
void __bch2_fs_stop(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
bch_verbose(c, "shutting down");
- set_bit(BCH_FS_STOPPING, &c->flags);
+ set_bit(BCH_FS_stopping, &c->flags);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
@@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_read_only(c);
up_write(&c->state_lock);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
@@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ bch2_ro_ref_put(c);
+ wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
+
kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
@@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->read_only_work);
@@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c)
static int bch2_fs_online(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
lockdep_assert_held(&bch_fs_list_lock);
@@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
@@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c)
down_write(&c->state_lock);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
@@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
+ c->stdio = (void *)(unsigned long) opts.stdio;
+
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
@@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+ refcount_set(&c->ro_ref, 1);
+ init_waitqueue_head(&c->ro_ref_wait);
+ sema_init(&c->online_fsck_mutex, 1);
+
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
@@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -847,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c)
int bch2_fs_start(struct bch_fs *c)
{
- struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
- unsigned i;
int ret;
print_mount_opts(c);
down_write(&c->state_lock);
- BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+ BUG_ON(test_bit(BCH_FS_started, &c->flags));
mutex_lock(&c->sb_lock);
@@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- for_each_online_member(ca, c, i)
- bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+ for_each_online_member(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- set_bit(BCH_FS_STARTED, &c->flags);
+ set_bit(BCH_FS_started, &c->flags);
- if (c->opts.read_only || c->opts.nochanges) {
+ if (c->opts.read_only) {
bch2_fs_read_only(c);
} else {
- ret = !test_bit(BCH_FS_RW, &c->flags)
+ ret = !test_bit(BCH_FS_rw, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
if (ret)
@@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c)
}
ret = 0;
-out:
+err:
+ if (ret)
+ bch_err_msg(c, ret, "starting filesystem");
+ else
+ bch_verbose(c, "done starting filesystem");
up_write(&c->state_lock);
return ret;
-err:
- bch_err_msg(c, ret, "starting filesystem");
- goto out;
}
static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
@@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return 0;
}
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+ struct bch_sb_handle *sb)
{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+ if (fs == sb)
+ return 0;
- if (!uuid_equal(&fs->uuid, &sb->uuid))
+ if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, sb->dev_idx))
+ if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
- if (fs->block_size != sb->block_size)
+ if (fs->sb->block_size != sb->sb->block_size)
return -BCH_ERR_mismatched_block_size;
+ if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
+ le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
+ return 0;
+
+ if (fs->sb->seq == sb->sb->seq &&
+ fs->sb->write_time != sb->sb->write_time) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+ prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));;
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Not using older sb");
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
+ struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+ u64 seq_from_fs = le64_to_cpu(m.seq);
+ u64 seq_from_member = le64_to_cpu(sb->sb->seq);
+
+ if (seq_from_fs && seq_from_fs < seq_from_member) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_str(&buf, "believes seq of ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " to be %llu, but ", seq_from_fs);
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " has %llu\n", seq_from_member);
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
return 0;
}
@@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, ca->disk_sb.bdev);
+
if (c->sb.nr_devices == 1)
- snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
- snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+
+ printbuf_exit(&name);
rebalance_wakeup(c);
return 0;
@@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct bch_dev *ca2;
- int i, nr_rw = 0, required;
+ int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
@@ -1320,7 +1422,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to write to? */
- for_each_member_device(ca2, c, i)
+ for_each_member_device(c, ca2)
if (ca2 != ca)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
@@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
BTREE_TRIGGER_NORUN, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_NORUN, NULL);
- if (ret)
- bch_err_msg(c, ret, "removing dev alloc info");
-
+ bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1497,40 +1597,35 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- if (ret) {
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "dropping data");
+ if (ret)
goto err;
- }
ret = bch2_dev_remove_alloc(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "deleting alloc info");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- if (ret) {
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "flushing journal");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err(ca, "journal error");
+ bch_err(ca, "journal error");
+ if (ret)
goto err;
- }
ret = bch2_replicas_gc2(c);
- if (ret) {
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "in replicas_gc2()");
+ if (ret)
goto err;
- }
data = bch2_dev_has_data(c, ca);
if (data) {
struct printbuf data_has = PRINTBUF;
- prt_bitflags(&data_has, bch2_data_types, data);
+ prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;
@@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
int ret;
ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- bch_err_msg(c, ret, "reading super");
+ bch_err_msg(c, ret, "reading super");
+ if (ret)
goto err;
- }
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
@@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
ret = bch2_dev_may_add(sb.sb, c);
- if (ret) {
- bch_err_fn(c, ret);
+ if (ret)
goto err;
- }
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
@@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- bch_err_msg(c, ret, "allocating journal");
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
goto err;
- }
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
ret = bch2_sb_from_fs(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "setting up new superblock");
+ bch_err_msg(c, ret, "setting up new superblock");
+ if (ret)
goto err_unlock;
- }
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
@@ -1681,10 +1771,9 @@ have_slot:
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
- if (ret) {
- bch_err_msg(c, ret, "creating new label");
+ bch_err_msg(c, ret, "creating new label");
+ if (ret)
goto err_unlock;
- }
}
bch2_write_super(c);
@@ -1693,16 +1782,14 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "marking new superblock");
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
goto err_late;
- }
ret = bch2_fs_freespace_init(c);
- if (ret) {
- bch_err_msg(ca, ret, "initializing free space");
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
goto err_late;
- }
ca->new_fs_bucket_idx = 0;
@@ -1721,6 +1808,7 @@ err:
bch2_free_super(&sb);
printbuf_exit(&label);
printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
return ret;
err_late:
up_write(&c->state_lock);
@@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online", path);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ bch_err_msg(c, ret, "bringing %s online", path);
+ if (ret)
goto err;
- }
ret = bch2_dev_attach_bdev(c, &sb);
if (ret)
@@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
ca = bch_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ if (ret)
goto err;
- }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
@@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- if (ret) {
- bch_err_msg(ca, ret, "resizing buckets");
+ bch_err_msg(ca, ret, "resizing buckets");
+ if (ret)
goto err;
- }
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
@@ -1879,28 +1964,30 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
- if (!strcmp(name, ca->name))
- goto found;
- ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-found:
+ for_each_member_device_rcu(c, ca, NULL)
+ if (!strcmp(name, ca->name)) {
+ rcu_read_unlock();
+ return ca;
+ }
rcu_read_unlock();
-
- return ca;
+ return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* Filesystem open: */
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+ return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+ cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- struct bch_sb_handle *sb, *best = NULL;
+ struct bch_sb_handle *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
@@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
BUG_ON(darray_push(&sbs, sb));
}
+ if (opts.nochanges && !opts.read_only) {
+ ret = -BCH_ERR_erofs_nochanges;
+ goto err_print;
+ }
+
darray_for_each(sbs, sb)
- if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
darray_for_each_reverse(sbs, sb) {
- if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb->bdev);
+ ret = bch2_dev_in_fs(best, sb);
+
+ if (ret == -BCH_ERR_device_has_been_removed ||
+ ret == -BCH_ERR_device_splitbrain) {
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
best -= best > sb;
+ ret = 0;
continue;
}
- ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
}
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index bf762df18012..dada09331d2e 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -8,6 +8,8 @@
#include <linux/math64.h>
+extern const char * const bch2_fs_flag_strs[];
+
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
@@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
*/
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags) &&
- !test_bit(BCH_FS_WAS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags) &&
+ !test_bit(BCH_FS_was_rw, &c->flags))
bch2_fs_read_write_early(c);
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index b2119686e2e1..0e5a14fc8e7f 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -23,7 +23,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_BKEY_PTRS_MAX];
+ u8 data[BCH_BKEY_PTRS_MAX];
};
struct bch_member_cpu {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index f3cb7115b530..cee80c47feea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -145,6 +146,7 @@ rw_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
+read_attribute(flags);
read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
@@ -246,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
+ ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@@ -255,19 +257,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
- u64 nr_uncompressed_extents = 0,
- nr_compressed_extents = 0,
- nr_incompressible_extents = 0,
- uncompressed_sectors = 0,
- incompressible_sectors = 0,
- compressed_sectors_compressed = 0,
- compressed_sectors_uncompressed = 0;
+ struct compression_type_stats {
+ u64 nr_extents;
+ u64 sectors_compressed;
+ u64 sectors_uncompressed;
+ } s[BCH_COMPRESSION_TYPE_NR];
+ u64 compressed_incompressible = 0;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ memset(s, 0, sizeof(s));
+
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
trans = bch2_trans_get(c);
@@ -276,39 +277,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (!btree_type_has_ptrs(id))
continue;
- ret = for_each_btree_key2(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = for_each_btree_key(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bool compressed = false, uncompressed = false, incompressible = false;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- switch (p.crc.compression_type) {
- case BCH_COMPRESSION_TYPE_none:
- uncompressed = true;
- uncompressed_sectors += k.k->size;
- break;
- case BCH_COMPRESSION_TYPE_incompressible:
- incompressible = true;
- incompressible_sectors += k.k->size;
- break;
- default:
- compressed_sectors_compressed +=
- p.crc.compressed_size;
- compressed_sectors_uncompressed +=
- p.crc.uncompressed_size;
- compressed = true;
- break;
+ bool compressed = false, incompressible = false;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry) {
+ incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
+ compressed |= crc_is_compressed(crc);
+
+ if (crc_is_compressed(crc)) {
+ s[crc.compression_type].nr_extents++;
+ s[crc.compression_type].sectors_compressed += crc.compressed_size;
+ s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
}
}
- if (incompressible)
- nr_incompressible_extents++;
- else if (uncompressed)
- nr_uncompressed_extents++;
- else if (compressed)
- nr_compressed_extents++;
+ compressed_incompressible += compressed && incompressible;
+
+ if (!compressed) {
+ unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
+
+ s[t].nr_extents++;
+ s[t].sectors_compressed += k.k->size;
+ s[t].sectors_uncompressed += k.k->size;
+ }
0;
}));
}
@@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- prt_printf(out, "uncompressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, uncompressed_sectors << 9);
- prt_printf(out, "\n");
+ prt_str(out, "type");
+ printbuf_tabstop_push(out, 12);
+ prt_tab(out);
- prt_printf(out, "compressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
- prt_printf(out, " compressed size: ");
- prt_human_readable_u64(out, compressed_sectors_compressed << 9);
- prt_printf(out, "\n");
- prt_printf(out, " uncompressed size: ");
- prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
- prt_printf(out, "\n");
+ prt_str(out, "compressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "uncompressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "average extent size");
+ printbuf_tabstop_push(out, 24);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
+ bch2_prt_compression_type(out, i);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, s[i].sectors_compressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].nr_extents
+ ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
+ : 0);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ if (compressed_incompressible) {
+ prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
+ prt_newline(out);
+ }
- prt_printf(out, "incompressible:\n");
- prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, incompressible_sectors << 9);
- prt_printf(out, "\n");
return 0;
}
@@ -370,6 +384,9 @@ SHOW(bch2_fs)
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+ if (attr == &sysfs_flags)
+ prt_bitflags(out, bch2_fs_flag_strs, c->flags);
+
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
if (attr == &sysfs_btree_write_stats)
@@ -483,12 +500,12 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
/* Debugging: */
- if (!test_bit(BCH_FS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -620,6 +637,7 @@ STORE(bch2_fs_internal)
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
+ &sysfs_flags,
&sysfs_journal_debug,
&sysfs_btree_updates,
&sysfs_btree_cache,
@@ -708,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v)
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@@ -786,32 +806,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- prt_str(out, bch2_data_types[i]);
- prt_tab(out);
- prt_u64(out, stats.d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_str(out, "ec");
- prt_tab(out);
- prt_u64(out, stats.buckets_ec);
- prt_tab_rjust(out);
- prt_newline(out);
+ bch2_dev_usage_to_text(out, &stats);
prt_newline(out);
@@ -891,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
- bch2_data_types[i],
+ bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@@ -916,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
- prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 2fc9e60c754b..b3fe9fc57747 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -107,9 +107,6 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != --i);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr * 2);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i >= nr * 2)
- break;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
- i++;
- 0;
- }));
- if (ret < 0) {
- bch_err_msg(c, ret, "error iterating forwards by slots");
- goto err;
- }
- ret = 0;
-err:
- bch2_trans_put(trans);
+ i++;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
return ret;
}
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- }));
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards by slots");
- if (ret)
- goto err;
- ret = 0;
-err:
- bch2_trans_put(trans);
- return 0;
+ return ret;
}
/*
@@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr)
static int seq_insert(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_cookie insert;
bkey_cookie_init(&insert.k_i);
@@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
- for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
@@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
static int seq_overwrite(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
new file mode 100644
index 000000000000..b1c867aa2b58
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "printbuf.h"
+#include "thread_with_file.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+
+void bch2_thread_with_file_exit(struct thread_with_file *thr)
+{
+ if (thr->task) {
+ kthread_stop(thr->task);
+ put_task_struct(thr->task);
+ }
+}
+
+int bch2_run_thread_with_file(struct thread_with_file *thr,
+ const struct file_operations *fops,
+ int (*fn)(void *))
+{
+ struct file *file = NULL;
+ int ret, fd = -1;
+ unsigned fd_flags = O_CLOEXEC;
+
+ if (fops->read && fops->write)
+ fd_flags |= O_RDWR;
+ else if (fops->read)
+ fd_flags |= O_RDONLY;
+ else if (fops->write)
+ fd_flags |= O_WRONLY;
+
+ char name[TASK_COMM_LEN];
+ get_task_comm(name, current);
+
+ thr->ret = 0;
+ thr->task = kthread_create(fn, thr, "%s", name);
+ ret = PTR_ERR_OR_ZERO(thr->task);
+ if (ret)
+ return ret;
+
+ ret = get_unused_fd_flags(fd_flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile(name, fops, thr, fd_flags);
+ ret = PTR_ERR_OR_ZERO(file);
+ if (ret)
+ goto err;
+
+ fd_install(fd, file);
+ get_task_struct(thr->task);
+ wake_up_process(thr->task);
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (thr->task)
+ kthread_stop(thr->task);
+ return ret;
+}
+
+static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+{
+ return thr->stdio.output_buf.pos ||
+ thr->output2.nr ||
+ thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ size_t copied = 0, b;
+ int ret = 0;
+
+ if ((file->f_flags & O_NONBLOCK) &&
+ !thread_with_stdio_has_output(thr))
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(thr->stdio.output_wait,
+ thread_with_stdio_has_output(thr));
+ if (ret)
+ return ret;
+
+ if (thr->thr.done)
+ return 0;
+
+ while (len) {
+ ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
+ if (ret)
+ break;
+
+ spin_lock_irq(&thr->stdio.output_lock);
+ b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+
+ memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
+ memmove(thr->stdio.output_buf.buf,
+ thr->stdio.output_buf.buf + b,
+ thr->stdio.output_buf.pos - b);
+
+ thr->output2.nr += b;
+ thr->stdio.output_buf.pos -= b;
+ spin_unlock_irq(&thr->stdio.output_lock);
+
+ b = min(len, thr->output2.nr);
+ if (!b)
+ break;
+
+ b -= copy_to_user(buf, thr->output2.data, b);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ copied += b;
+ buf += b;
+ len -= b;
+
+ memmove(thr->output2.data,
+ thr->output2.data + b,
+ thr->output2.nr - b);
+ thr->output2.nr -= b;
+ }
+
+ return copied ?: ret;
+}
+
+static int thread_with_stdio_release(struct inode *inode, struct file *file)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ bch2_thread_with_file_exit(&thr->thr);
+ printbuf_exit(&thr->stdio.input_buf);
+ printbuf_exit(&thr->stdio.output_buf);
+ darray_exit(&thr->output2);
+ thr->exit(thr);
+ return 0;
+}
+
+#define WRITE_BUFFER 4096
+
+static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
+{
+ return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct printbuf *buf = &thr->stdio.input_buf;
+ size_t copied = 0;
+ ssize_t ret = 0;
+
+ while (len) {
+ if (thr->thr.done) {
+ ret = -EPIPE;
+ break;
+ }
+
+ size_t b = len - fault_in_readable(ubuf, len);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock(&thr->stdio.input_lock);
+ if (buf->pos < WRITE_BUFFER)
+ bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
+ b = min(len, printbuf_remaining_size(buf));
+
+ if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->pos += b;
+ }
+ spin_unlock(&thr->stdio.input_lock);
+
+ if (b) {
+ wake_up(&thr->stdio.input_wait);
+ } else {
+ if ((file->f_flags & O_NONBLOCK)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ ret = wait_event_interruptible(thr->stdio.input_wait,
+ thread_with_stdio_has_input_space(thr));
+ if (ret)
+ break;
+ }
+ }
+
+ return copied ?: ret;
+}
+
+static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output_wait, wait);
+ poll_wait(file, &thr->stdio.input_wait, wait);
+
+ __poll_t mask = 0;
+
+ if (thread_with_stdio_has_output(thr))
+ mask |= EPOLLIN;
+ if (thread_with_stdio_has_input_space(thr))
+ mask |= EPOLLOUT;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static const struct file_operations thread_with_stdio_fops = {
+ .release = thread_with_stdio_release,
+ .read = thread_with_stdio_read,
+ .write = thread_with_stdio_write,
+ .poll = thread_with_stdio_poll,
+ .llseek = no_llseek,
+};
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *))
+{
+ thr->stdio.input_buf = PRINTBUF;
+ thr->stdio.input_buf.atomic++;
+ spin_lock_init(&thr->stdio.input_lock);
+ init_waitqueue_head(&thr->stdio.input_wait);
+
+ thr->stdio.output_buf = PRINTBUF;
+ thr->stdio.output_buf.atomic++;
+ spin_lock_init(&thr->stdio.output_lock);
+ init_waitqueue_head(&thr->stdio.output_wait);
+
+ darray_init(&thr->output2);
+ thr->exit = exit;
+
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+}
+
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ if (n)
+ ret = min(ret, n + 1 - stdio->input_buf.buf);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
new file mode 100644
index 000000000000..05879c5048c8
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_H
+#define _BCACHEFS_THREAD_WITH_FILE_H
+
+#include "thread_with_file_types.h"
+
+struct task_struct;
+
+struct thread_with_file {
+ struct task_struct *task;
+ int ret;
+ bool done;
+};
+
+void bch2_thread_with_file_exit(struct thread_with_file *);
+int bch2_run_thread_with_file(struct thread_with_file *,
+ const struct file_operations *,
+ int (*fn)(void *));
+
+struct thread_with_stdio {
+ struct thread_with_file thr;
+ struct stdio_redirect stdio;
+ DARRAY(char) output2;
+ void (*exit)(struct thread_with_stdio *);
+};
+
+static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input_wait);
+ wake_up(&thr->stdio.output_wait);
+}
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *));
+int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
new file mode 100644
index 000000000000..90b5e645e98c
--- /dev/null
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+
+struct stdio_redirect {
+ spinlock_t output_lock;
+ wait_queue_head_t output_wait;
+ struct printbuf output_buf;
+
+ spinlock_t input_lock;
+ wait_queue_head_t input_wait;
+ struct printbuf input_buf;
+ bool done;
+};
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index fd49b63562c3..293b90d704fb 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos,
TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);
-DECLARE_EVENT_CLASS(bkey,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k),
+DECLARE_EVENT_CLASS(fs_str,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str),
TP_STRUCT__entry(
- __string(k, k )
+ __field(dev_t, dev )
+ __string(str, str )
),
TP_fast_assign(
- __assign_str(k, k);
+ __entry->dev = c->dev;
+ __assign_str(str, str);
),
- TP_printk("%s", __get_str(k))
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
-DECLARE_EVENT_CLASS(btree_node,
+DECLARE_EVENT_CLASS(trans_str,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %pS %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(trans_str_nocaller,
+ TP_PROTO(struct btree_trans *trans, const char *str),
+ TP_ARGS(trans, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(btree_node_nofs,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
@@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node,
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %s %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs,
TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
+DECLARE_EVENT_CLASS(btree_trans,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
+);
+
DECLARE_EVENT_CLASS(bio,
TP_PROTO(struct bio *bio),
TP_ARGS(bio),
@@ -183,9 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@@ -286,36 +381,36 @@ TRACE_EVENT(btree_cache_scan,
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_cache_reap,
+DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
/* Btree */
DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_node_write,
@@ -339,13 +434,13 @@ TRACE_EVENT(btree_node_write,
);
DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_reserve_get_fail,
@@ -377,28 +472,28 @@ TRACE_EVENT(btree_reserve_get_fail,
);
DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_path_relock_fail,
@@ -433,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@@ -717,44 +812,32 @@ TRACE_EVENT(bucket_evacuate,
__entry->dev_idx, __entry->bucket)
);
-DEFINE_EVENT(bkey, move_extent,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_read,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_write,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_finish,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(move_extent_fail,
- TP_PROTO(struct bch_fs *c, const char *msg),
- TP_ARGS(c, msg),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(msg, msg )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(msg, msg);
- ),
-
- TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_start_fail,
+DEFINE_EVENT(fs_str, move_extent_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -930,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -987,10 +1070,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ unsigned long caller_ip,
+ const char *paths),
+ TP_ARGS(trans, caller_ip, paths)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
@@ -1036,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-struct get_locks_fail;
-
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1056,8 +1138,6 @@ TRACE_EVENT(trans_restart_upgrade,
__field(u8, level )
__field(u32, path_seq )
__field(u32, node_seq )
- __field(u32, path_alloc_seq )
- __field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
@@ -1070,12 +1150,10 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->level = f->l;
__entry->path_seq = path->l[f->l].lock_seq;
__entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- __entry->path_alloc_seq = path->alloc_seq;
- __entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_id_str(__entry->btree_id),
@@ -1086,16 +1164,12 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->new_locks_want,
__entry->level,
__entry->path_seq,
- __entry->node_seq,
- __entry->path_alloc_seq,
- __entry->downgrade_seq)
+ __entry->node_seq)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
@@ -1160,10 +1234,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ const char *cycle),
+ TP_ARGS(trans, cycle)
);
DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
@@ -1252,22 +1326,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
+ struct btree_path *path,
+ unsigned old_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
+ __field(unsigned, old_locks_want )
+ __field(unsigned, new_locks_want )
+ __field(unsigned, btree )
+ TRACE_BPOS_entries(pos)
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = path->locks_want;
+ __entry->btree = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
),
- TP_printk("%s %pS",
+ TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
__entry->trans_fn,
- (void *) __entry->caller_ip)
+ (void *) __entry->caller_ip,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ bch2_btree_id_str(__entry->btree),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
);
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
@@ -1298,21 +1387,48 @@ TRACE_EVENT(write_buffer_flush,
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
+TRACE_EVENT(write_buffer_flush_sync,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
- TP_ARGS(trans, nr, size),
+ TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+ TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, size )
+ __field(size_t, slowpath )
+ __field(size_t, total )
),
TP_fast_assign(
- __entry->nr = nr;
- __entry->size = size;
+ __entry->slowpath = slowpath;
+ __entry->total = total;
),
- TP_printk("%zu/%zu", __entry->nr, __entry->size)
+ TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
+);
+
+DEFINE_EVENT(fs_str, rebalance_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, data_update,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#endif /* _TRACE_BCACHEFS_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 84b142fcc3df..56b815fd9fc6 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
console_unlock();
}
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+ gfp_t gfp)
{
#ifdef CONFIG_STACKTRACE
unsigned nr_entries = 0;
- int ret = 0;
stack->nr = 0;
- ret = darray_make_room(stack, 32);
+ int ret = darray_make_room_gfp(stack, 32, gfp);
if (ret)
return ret;
@@ -282,7 +287,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
return -1;
do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
} while (nr_entries == stack->size &&
!(ret = darray_make_room(stack, stack->size * 2)));
@@ -297,24 +302,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
{
- unsigned long *i;
-
darray_for_each(*stack, i) {
prt_printf(out, "[<0>] %pB", (void *) *i);
prt_newline(out);
}
}
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task);
+ int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
bch2_prt_backtrace(out, &stack);
darray_exit(&stack);
return ret;
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ strim(buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_u64(out, sec);
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -359,6 +414,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
@@ -372,29 +428,33 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
}
}
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ for (struct bch2_time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
- struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
+ __bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
- WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
- "time_stats: min_duration = %llu, min_freq = %llu",
- stats->min_duration, stats->min_freq);
+ WARN_ONCE(!stats->duration_stats_weighted.weight ||
+ !stats->freq_stats_weighted.weight,
+ "uninitialized time_stats");
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
@@ -423,40 +483,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
-#endif
-
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@@ -467,26 +493,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_u64(out, sec);
-}
-#endif
-
-#define TABSTOP_SIZE 12
-
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
@@ -495,12 +501,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out);
}
+#define TABSTOP_SIZE 12
+
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
/*
* avoid divide by zero
*/
@@ -546,6 +564,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
@@ -603,6 +622,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q;
}
}
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
@@ -1157,3 +1179,39 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
return ret;
}
+
+void bch2_darray_str_exit(darray_str *d)
+{
+ darray_for_each(*d, i)
+ kfree(*i);
+ darray_exit(d);
+}
+
+int bch2_split_devs(const char *_dev_name, darray_str *ret)
+{
+ darray_init(ret);
+
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return -ENOMEM;
+
+ while ((s = strsep(&dev_name, ":"))) {
+ char *p = kstrdup(s, GFP_KERNEL);
+ if (!p)
+ goto err;
+
+ if (darray_push(ret, p)) {
+ kfree(p);
+ goto err;
+ }
+ }
+
+ kfree(orig);
+ return 0;
+err:
+ bch2_darray_str_exit(ret);
+ kfree(orig);
+ return -ENOMEM;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b701f7fe0784..b414736d59a5 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -342,14 +342,24 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
+
+static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
+{
+#ifdef __KERNEL__
+ prt_printf(out, "%pg", bdev);
+#else
+ prt_str(out, bdev->name);
+#endif
+}
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
@@ -374,8 +384,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
- u64 max_duration;
u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
@@ -390,15 +401,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ if (v != !!*start) {
+ if (!v) {
+ bch2_time_stats_update(stats, *start);
+ *start = 0;
+ } else {
+ *start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ bool ret = v && !*start;
+ *start = v;
+ return ret;
+}
+#endif
+
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
@@ -831,4 +866,14 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static inline bool qstr_eq(const struct qstr l, const struct qstr r)
+{
+ return l.len == r.len && !memcmp(l.name, r.name, l.len);
+}
+
+void bch2_darray_str_exit(darray_str *);
+int bch2_split_devs(const char *, darray_str *);
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index a6561b4b36a6..2ad338e282da 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -48,14 +48,14 @@
((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_for_each(_s, _i) \
- for (_i = (_s)->start; \
+ for (typeof(&(_s)->start[0]) _i = (_s)->start; \
_i < vstruct_last(_s); \
_i = vstruct_next(_i))
-#define vstruct_for_each_safe(_s, _i, _t) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
- _i = _t)
+#define vstruct_for_each_safe(_s, _i) \
+ for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
+ _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
+ _i = _next)
#define vstruct_idx(_s, _idx) \
((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 5a1858fb9879..9c0d2316031b 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,8 +590,9 @@ err:
mutex_unlock(&inode->ei_update_lock);
if (value &&
- (opt_id == Opt_background_compression ||
- opt_id == Opt_background_target))
+ (opt_id == Opt_background_target ||
+ opt_id == Opt_background_compression ||
+ (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
new file mode 100644
index 000000000000..e9f810539552
--- /dev/null
+++ b/fs/bcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a93d76df8ed8..2b4dda047450 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -671,9 +671,6 @@ static struct dentry *befs_get_parent(struct dentry *child)
parent = befs_iget(child->d_sb,
(unsigned long)befs_ino->i_parent.start);
- if (IS_ERR(parent))
- return ERR_CAST(parent);
-
return d_obtain_alias(parent);
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index fbc4ae80a4b2..c375e22c4c0c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -275,11 +275,6 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
dprintf("name=%s, namelen=%d\n", name, namelen);
- if (!namelen)
- return -ENOENT;
- if (namelen > BFS_NAMELEN)
- return -ENAMETOOLONG;
-
sblock = BFS_I(dir)->i_sblock;
eblock = BFS_I(dir)->i_eblock;
for (block = sblock; block <= eblock; block++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 193168214eeb..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 93cc92974dee..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396aba92c57..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 809b11472a80..1eb93d3962aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
u64 root_flags;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -4563,15 +4565,17 @@ out_end_trans:
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 41b479861b3c..dfed9dd9c2d7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1131d5a29d61..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a01807cbd4d4..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1098,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..2d7519a6ce72 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 93511d54abf8..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- folio_start_writeback(folio);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 896acfda1789..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc)
btrfs_info_to_ctx(fs_info, &old_ctx);
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c32497311d2..d67785be2c77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
map = btrfs_find_chunk_map(fs_info, logical, length);
if (unlikely(!map)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
@@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
}
if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
logical, logical + length, map->start,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 36cf1f0e338e..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -354,18 +354,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 12066afc235c..168af9d000d1 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
kvfree(zones);
- switch (bdev_zoned_model(bdev)) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(bdev)) {
model = "host-managed zoned";
emulated = "";
- break;
- case BLK_ZONED_HA:
- model = "host-aware zoned";
- emulated = "";
- break;
- case BLK_ZONED_NONE:
+ } else {
model = "regular";
emulated = "emulated ";
- break;
- default:
- /* Just in case */
- btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
- bdev_zoned_model(bdev),
- rcu_str_deref(device->name));
- ret = -EOPNOTSUPP;
- goto out_free_zone_info;
}
btrfs_info_in_rcu(fs_info,
@@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
out:
kvfree(zones);
-out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
-
return ret;
}
@@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
- if (device->bdev &&
- bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ if (device->bdev && bdev_is_zoned(device->bdev)) {
btrfs_err(fs_info,
"zoned: mode not enabled but zoned device found: %pg",
device->bdev);
@@ -2072,6 +2055,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2084,7 +2068,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2104,20 +2087,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2125,8 +2105,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2134,6 +2112,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index f24a5ffb7807..f573bda496fb 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -320,7 +320,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
}
/* Do not allow Host Managed zoned device. */
- return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+ return !bdev_is_zoned(bdev);
}
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 8df715640a48..c5a070550ee3 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -2,7 +2,7 @@
config CACHEFILES
tristate "Filesystem caching on files"
- depends on FSCACHE && BLOCK
+ depends on NETFS_SUPPORT && FSCACHE && BLOCK
help
This permits use of a mounted filesystem as a cache for other
filesystems - primarily networking filesystems - thus allowing fast
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
index 18de8a876b02..1715d5ca2b2d 100644
--- a/fs/cachefiles/error_inject.c
+++ b/fs/cachefiles/error_inject.c
@@ -19,7 +19,6 @@ static struct ctl_table cachefiles_sysctls[] = {
.mode = 0644,
.proc_handler = proc_douintvec,
},
- {}
};
int __init cachefiles_register_error_injection(void)
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 4a87c9d714a9..d33169f0018b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
extern int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet);
extern int __cachefiles_write(struct cachefiles_object *object,
struct file *file,
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 5857241c5918..1d685357e67f 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet)
{
struct cachefiles_cache *cache = object->volume->cache;
loff_t start = *_start, pos;
- size_t len = *_len, down;
+ size_t len = *_len;
int ret;
/* Round to DIO size */
- down = start - round_down(start, PAGE_SIZE);
- *_start = start - down;
- *_len = round_up(down + len, PAGE_SIZE);
+ start = round_down(*_start, PAGE_SIZE);
+ if (start != *_start || *_len > upper_len) {
+ /* Probably asked to cache a streaming write written into the
+ * pagecache when the cookie was temporarily out of service to
+ * culling.
+ */
+ fscache_count_dio_misfit();
+ return -ENOBUFS;
+ }
+
+ *_len = round_up(len, PAGE_SIZE);
/* We need to work out whether there's sufficient disk space to perform
* the write - but we can skip that check if we have space already
@@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_DATA);
+ pos = vfs_llseek(file, start, SEEK_DATA);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
if (pos == -ENXIO)
goto check_space; /* Unallocated tail */
@@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
goto check_space; /* Unallocated region */
/* We have a block that's at least partially filled - if we're low on
@@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_HOLE);
+ pos = vfs_llseek(file, start, SEEK_HOLE);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), pos,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
return 0; /* Fully allocated */
/* Partially allocated, but insufficient space: cull. */
@@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
ret = cachefiles_inject_remove_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- *_start, *_len);
+ start, *_len);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
@@ -591,8 +599,8 @@ check_space:
}
static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+ loff_t *_start, size_t *_len, size_t upper_len,
+ loff_t i_size, bool no_space_allocated_yet)
{
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
@@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
cachefiles_begin_secure(cache, &saved_cred);
ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
- _start, _len,
+ _start, _len, upper_len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7bf7a5fcc045..7ade836beb58 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -305,6 +305,8 @@ try_again:
/* do the multiway lock magic */
trap = lock_rename(cache->graveyard, dir);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
/* do some checks before getting the grave dentry */
if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) {
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index b8fbbb1961bb..4ba42f1fa3b4 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
return -ENOBUFS;
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
return ret;
@@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
struct fscache_volume *volume = object->volume->vcookie;
size_t volume_key_size, cookie_key_size, data_len;
+ if (!object->ondemand)
+ return 0;
+
/*
* CacheFiles will firstly check the cache file under the root cache
* directory. If the coherency check failed, it will fallback to
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 94df854147d3..7249d70e1a43 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -7,6 +7,7 @@ config CEPH_FS
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
default n
help
Choose Y or M here to include support for mounting the
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 13af429ab030..1340d77124ae 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
ceph_put_snap_context(snapc);
}
- folio_wait_fscache(folio);
-}
-
-static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct inode *inode = folio->mapping->host;
- struct ceph_client *cl = ceph_inode_to_client(inode);
-
- doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
-
- if (folio_test_private(folio))
- return false;
-
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- ceph_fscache_note_page_release(inode);
- return true;
+ netfs_invalidate_folio(folio, offset, length);
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
u64 len = subreq->len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
+ int extent_cnt;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
- NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
+ CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
if (sparse) {
- err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
if (err)
goto out;
}
@@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
- .begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
@@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .release_folio = ceph_release_folio,
+ .release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
};
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index dc502daac49a..20efac020394 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
}
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
+ return netfs_unpin_writeback(inode, wbc);
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- struct ceph_inode_info *ci = ceph_inode(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
-}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-}
+#define ceph_fscache_dirty_folio netfs_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- fscache_note_page_release(ceph_fscache_cookie(ci));
-}
#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
@@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
- struct writeback_control *wbc)
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
+ return 0;
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- return filemap_dirty_folio(mapping, folio);
-}
+#define ceph_fscache_dirty_folio filemap_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return -ENOBUFS;
-}
-
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-}
#endif /* CONFIG_CEPH_FSCACHE */
#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2c0b8dc3dd0d..9c02f328c966 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4887,13 +4887,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct ceph_client *cl;
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4903,14 +4905,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 91709934c8b1..0e9f56eaba1e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
- * d_child when we initially get results back from the MDS, and
+ * d_children when we initially get results back from the MDS, and
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
@@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control {
unsigned long dir_lease_ttl;
};
+static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *);
+static int __dentry_lease_check(const struct dentry *);
+
static unsigned long
__dentry_leases_walk(struct ceph_mds_client *mdsc,
- struct ceph_lease_walk_control *lwc,
- int (*check)(struct dentry*, void*))
+ struct ceph_lease_walk_control *lwc)
{
struct ceph_dentry_info *di, *tmp;
struct dentry *dentry, *last = NULL;
@@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc,
goto next;
}
- ret = check(dentry, lwc);
+ if (lwc->dir_lease)
+ ret = __dir_lease_check(dentry, lwc);
+ else
+ ret = __dentry_lease_check(dentry);
if (ret & TOUCH) {
/* move it into tail of dir lease list */
__dentry_dir_lease_touch(mdsc, di);
@@ -1681,7 +1686,7 @@ next:
return freed;
}
-static int __dentry_lease_check(struct dentry *dentry, void *arg)
+static int __dentry_lease_check(const struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret;
@@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg)
return DELETE;
}
-static int __dir_lease_check(struct dentry *dentry, void *arg)
+static int __dir_lease_check(const struct dentry *dentry,
+ struct ceph_lease_walk_control *lwc)
{
- struct ceph_lease_walk_control *lwc = arg;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret = __dir_lease_try_check(dentry);
@@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = false;
lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
- freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ freed = __dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more invalid leases */
return -EAGAIN;
@@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = true;
lwc.expire_dir_lease = freed < count;
lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
- freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ freed +=__dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more to check */
return -EAGAIN;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 726af69d4d62..a79f163ae4ed 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (IS_ERR(inode))
- return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d380d9dad0e0..abe8028d95bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
struct ceph_osd_req_op *op;
u64 read_off = off;
u64 read_len = len;
+ int extent_cnt;
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
@@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
@@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ssize_t len;
struct ceph_osd_req_op *op;
int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+ int extent_cnt;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0679240f06db..0c25d326afc4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
spin_lock_init(&ci->i_ceph_lock);
@@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d95eb525519a..548d1de379f3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end)
* session message, specialization for CEPH_SESSION_REQUEST_OPEN
* to include additional client metadata fields.
*/
-static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+static struct ceph_msg *
+create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = METRIC_BYTES(count);
extra_bytes += 2 + 4 + 4 + size;
+ /* flags, mds auth caps and oldest_client_tid */
+ extra_bytes += 4 + 4 + 8;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
@@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
end = p + msg->front.iov_len;
h = p;
- h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
/*
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v4
+ * ClientSession messages with metadata are v7
*/
- msg->hdr.version = cpu_to_le16(4);
+ msg->hdr.version = cpu_to_le16(7);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
return ERR_PTR(ret);
}
+ /* version == 5, flags */
+ ceph_encode_32(&p, 0);
+
+ /* version == 6, mds auth caps */
+ ceph_encode_32(&p, 0);
+
+ /* version == 7, oldest_client_tid */
+ ceph_encode_64(&p, mdsc->oldest_tid);
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_open_msg(mdsc, session->s_seq);
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,
+ session->s_seq);
if (IS_ERR(msg))
return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
@@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
doutc(cl, "to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -2128,7 +2142,7 @@ static bool drop_negative_children(struct dentry *dentry)
goto out;
spin_lock(&dentry->d_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &dentry->d_children, d_sib) {
if (d_really_is_positive(child)) {
all_negative = false;
break;
@@ -4128,12 +4142,12 @@ static void handle_session(struct ceph_mds_session *session,
pr_info_client(cl, "mds%d reconnect success\n",
session->s_mds);
+ session->s_features = features;
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
pr_notice_client(cl, "mds%d is already opened\n",
session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
renewed_caps(mdsc, session, 0);
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session->s_features))
@@ -5870,7 +5884,8 @@ static void mds_peer_reset(struct ceph_connection *con)
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+ if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9d36c3532de1..06ee397e0c3a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
}
/*
- * This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * This function walks through the snaprealm for an inode and set the
+ * realmp with the first snaprealm that has quotas set (max_files,
* max_bytes, or any, depending on the 'which_quota' argument). If the root is
- * reached, return the root ceph_snap_realm instead.
+ * reached, set the realmp with the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* this function will return -EAGAIN; otherwise, the snaprealms walk-through
* will be restarted.
*/
-static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode,
- enum quota_get_realm which_quota,
- bool retry)
+static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
+ enum quota_get_realm which_quota,
+ struct ceph_snap_realm **realmp, bool retry)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
@@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *in;
bool has_quota;
+ if (realmp)
+ *realmp = NULL;
if (ceph_snap(inode) != CEPH_NOSNAP)
- return NULL;
+ return 0;
restart:
realm = ceph_inode(inode)->i_snap_realm;
@@ -250,7 +251,7 @@ restart:
break;
ceph_put_snap_realm(mdsc, realm);
if (!retry)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
goto restart;
}
@@ -259,8 +260,11 @@ restart:
iput(in);
next = realm->parent;
- if (has_quota || !next)
- return realm;
+ if (has_quota || !next) {
+ if (realmp)
+ *realmp = realm;
+ return 0;
+ }
ceph_get_snap_realm(mdsc, next);
ceph_put_snap_realm(mdsc, realm);
@@ -269,7 +273,7 @@ restart:
if (realm)
ceph_put_snap_realm(mdsc, realm);
- return NULL;
+ return 0;
}
bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
@@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
+ int ret;
restart:
/*
@@ -286,9 +291,9 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
- new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
- if (PTR_ERR(new_realm) == -EAGAIN) {
+ get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true);
+ ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false);
+ if (ret == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
ceph_put_snap_realm(mdsc, old_realm);
@@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
- QUOTA_GET_MAX_BYTES, true);
+ get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
+ &realm, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe0f64a0acb2..b06e2bc86221 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -3,6 +3,7 @@
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
@@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+ int cnt = 0;
+
+ if (IS_ENCRYPTED(inode)) {
+ cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+ cnt = 0;
+ }
+
+ return cnt;
+}
+
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 3b8c4513118f..970f0022ec52 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,13 +93,13 @@ static void coda_flag_children(struct dentry *parent, int flag)
struct dentry *de;
spin_lock(&parent->d_lock);
- list_for_each_entry(de, &parent->d_subdirs, d_child) {
+ hlist_for_each_entry(de, &parent->d_children, d_sib) {
+ struct inode *inode = d_inode_rcu(de);
/* don't know what to do with negative dentries */
- if (d_inode(de) )
- coda_flag_inode(d_inode(de), flag);
+ if (inode)
+ coda_flag_inode(inode, flag);
}
spin_unlock(&parent->d_lock);
- return;
}
void coda_flag_inode_children(struct inode *inode, int flag)
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index a247c14aaab7..9f2d5743e2c8 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -36,7 +36,6 @@ static struct ctl_table coda_table[] = {
.mode = 0600,
.proc_handler = proc_dointvec
},
- {}
};
void coda_sysctl_init(void)
diff --git a/fs/coredump.c b/fs/coredump.c
index 9d235fa14ab9..f258c17c1841 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -981,7 +981,6 @@ static struct ctl_table coredump_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static int __init init_fs_coredump_sysctls(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 2ba37643b9c5..b813528fb147 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -51,8 +51,8 @@
* - d_lru
* - d_count
* - d_unhashed()
- * - d_parent and d_subdirs
- * - childrens' d_child and d_parent
+ * - d_parent and d_chilren
+ * - childrens' d_sib and d_parent
* - d_u.d_alias, d_inode
*
* Ordering:
@@ -191,7 +191,6 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
- { }
};
static int __init init_fs_dcache_sysctls(void)
@@ -344,7 +343,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
dentry->d_inode = inode;
flags = READ_ONCE(dentry->d_flags);
- flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ flags &= ~DCACHE_ENTRY_TYPE;
flags |= type_flags;
smp_store_release(&dentry->d_flags, flags);
}
@@ -353,7 +352,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
- flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ flags &= ~DCACHE_ENTRY_TYPE;
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
if (dentry->d_flags & DCACHE_LRU_LIST)
@@ -539,7 +538,7 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
-static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+static inline void dentry_unlist(struct dentry *dentry)
{
struct dentry *next;
/*
@@ -547,12 +546,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
* attached to the dentry tree
*/
dentry->d_flags |= DCACHE_DENTRY_KILLED;
- if (unlikely(list_empty(&dentry->d_child)))
+ if (unlikely(hlist_unhashed(&dentry->d_sib)))
return;
- __list_del_entry(&dentry->d_child);
+ __hlist_del(&dentry->d_sib);
/*
* Cursors can move around the list of children. While we'd been
- * a normal list member, it didn't matter - ->d_child.next would've
+ * a normal list member, it didn't matter - ->d_sib.next would've
* been updated. However, from now on it won't be and for the
* things like d_walk() it might end up with a nasty surprise.
* Normally d_walk() doesn't care about cursors moving around -
@@ -560,29 +559,27 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
* of its own, we get through it without ever unlocking the parent.
* There is one exception, though - if we ascend from a child that
* gets killed as soon as we unlock it, the next sibling is found
- * using the value left in its ->d_child.next. And if _that_
+ * using the value left in its ->d_sib.next. And if _that_
* pointed to a cursor, and cursor got moved (e.g. by lseek())
* before d_walk() regains parent->d_lock, we'll end up skipping
* everything the cursor had been moved past.
*
- * Solution: make sure that the pointer left behind in ->d_child.next
+ * Solution: make sure that the pointer left behind in ->d_sib.next
* points to something that won't be moving around. I.e. skip the
* cursors.
*/
- while (dentry->d_child.next != &parent->d_subdirs) {
- next = list_entry(dentry->d_child.next, struct dentry, d_child);
+ while (dentry->d_sib.next) {
+ next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
break;
- dentry->d_child.next = next->d_child.next;
+ dentry->d_sib.next = next->d_sib.next;
}
}
-static void __dentry_kill(struct dentry *dentry)
+static struct dentry *__dentry_kill(struct dentry *dentry)
{
struct dentry *parent = NULL;
bool can_free = true;
- if (!IS_ROOT(dentry))
- parent = dentry->d_parent;
/*
* The dentry is now unrecoverably dead to the world.
@@ -602,9 +599,6 @@ static void __dentry_kill(struct dentry *dentry)
}
/* if it was on the hash then remove it */
__d_drop(dentry);
- dentry_unlist(dentry, parent);
- if (parent)
- spin_unlock(&parent->d_lock);
if (dentry->d_inode)
dentry_unlink_inode(dentry);
else
@@ -613,80 +607,114 @@ static void __dentry_kill(struct dentry *dentry)
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
- spin_lock(&dentry->d_lock);
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- dentry->d_flags |= DCACHE_MAY_FREE;
- can_free = false;
+ cond_resched();
+ /* now that it's negative, ->d_parent is stable */
+ if (!IS_ROOT(dentry)) {
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
}
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ dentry_unlist(dentry);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST)
+ can_free = false;
spin_unlock(&dentry->d_lock);
if (likely(can_free))
dentry_free(dentry);
- cond_resched();
-}
-
-static struct dentry *__lock_parent(struct dentry *dentry)
-{
- struct dentry *parent;
- rcu_read_lock();
- spin_unlock(&dentry->d_lock);
-again:
- parent = READ_ONCE(dentry->d_parent);
- spin_lock(&parent->d_lock);
- /*
- * We can't blindly lock dentry until we are sure
- * that we won't violate the locking order.
- * Any changes of dentry->d_parent must have
- * been done with parent->d_lock held, so
- * spin_lock() above is enough of a barrier
- * for checking if it's still our child.
- */
- if (unlikely(parent != dentry->d_parent)) {
+ if (parent && --parent->d_lockref.count) {
spin_unlock(&parent->d_lock);
- goto again;
+ return NULL;
}
- rcu_read_unlock();
- if (parent != dentry)
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- else
- parent = NULL;
return parent;
}
-static inline struct dentry *lock_parent(struct dentry *dentry)
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy. Otherwise, return true and have
+ * that dentry's inode locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
- if (IS_ROOT(dentry))
- return NULL;
- if (likely(spin_trylock(&parent->d_lock)))
- return parent;
- return __lock_parent(dentry);
+ struct inode *inode = dentry->d_inode;
+
+ if (unlikely(dentry->d_lockref.count))
+ return false;
+
+ if (!inode || likely(spin_trylock(&inode->i_lock)))
+ return true;
+
+ do {
+ spin_unlock(&dentry->d_lock);
+ spin_lock(&inode->i_lock);
+ spin_lock(&dentry->d_lock);
+ if (likely(inode == dentry->d_inode))
+ break;
+ spin_unlock(&inode->i_lock);
+ inode = dentry->d_inode;
+ } while (inode);
+ if (likely(!dentry->d_lockref.count))
+ return true;
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ return false;
}
-static inline bool retain_dentry(struct dentry *dentry)
+/*
+ * Decide if dentry is worth retaining. Usually this is called with dentry
+ * locked; if not locked, we are more limited and might not be able to tell
+ * without a lock. False in this case means "punt to locked path and recheck".
+ *
+ * In case we aren't locked, these predicates are not "stable". However, it is
+ * sufficient that at some point after we dropped the reference the dentry was
+ * hashed and the flags had the proper value. Other dentry users may have
+ * re-gotten a reference to the dentry and change that, but our work is done -
+ * we can leave the dentry around with a zero refcount.
+ */
+static inline bool retain_dentry(struct dentry *dentry, bool locked)
{
- WARN_ON(d_in_lookup(dentry));
+ unsigned int d_flags;
- /* Unreachable? Get rid of it */
+ smp_rmb();
+ d_flags = READ_ONCE(dentry->d_flags);
+
+ // Unreachable? Nobody would be able to look it up, no point retaining
if (unlikely(d_unhashed(dentry)))
return false;
- if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+ // Same if it's disconnected
+ if (unlikely(d_flags & DCACHE_DISCONNECTED))
return false;
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
- if (dentry->d_op->d_delete(dentry))
+ // ->d_delete() might tell us not to bother, but that requires
+ // ->d_lock; can't decide without it
+ if (unlikely(d_flags & DCACHE_OP_DELETE)) {
+ if (!locked || dentry->d_op->d_delete(dentry))
return false;
}
- if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
+ // Explicitly told not to bother
+ if (unlikely(d_flags & DCACHE_DONTCACHE))
return false;
- /* retain; LRU fodder */
- dentry->d_lockref.count--;
- if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+ // At this point it looks like we ought to keep it. We also might
+ // need to do something - put it on LRU if it wasn't there already
+ // and mark it referenced if it was on LRU, but not marked yet.
+ // Unfortunately, both actions require ->d_lock, so in lockless
+ // case we'd have to punt rather than doing those.
+ if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
+ if (!locked)
+ return false;
d_lru_add(dentry);
- else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
+ if (!locked)
+ return false;
dentry->d_flags |= DCACHE_REFERENCED;
+ }
return true;
}
@@ -706,60 +734,11 @@ void d_mark_dontcache(struct inode *inode)
EXPORT_SYMBOL(d_mark_dontcache);
/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct inode *inode = dentry->d_inode;
- struct dentry *parent = NULL;
-
- if (inode && unlikely(!spin_trylock(&inode->i_lock)))
- goto slow_positive;
-
- if (!IS_ROOT(dentry)) {
- parent = dentry->d_parent;
- if (unlikely(!spin_trylock(&parent->d_lock))) {
- parent = __lock_parent(dentry);
- if (likely(inode || !dentry->d_inode))
- goto got_locks;
- /* negative that became positive */
- if (parent)
- spin_unlock(&parent->d_lock);
- inode = dentry->d_inode;
- goto slow_positive;
- }
- }
- __dentry_kill(dentry);
- return parent;
-
-slow_positive:
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- parent = lock_parent(dentry);
-got_locks:
- if (unlikely(dentry->d_lockref.count != 1)) {
- dentry->d_lockref.count--;
- } else if (likely(!retain_dentry(dentry))) {
- __dentry_kill(dentry);
- return parent;
- }
- /* we are keeping it, after all */
- if (inode)
- spin_unlock(&inode->i_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- spin_unlock(&dentry->d_lock);
- return NULL;
-}
-
-/*
* Try to do a lockless dput(), and return whether that was successful.
*
* If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
*
* The caller needs to hold the RCU read lock, so that the dentry is
* guaranteed to stay around even if the refcount goes down to zero!
@@ -767,18 +746,9 @@ got_locks:
static inline bool fast_dput(struct dentry *dentry)
{
int ret;
- unsigned int d_flags;
-
- /*
- * If we have a d_op->d_delete() operation, we sould not
- * let the dentry count go to zero, so use "put_or_lock".
- */
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
- return lockref_put_or_lock(&dentry->d_lockref);
/*
- * .. otherwise, we can try to just decrement the
- * lockref optimistically.
+ * try to decrement the lockref optimistically.
*/
ret = lockref_put_return(&dentry->d_lockref);
@@ -789,12 +759,12 @@ static inline bool fast_dput(struct dentry *dentry)
*/
if (unlikely(ret < 0)) {
spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count > 1) {
- dentry->d_lockref.count--;
+ if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
spin_unlock(&dentry->d_lock);
return true;
}
- return false;
+ dentry->d_lockref.count--;
+ goto locked;
}
/*
@@ -804,45 +774,18 @@ static inline bool fast_dput(struct dentry *dentry)
return true;
/*
- * Careful, careful. The reference count went down
- * to zero, but we don't hold the dentry lock, so
- * somebody else could get it again, and do another
- * dput(), and we need to not race with that.
- *
- * However, there is a very special and common case
- * where we don't care, because there is nothing to
- * do: the dentry is still hashed, it does not have
- * a 'delete' op, and it's referenced and already on
- * the LRU list.
- *
- * NOTE! Since we aren't locked, these values are
- * not "stable". However, it is sufficient that at
- * some point after we dropped the reference the
- * dentry was hashed and the flags had the proper
- * value. Other dentry users may have re-gotten
- * a reference to the dentry and change that, but
- * our work is done - we can leave the dentry
- * around with a zero refcount.
- *
- * Nevertheless, there are two cases that we should kill
- * the dentry anyway.
- * 1. free disconnected dentries as soon as their refcount
- * reached zero.
- * 2. free dentries if they should not be cached.
+ * Can we decide that decrement of refcount is all we needed without
+ * taking the lock? There's a very common case when it's all we need -
+ * dentry looks like it ought to be retained and there's nothing else
+ * to do.
*/
- smp_rmb();
- d_flags = READ_ONCE(dentry->d_flags);
- d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
- DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
-
- /* Nothing to do? Dropping the reference was all we needed? */
- if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+ if (retain_dentry(dentry, false))
return true;
/*
- * Not the fast normal case? Get the lock. We've already decremented
- * the refcount, but we'll need to re-check the situation after
- * getting the lock.
+ * Either not worth retaining or we can't tell without the lock.
+ * Get the lock, then. We've already decremented the refcount to 0,
+ * but we'll need to re-check the situation after getting the lock.
*/
spin_lock(&dentry->d_lock);
@@ -852,17 +795,11 @@ static inline bool fast_dput(struct dentry *dentry)
* else could have killed it and marked it dead. Either way, we
* don't need to do anything else.
*/
- if (dentry->d_lockref.count) {
+locked:
+ if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
spin_unlock(&dentry->d_lock);
return true;
}
-
- /*
- * Re-get the reference we optimistically dropped. We hold the
- * lock, and we just tested that it was zero, so we can just
- * set it to 1.
- */
- dentry->d_lockref.count = 1;
return false;
}
@@ -895,39 +832,37 @@ static inline bool fast_dput(struct dentry *dentry)
*/
void dput(struct dentry *dentry)
{
- while (dentry) {
- might_sleep();
-
- rcu_read_lock();
- if (likely(fast_dput(dentry))) {
- rcu_read_unlock();
- return;
- }
-
- /* Slow case: now with the dentry lock held */
+ if (!dentry)
+ return;
+ might_sleep();
+ rcu_read_lock();
+ if (likely(fast_dput(dentry))) {
rcu_read_unlock();
-
- if (likely(retain_dentry(dentry))) {
+ return;
+ }
+ while (lock_for_kill(dentry)) {
+ rcu_read_unlock();
+ dentry = __dentry_kill(dentry);
+ if (!dentry)
+ return;
+ if (retain_dentry(dentry, true)) {
spin_unlock(&dentry->d_lock);
return;
}
-
- dentry = dentry_kill(dentry);
+ rcu_read_lock();
}
+ rcu_read_unlock();
+ spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- /* let the owner of the list it's on deal with it */
- --dentry->d_lockref.count;
- } else {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
if (dentry->d_flags & DCACHE_LRU_LIST)
d_lru_del(dentry);
- if (!--dentry->d_lockref.count)
- d_shrink_add(dentry, list);
+ d_shrink_add(dentry, list);
}
}
@@ -939,22 +874,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
return;
}
rcu_read_unlock();
- if (!retain_dentry(dentry))
- __dput_to_list(dentry, list);
+ to_shrink_list(dentry, list);
spin_unlock(&dentry->d_lock);
}
-/* This must be called with d_lock held */
-static inline void __dget_dlock(struct dentry *dentry)
-{
- dentry->d_lockref.count++;
-}
-
-static inline void __dget(struct dentry *dentry)
-{
- lockref_get(&dentry->d_lockref);
-}
-
struct dentry *dget_parent(struct dentry *dentry)
{
int gotref;
@@ -1004,7 +927,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
if (hlist_empty(&inode->i_dentry))
return NULL;
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- __dget(alias);
+ lockref_get(&alias->d_lockref);
return alias;
}
@@ -1036,7 +959,7 @@ static struct dentry *__d_find_alias(struct inode *inode)
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
- __dget_dlock(alias);
+ dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
@@ -1103,104 +1026,53 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
*/
void d_prune_aliases(struct inode *inode)
{
+ LIST_HEAD(dispose);
struct dentry *dentry;
-restart:
+
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
- if (!dentry->d_lockref.count) {
- struct dentry *parent = lock_parent(dentry);
- if (likely(!dentry->d_lockref.count)) {
- __dentry_kill(dentry);
- dput(parent);
- goto restart;
- }
- if (parent)
- spin_unlock(&parent->d_lock);
- }
+ if (!dentry->d_lockref.count)
+ to_shrink_list(dentry, &dispose);
spin_unlock(&dentry->d_lock);
}
spin_unlock(&inode->i_lock);
+ shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list. Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
+static inline void shrink_kill(struct dentry *victim)
{
- struct inode *inode;
- struct dentry *parent;
-
- if (dentry->d_lockref.count)
- return false;
-
- inode = dentry->d_inode;
- if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- if (unlikely(dentry->d_lockref.count))
- goto out;
- /* changed inode means that somebody had grabbed it */
- if (unlikely(inode != dentry->d_inode))
- goto out;
- }
-
- parent = dentry->d_parent;
- if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
- return true;
-
- spin_unlock(&dentry->d_lock);
- spin_lock(&parent->d_lock);
- if (unlikely(parent != dentry->d_parent)) {
- spin_unlock(&parent->d_lock);
- spin_lock(&dentry->d_lock);
- goto out;
- }
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (likely(!dentry->d_lockref.count))
- return true;
- spin_unlock(&parent->d_lock);
-out:
- if (inode)
- spin_unlock(&inode->i_lock);
- return false;
+ do {
+ rcu_read_unlock();
+ victim = __dentry_kill(victim);
+ rcu_read_lock();
+ } while (victim && lock_for_kill(victim));
+ rcu_read_unlock();
+ if (victim)
+ spin_unlock(&victim->d_lock);
}
void shrink_dentry_list(struct list_head *list)
{
while (!list_empty(list)) {
- struct dentry *dentry, *parent;
+ struct dentry *dentry;
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
- if (!shrink_lock_dentry(dentry)) {
- bool can_free = false;
+ if (!lock_for_kill(dentry)) {
+ bool can_free;
rcu_read_unlock();
d_shrink_del(dentry);
- if (dentry->d_lockref.count < 0)
- can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
spin_unlock(&dentry->d_lock);
if (can_free)
dentry_free(dentry);
continue;
}
- rcu_read_unlock();
d_shrink_del(dentry);
- parent = dentry->d_parent;
- if (parent != dentry)
- __dput_to_list(parent, list);
- __dentry_kill(dentry);
+ shrink_kill(dentry);
}
}
@@ -1350,8 +1222,7 @@ enum d_walk_ret {
static void d_walk(struct dentry *parent, void *data,
enum d_walk_ret (*enter)(void *, struct dentry *))
{
- struct dentry *this_parent;
- struct list_head *next;
+ struct dentry *this_parent, *dentry;
unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
@@ -1373,13 +1244,9 @@ again:
break;
}
repeat:
- next = this_parent->d_subdirs.next;
+ dentry = d_first_child(this_parent);
resume:
- while (next != &this_parent->d_subdirs) {
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
- next = tmp->next;
-
+ hlist_for_each_entry_from(dentry, d_sib) {
if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
continue;
@@ -1400,7 +1267,7 @@ resume:
continue;
}
- if (!list_empty(&dentry->d_subdirs)) {
+ if (!hlist_empty(&dentry->d_children)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
@@ -1415,24 +1282,23 @@ resume:
rcu_read_lock();
ascend:
if (this_parent != parent) {
- struct dentry *child = this_parent;
- this_parent = child->d_parent;
+ dentry = this_parent;
+ this_parent = dentry->d_parent;
- spin_unlock(&child->d_lock);
+ spin_unlock(&dentry->d_lock);
spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
/* go into the first sibling still alive */
- do {
- next = child->d_child.next;
- if (next == &this_parent->d_subdirs)
- goto ascend;
- child = list_entry(next, struct dentry, d_child);
- } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
- rcu_read_unlock();
- goto resume;
+ hlist_for_each_entry_continue(dentry, d_sib) {
+ if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+ rcu_read_unlock();
+ goto resume;
+ }
+ }
+ goto ascend;
}
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
@@ -1532,7 +1398,7 @@ out:
* Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
* list for prune_dcache(). We descend to the next level
- * whenever the d_subdirs list is non-empty and continue
+ * whenever the d_children list is non-empty and continue
* searching.
*
* It returns zero iff there are no unused children,
@@ -1562,13 +1428,11 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
data->found++;
- } else {
- if (dentry->d_flags & DCACHE_LRU_LIST)
- d_lru_del(dentry);
- if (!dentry->d_lockref.count) {
- d_shrink_add(dentry, &data->dispose);
- data->found++;
- }
+ } else if (!dentry->d_lockref.count) {
+ to_shrink_list(dentry, &data->dispose);
+ data->found++;
+ } else if (dentry->d_lockref.count < 0) {
+ data->found++;
}
/*
* We can return to the caller if we have found some (this
@@ -1589,17 +1453,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
if (data->start == dentry)
goto out;
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- if (!dentry->d_lockref.count) {
+ if (!dentry->d_lockref.count) {
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
rcu_read_lock();
data->victim = dentry;
return D_WALK_QUIT;
}
- } else {
- if (dentry->d_flags & DCACHE_LRU_LIST)
- d_lru_del(dentry);
- if (!dentry->d_lockref.count)
- d_shrink_add(dentry, &data->dispose);
+ to_shrink_list(dentry, &data->dispose);
}
/*
* We can return to the caller if we have found some (this
@@ -1637,17 +1497,12 @@ void shrink_dcache_parent(struct dentry *parent)
data.victim = NULL;
d_walk(parent, &data, select_collect2);
if (data.victim) {
- struct dentry *parent;
spin_lock(&data.victim->d_lock);
- if (!shrink_lock_dentry(data.victim)) {
+ if (!lock_for_kill(data.victim)) {
spin_unlock(&data.victim->d_lock);
rcu_read_unlock();
} else {
- rcu_read_unlock();
- parent = data.victim->d_parent;
- if (parent != data.victim)
- __dput_to_list(parent, &data.dispose);
- __dentry_kill(data.victim);
+ shrink_kill(data.victim);
}
}
if (!list_empty(&data.dispose))
@@ -1659,7 +1514,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
/* it has busy descendents; complain about those instead */
- if (!list_empty(&dentry->d_subdirs))
+ if (!hlist_empty(&dentry->d_children))
return D_WALK_CONTINUE;
/* root with refcount 1 is fine */
@@ -1709,8 +1564,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
struct dentry **victim = _data;
if (d_mountpoint(dentry)) {
- __dget_dlock(dentry);
- *victim = dentry;
+ *victim = dget_dlock(dentry);
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
@@ -1816,9 +1670,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
- INIT_LIST_HEAD(&dentry->d_subdirs);
+ INIT_HLIST_HEAD(&dentry->d_children);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
- INIT_LIST_HEAD(&dentry->d_child);
+ INIT_HLIST_NODE(&dentry->d_sib);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) {
@@ -1855,9 +1709,8 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
* don't need child lock because it is not subject
* to concurrency here
*/
- __dget_dlock(parent);
- dentry->d_parent = parent;
- list_add(&dentry->d_child, &parent->d_subdirs);
+ dentry->d_parent = dget_dlock(parent);
+ hlist_add_head(&dentry->d_sib, &parent->d_children);
spin_unlock(&parent->d_lock);
return dentry;
@@ -1897,9 +1750,15 @@ struct dentry *d_alloc_cursor(struct dentry * parent)
*/
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
+ static const struct dentry_operations anon_ops = {
+ .d_dname = simple_dname
+ };
struct dentry *dentry = __d_alloc(sb, name);
- if (likely(dentry))
+ if (likely(dentry)) {
dentry->d_flags |= DCACHE_NORCU;
+ if (!sb->s_d_op)
+ d_set_d_op(dentry, &anon_ops);
+ }
return dentry;
}
@@ -1943,22 +1802,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
}
EXPORT_SYMBOL(d_set_d_op);
-
-/*
- * d_set_fallthru - Mark a dentry as falling through to a lower layer
- * @dentry - The dentry to mark
- *
- * Mark a dentry as falling through to the lower layer (as set with
- * d_pin_lower()). This flag may be recorded on the medium.
- */
-void d_set_fallthru(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_FALLTHRU;
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_set_fallthru);
-
static unsigned d_flags_for_inode(struct inode *inode)
{
unsigned add_flags = DCACHE_REGULAR_TYPE;
@@ -2077,75 +1920,55 @@ struct dentry *d_make_root(struct inode *root_inode)
}
EXPORT_SYMBOL(d_make_root);
-static struct dentry *__d_instantiate_anon(struct dentry *dentry,
- struct inode *inode,
- bool disconnected)
-{
- struct dentry *res;
- unsigned add_flags;
-
- security_d_instantiate(dentry, inode);
- spin_lock(&inode->i_lock);
- res = __d_find_any_alias(inode);
- if (res) {
- spin_unlock(&inode->i_lock);
- dput(dentry);
- goto out_iput;
- }
-
- /* attach a disconnected dentry */
- add_flags = d_flags_for_inode(inode);
-
- if (disconnected)
- add_flags |= DCACHE_DISCONNECTED;
-
- spin_lock(&dentry->d_lock);
- __d_set_inode_and_type(dentry, inode, add_flags);
- hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
- if (!disconnected) {
- hlist_bl_lock(&dentry->d_sb->s_roots);
- hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
- hlist_bl_unlock(&dentry->d_sb->s_roots);
- }
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
-
- return dentry;
-
- out_iput:
- iput(inode);
- return res;
-}
-
-struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
-{
- return __d_instantiate_anon(dentry, inode, true);
-}
-EXPORT_SYMBOL(d_instantiate_anon);
-
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
- struct dentry *tmp;
- struct dentry *res;
+ struct super_block *sb;
+ struct dentry *new, *res;
if (!inode)
return ERR_PTR(-ESTALE);
if (IS_ERR(inode))
return ERR_CAST(inode);
- res = d_find_any_alias(inode);
+ sb = inode->i_sb;
+
+ res = d_find_any_alias(inode); /* existing alias? */
if (res)
- goto out_iput;
+ goto out;
- tmp = d_alloc_anon(inode->i_sb);
- if (!tmp) {
+ new = d_alloc_anon(sb);
+ if (!new) {
res = ERR_PTR(-ENOMEM);
- goto out_iput;
+ goto out;
}
- return __d_instantiate_anon(tmp, inode, disconnected);
+ security_d_instantiate(new, inode);
+ spin_lock(&inode->i_lock);
+ res = __d_find_any_alias(inode); /* recheck under lock */
+ if (likely(!res)) { /* still no alias, attach a disconnected dentry */
+ unsigned add_flags = d_flags_for_inode(inode);
+
+ if (disconnected)
+ add_flags |= DCACHE_DISCONNECTED;
-out_iput:
+ spin_lock(&new->d_lock);
+ __d_set_inode_and_type(new, inode, add_flags);
+ hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
+ if (!disconnected) {
+ hlist_bl_lock(&sb->s_roots);
+ hlist_bl_add_head(&new->d_hash, &sb->s_roots);
+ hlist_bl_unlock(&sb->s_roots);
+ }
+ spin_unlock(&new->d_lock);
+ spin_unlock(&inode->i_lock);
+ inode = NULL; /* consumed by new->d_inode */
+ res = new;
+ } else {
+ spin_unlock(&inode->i_lock);
+ dput(new);
+ }
+
+ out:
iput(inode);
return res;
}
@@ -2729,7 +2552,7 @@ retry:
/* we can't take ->d_lock here; it's OK, though. */
new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
- hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
+ hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
return new;
mismatch:
@@ -2853,7 +2676,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
spin_unlock(&alias->d_lock);
alias = NULL;
} else {
- __dget_dlock(alias);
+ dget_dlock(alias);
__d_rehash(alias);
spin_unlock(&alias->d_lock);
}
@@ -2995,11 +2818,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
} else {
target->d_parent = old_parent;
swap_names(dentry, target);
- list_move(&target->d_child, &target->d_parent->d_subdirs);
+ if (!hlist_unhashed(&target->d_sib))
+ __hlist_del(&target->d_sib);
+ hlist_add_head(&target->d_sib, &target->d_parent->d_children);
__d_rehash(target);
fsnotify_update_flags(target);
}
- list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ if (!hlist_unhashed(&dentry->d_sib))
+ __hlist_del(&dentry->d_sib);
+ hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
__d_rehash(dentry);
fsnotify_update_flags(dentry);
fscrypt_handle_d_move(dentry);
@@ -3082,8 +2909,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
*/
-static int __d_unalias(struct inode *inode,
- struct dentry *dentry, struct dentry *alias)
+static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL;
struct rw_semaphore *m2 = NULL;
@@ -3164,7 +2990,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
inode->i_sb->s_id);
} else if (!IS_ROOT(new)) {
struct dentry *old_parent = dget(new->d_parent);
- int err = __d_unalias(inode, dentry, new);
+ int err = __d_unalias(dentry, new);
write_sequnlock(&rename_lock);
if (err) {
dput(new);
@@ -3235,10 +3061,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
if (d_unhashed(dentry) || !dentry->d_inode)
return D_WALK_SKIP;
- if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
- dentry->d_flags |= DCACHE_GENOCIDE;
- dentry->d_lockref.count--;
- }
+ dentry->d_lockref.count--;
}
return D_WALK_CONTINUE;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c830261aa883..b20e565b9c5e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -69,7 +69,6 @@ static struct ctl_table pty_table[] = {
.data = &pty_count,
.proc_handler = proc_dointvec,
},
- {}
};
struct pts_mount_opts {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d7193687b9b4..5ed1e4cf6c0b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -607,6 +607,8 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
target_inode = d_inode(new_dentry);
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
dget(lower_new_dentry);
rc = -EINVAL;
if (lower_old_dentry->d_parent != lower_old_dir_dentry)
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 1d318f85232d..fffd3919343e 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ depends on EROFS_FS
+ select NETFS_SUPPORT
+ select FSCACHE
+ select CACHEFILES
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7cc5841577b2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1d65b9f60a39..d4cee95af14c 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = erofs_allocpage(pagepool, rq->gfp);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -408,7 +409,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
int size, ret = 0;
if (!erofs_sb_has_compr_cfgs(sbi)) {
- sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 4a64a9c91dd3..b98872058abe 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb,
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -158,8 +158,12 @@ again:
strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
outsz -= strm->z.avail_out;
if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ kout = NULL;
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -211,8 +215,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -230,7 +237,7 @@ again:
break;
}
}
-
+failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
if (kout)
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 2dd14f99c1dc..6ca357d83cfa 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -148,7 +148,7 @@ again:
}
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -215,8 +215,11 @@ again:
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -258,8 +261,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -277,6 +283,7 @@ again:
break;
}
}
+failed:
if (no < nrpages_out && strm->buf.out)
kunmap(rq->out[no]);
if (ni < nrpages_in)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 87ff35bff8d5..5ff90026fd43 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
- struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
struct erofs_fscache_request *req;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
struct erofs_fscache_request *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -459,7 +459,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
inode->i_blkbits = EROFS_SB(sb)->blkszbits;
inode->i_private = ctx;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3d616dea55dc..36e638e8b53a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
} else {
const unsigned int gotten = sb->s_blocksize - *ofs;
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
err = -ENOMEM;
goto err_out;
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 5dea308764b4..e146d09151af 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
+ NULL, grp, GFP_KERNEL);
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 692c0c39be63..ff0aa72b0db3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -82,6 +82,9 @@ struct z_erofs_pcluster {
/* L: indicate several pageofs_outs or not */
bool multibases;
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
+
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
};
@@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
pcl->pclustersize = size;
@@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (i_blocksize(fe->inode) != PAGE_SIZE)
- return;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ while (!z_erofs_cache_release_folio(folio, 0))
cond_resched();
}
@@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb)
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct page *page, bool ra)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -1014,6 +1013,7 @@ repeat:
err = z_erofs_pcluster_begin(fe);
if (err)
goto out;
+ fe->pcl->besteffort |= !ra;
}
/*
@@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
+ .gfp = pcl->besteffort ?
+ GFP_KERNEL | __GFP_NOFAIL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
@@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
pcl->length = 0;
pcl->partial = true;
pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
- struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page, *oldpage;
+ struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- oldpage = READ_ONCE(zbv->page);
- if (!oldpage)
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
goto out_allocpage;
- justfound = (unsigned long)oldpage & 1UL;
- page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
-
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1455,6 @@ repeat:
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
- WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@@ -1459,9 +1465,9 @@ repeat:
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
- if (zbv->offset < 0)
- bvec->bv_offset = round_up(-zbv->offset, bs);
- bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@@ -1471,7 +1477,6 @@ repeat:
/* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1508,15 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1694,7 @@ submit_bio_retry:
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;
@@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page))
unlock_page(page);
else
- (void)z_erofs_do_read_page(f, page);
+ (void)z_erofs_do_read_page(f, page, !!rac);
put_page(page);
}
@@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
@@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9753875e41cb..e313c936351d 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff;
+ unsigned int lclusterbits, endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
@@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_INTERLACED;
- else
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_SHIFTED;
- } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- map->m_algorithmformat = vi->z_algorithmtype[1];
+ afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+ Z_EROFS_COMPRESSION_INTERLACED :
+ Z_EROFS_COMPRESSION_SHIFTED;
} else {
- map->m_algorithmformat = vi->z_algorithmtype[0];
+ afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+ if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ afmt, vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
}
+ map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2877cc01cff1..3534d36a1474 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -322,7 +322,6 @@ static struct ctl_table epoll_table[] = {
.extra1 = &long_zero,
.extra2 = &long_max,
},
- { }
};
static void __init epoll_sysctls_init(void)
diff --git a/fs/exec.c b/fs/exec.c
index ee43597cb453..af4fbb61cd53 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,6 +66,7 @@
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
+#include <linux/rseq.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -127,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .open_flag = O_LARGEFILE | O_RDONLY,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
@@ -903,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);
#endif /* CONFIG_MMU */
+/*
+ * On success, caller must call do_close_execat() on the returned
+ * struct file to close it.
+ */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
@@ -947,6 +952,17 @@ exit:
return ERR_PTR(err);
}
+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release. Also see
+ * do_close_execat().
+ */
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
@@ -1408,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm)
out_unlock:
up_write(&me->signal->exec_update_lock);
+ if (!bprm->cred)
+ mutex_unlock(&me->signal->cred_guard_mutex);
+
out:
return retval;
}
@@ -1483,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}
+/* Matches do_open_execat() */
+static void do_close_execat(struct file *file)
+{
+ if (!file)
+ return;
+ allow_write_access(file);
+ fput(file);
+}
+
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
@@ -1494,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
+ do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
@@ -1507,12 +1532,23 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
- struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ struct linux_binprm *bprm;
+ struct file *file;
int retval = -ENOMEM;
- if (!bprm)
- goto out;
+
+ file = do_open_execat(fd, filename, flags);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm) {
+ do_close_execat(file);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bprm->file = file;
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
@@ -1525,18 +1561,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
if (!bprm->fdpath)
goto out_free;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
+ */
+ if (get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
bprm->filename = bprm->fdpath;
}
bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
- if (retval)
- goto out_free;
- return bprm;
+ if (!retval)
+ return bprm;
out_free:
free_bprm(bprm);
-out:
return ERR_PTR(retval);
}
@@ -1587,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
}
rcu_read_unlock();
+ /* "users" and "in_exec" locked for copy_fs() */
if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
@@ -1803,13 +1850,8 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-/*
- * sys_execve() executes a new program.
- */
-static int bprm_execve(struct linux_binprm *bprm,
- int fd, struct filename *filename, int flags)
+static int bprm_execve(struct linux_binprm *bprm)
{
- struct file *file;
int retval;
retval = prepare_bprm_creds(bprm);
@@ -1825,26 +1867,8 @@ static int bprm_execve(struct linux_binprm *bprm,
current->in_execve = 1;
sched_mm_cid_before_execve(current);
- file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
sched_exec();
- bprm->file = file;
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. This allows the code in exec to
- * choose to fail when the executable is not mmaped into the
- * interpreter and an open file descriptor is not passed to
- * the interpreter. This makes for a better user experience
- * than having the interpreter start and then immediately fail
- * when it finds the executable is inaccessible.
- */
- if (bprm->fdpath && get_close_on_exec(fd))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
-
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
@@ -1874,7 +1898,6 @@ out:
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
-out_unmark:
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1909,7 +1932,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -1958,7 +1981,7 @@ static int do_execveat_common(int fd, struct filename *filename,
bprm->argc = 1;
}
- retval = bprm_execve(bprm, fd, filename, flags);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
@@ -1983,7 +2006,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2018,7 +2041,7 @@ int kernel_execve(const char *kernel_filename,
if (retval < 0)
goto out_free;
- retval = bprm_execve(bprm, fd, filename, 0);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:
@@ -2164,7 +2187,6 @@ static struct ctl_table fs_exec_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
- { }
};
static int __init init_fs_exec_sysctls(void)
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index e918decb3735..0356c88252bd 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -5,42 +5,23 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/bitmap.h>
#include <linux/buffer_head.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
-static const unsigned char free_bit[] = {
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/
- 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/
- 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/
- 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/
- 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/
- 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/
- 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/
-};
-
-static const unsigned char used_bit[] = {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/
- 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/
- 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/
- 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/
- 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/
- 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/
- 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/
- 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/
- 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/
- 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/
-};
+#if BITS_PER_LONG == 32
+#define __le_long __le32
+#define lel_to_cpu(A) le32_to_cpu(A)
+#define cpu_to_lel(A) cpu_to_le32(A)
+#elif BITS_PER_LONG == 64
+#define __le_long __le64
+#define lel_to_cpu(A) le64_to_cpu(A)
+#define cpu_to_lel(A) cpu_to_le64(A)
+#else
+#error "BITS_PER_LONG not 32 or 64"
+#endif
/*
* Allocation Bitmap Management Functions
@@ -200,32 +181,35 @@ unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu)
{
unsigned int i, map_i, map_b, ent_idx;
unsigned int clu_base, clu_free;
- unsigned char k, clu_mask;
+ unsigned long clu_bits, clu_mask;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ __le_long bitval;
WARN_ON(clu < EXFAT_FIRST_CLUSTER);
- ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
- clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK));
+ ent_idx = ALIGN_DOWN(CLUSTER_TO_BITMAP_ENT(clu), BITS_PER_LONG);
+ clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx);
clu_mask = IGNORED_BITS_REMAINED(clu, clu_base);
map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx);
for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters;
- i += BITS_PER_BYTE) {
- k = *(sbi->vol_amap[map_i]->b_data + map_b);
+ i += BITS_PER_LONG) {
+ bitval = *(__le_long *)(sbi->vol_amap[map_i]->b_data + map_b);
if (clu_mask > 0) {
- k |= clu_mask;
+ bitval |= cpu_to_lel(clu_mask);
clu_mask = 0;
}
- if (k < 0xFF) {
- clu_free = clu_base + free_bit[k];
+ if (lel_to_cpu(bitval) != ULONG_MAX) {
+ clu_bits = lel_to_cpu(bitval);
+ clu_free = clu_base + ffz(clu_bits);
if (clu_free < sbi->num_clusters)
return clu_free;
}
- clu_base += BITS_PER_BYTE;
+ clu_base += BITS_PER_LONG;
+ map_b += sizeof(long);
- if (++map_b >= sb->s_blocksize ||
+ if (map_b >= sb->s_blocksize ||
clu_base >= sbi->num_clusters) {
if (++map_i >= sbi->map_sectors) {
clu_base = EXFAT_FIRST_CLUSTER;
@@ -244,25 +228,24 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count)
unsigned int count = 0;
unsigned int i, map_i = 0, map_b = 0;
unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi);
- unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK;
- unsigned char clu_bits;
- const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011,
- 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111};
+ unsigned int last_mask = total_clus & (BITS_PER_LONG - 1);
+ unsigned long *bitmap, clu_bits;
total_clus &= ~last_mask;
- for (i = 0; i < total_clus; i += BITS_PER_BYTE) {
- clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
- count += used_bit[clu_bits];
- if (++map_b >= (unsigned int)sb->s_blocksize) {
+ for (i = 0; i < total_clus; i += BITS_PER_LONG) {
+ bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b);
+ count += hweight_long(*bitmap);
+ map_b += sizeof(long);
+ if (map_b >= (unsigned int)sb->s_blocksize) {
map_i++;
map_b = 0;
}
}
if (last_mask) {
- clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
- clu_bits &= last_bit_mask[last_mask];
- count += used_bit[clu_bits];
+ bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b);
+ clu_bits = lel_to_cpu(*(__le_long *)bitmap);
+ count += hweight_long(clu_bits & BITMAP_LAST_WORD_MASK(last_mask));
}
*ret_count = count;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index a7a2c35d74fb..9474cd50da6d 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -135,8 +135,7 @@ enum {
#define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb))
#define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \
((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1))
-#define BITS_PER_BYTE_MASK 0x7
-#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+#define IGNORED_BITS_REMAINED(clu, clu_base) ((1UL << ((clu) - (clu_base))) - 1)
#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1)
/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */
@@ -208,6 +207,7 @@ struct exfat_dir_entry {
unsigned char flags;
unsigned short attr;
loff_t size;
+ loff_t valid_size;
unsigned int num_subdirs;
struct timespec64 atime;
struct timespec64 mtime;
@@ -317,6 +317,7 @@ struct exfat_inode_info {
loff_t i_size_aligned;
/* on-disk position of directory entry or 0 */
loff_t i_pos;
+ loff_t valid_size;
/* hash by i_location */
struct hlist_node i_hash_fat;
/* protect bmap against truncate */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index bfdfafe00993..d25a96a148af 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -11,37 +11,76 @@
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/msdos_fs.h>
+#include <linux/writeback.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
static int exfat_cont_expand(struct inode *inode, loff_t size)
{
- struct address_space *mapping = inode->i_mapping;
- loff_t start = i_size_read(inode), count = size - i_size_read(inode);
- int err, err2;
+ int ret;
+ unsigned int num_clusters, new_num_clusters, last_clu;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_chain clu;
- err = generic_cont_expand_simple(inode, size);
- if (err)
- return err;
+ ret = inode_newsize_ok(inode, size);
+ if (ret)
+ return ret;
+
+ num_clusters = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+ new_num_clusters = EXFAT_B_TO_CLU_ROUND_UP(size, sbi);
+
+ if (new_num_clusters == num_clusters)
+ goto out;
+
+ exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
+ ret = exfat_find_last_cluster(sb, &clu, &last_clu);
+ if (ret)
+ return ret;
+
+ clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
+ EXFAT_EOF_CLUSTER : last_clu + 1;
+ clu.size = 0;
+ clu.flags = ei->flags;
+
+ ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters,
+ &clu, IS_DIRSYNC(inode));
+ if (ret)
+ return ret;
+
+ /* Append new clusters to chain */
+ if (clu.flags != ei->flags) {
+ exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters);
+ ei->flags = ALLOC_FAT_CHAIN;
+ }
+ if (clu.flags == ALLOC_FAT_CHAIN)
+ if (exfat_ent_set(sb, last_clu, clu.dir))
+ goto free_clu;
+ if (num_clusters == 0)
+ ei->start_clu = clu.dir;
+
+out:
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
+ /* Expanded range not zeroed, do not update valid_size */
+ i_size_write(inode, size);
- if (!IS_SYNC(inode))
- return 0;
+ ei->i_size_aligned = round_up(size, sb->s_blocksize);
+ ei->i_size_ondisk = ei->i_size_aligned;
+ inode->i_blocks = round_up(size, sbi->cluster_size) >> 9;
- err = filemap_fdatawrite_range(mapping, start, start + count - 1);
- err2 = sync_mapping_buffers(mapping);
- if (!err)
- err = err2;
- err2 = write_inode_now(inode, 1);
- if (!err)
- err = err2;
- if (err)
- return err;
+ if (IS_DIRSYNC(inode))
+ return write_inode_now(inode, 1);
+
+ mark_inode_dirty(inode);
- return filemap_fdatawait_range(mapping, start, start + count - 1);
+ return 0;
+
+free_clu:
+ exfat_free_cluster(inode, &clu);
+ return -EIO;
}
static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
@@ -146,6 +185,9 @@ int __exfat_truncate(struct inode *inode)
ei->start_clu = EXFAT_EOF_CLUSTER;
}
+ if (i_size_read(inode) < ei->valid_size)
+ ei->valid_size = i_size_read(inode);
+
if (ei->type == TYPE_FILE)
ei->attr |= EXFAT_ATTR_ARCHIVE;
@@ -474,15 +516,124 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
+static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end)
+{
+ int err;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *ops = mapping->a_ops;
+
+ while (start < end) {
+ u32 zerofrom, len;
+ struct page *page = NULL;
+
+ zerofrom = start & (PAGE_SIZE - 1);
+ len = PAGE_SIZE - zerofrom;
+ if (start + len > end)
+ len = end - start;
+
+ err = ops->write_begin(file, mapping, start, len, &page, NULL);
+ if (err)
+ goto out;
+
+ zero_user_segment(page, zerofrom, zerofrom + len);
+
+ err = ops->write_end(file, mapping, start, len, len, page, NULL);
+ if (err < 0)
+ goto out;
+ start += len;
+
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ }
+
+out:
+ return err;
+}
+
+static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t ret;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = iocb->ki_pos;
+ loff_t valid_size;
+
+ inode_lock(inode);
+
+ valid_size = ei->valid_size;
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret < 0)
+ goto unlock;
+
+ if (pos > valid_size) {
+ ret = exfat_file_zeroed_range(file, valid_size, pos);
+ if (ret < 0 && ret != -ENOSPC) {
+ exfat_err(inode->i_sb,
+ "write: fail to zero from %llu to %llu(%zd)",
+ valid_size, pos, ret);
+ }
+ if (ret < 0)
+ goto unlock;
+ }
+
+ ret = __generic_file_write_iter(iocb, iter);
+ if (ret < 0)
+ goto unlock;
+
+ inode_unlock(inode);
+
+ if (pos > valid_size)
+ pos = valid_size;
+
+ if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
+ ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
+ iocb->ki_flags & IOCB_SYNC);
+ if (err < 0)
+ return err;
+ }
+
+ return ret;
+
+unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ loff_t end = min_t(loff_t, i_size_read(inode),
+ start + vma->vm_end - vma->vm_start);
+
+ if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) {
+ ret = exfat_file_zeroed_range(file, ei->valid_size, end);
+ if (ret < 0) {
+ exfat_err(inode->i_sb,
+ "mmap: fail to zero from %llu to %llu(%d)",
+ start, end, ret);
+ return ret;
+ }
+ }
+
+ return generic_file_mmap(file, vma);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .write_iter = exfat_file_write_iter,
.unlocked_ioctl = exfat_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = exfat_file_mmap,
.fsync = exfat_file_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index e7ff58b8e68c..0687f952956c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -75,8 +75,17 @@ int __exfat_write_inode(struct inode *inode, int sync)
if (ei->start_clu == EXFAT_EOF_CLUSTER)
on_disk_size = 0;
- ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
- ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
+ ep2->dentry.stream.size = cpu_to_le64(on_disk_size);
+ /*
+ * mmap write does not use exfat_write_end(), valid_size may be
+ * extended to the sector-aligned length in exfat_get_block().
+ * So we need to fixup valid_size to the writren length.
+ */
+ if (on_disk_size < ei->valid_size)
+ ep2->dentry.stream.valid_size = ep2->dentry.stream.size;
+ else
+ ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size);
+
if (on_disk_size) {
ep2->dentry.stream.flags = ei->flags;
ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu);
@@ -278,6 +287,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
unsigned int cluster, sec_offset;
sector_t last_block;
sector_t phys = 0;
+ sector_t valid_blks;
loff_t pos;
mutex_lock(&sbi->s_lock);
@@ -306,17 +316,32 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
mapped_blocks = sbi->sect_per_clus - sec_offset;
max_blocks = min(mapped_blocks, max_blocks);
- /* Treat newly added block / cluster */
- if (iblock < last_block)
- create = 0;
-
- if (create || buffer_delay(bh_result)) {
- pos = EXFAT_BLK_TO_B((iblock + 1), sb);
+ pos = EXFAT_BLK_TO_B((iblock + 1), sb);
+ if ((create && iblock >= last_block) || buffer_delay(bh_result)) {
if (ei->i_size_ondisk < pos)
ei->i_size_ondisk = pos;
}
+ map_bh(bh_result, sb, phys);
+ if (buffer_delay(bh_result))
+ clear_buffer_delay(bh_result);
+
if (create) {
+ valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb);
+
+ if (iblock + max_blocks < valid_blks) {
+ /* The range has been written, map it */
+ goto done;
+ } else if (iblock < valid_blks) {
+ /*
+ * The range has been partially written,
+ * map the written part.
+ */
+ max_blocks = valid_blks - iblock;
+ goto done;
+ }
+
+ /* The area has not been written, map and mark as new. */
err = exfat_map_new_buffer(ei, bh_result, pos);
if (err) {
exfat_fs_error(sb,
@@ -324,11 +349,58 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
pos, ei->i_size_aligned);
goto unlock_ret;
}
- }
- if (buffer_delay(bh_result))
- clear_buffer_delay(bh_result);
- map_bh(bh_result, sb, phys);
+ ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb);
+ mark_inode_dirty(inode);
+ } else {
+ valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
+
+ if (iblock + max_blocks < valid_blks) {
+ /* The range has been written, map it */
+ goto done;
+ } else if (iblock < valid_blks) {
+ /*
+ * The area has been partially written,
+ * map the written part.
+ */
+ max_blocks = valid_blks - iblock;
+ goto done;
+ } else if (iblock == valid_blks &&
+ (ei->valid_size & (sb->s_blocksize - 1))) {
+ /*
+ * The block has been partially written,
+ * zero the unwritten part and map the block.
+ */
+ loff_t size, off;
+
+ max_blocks = 1;
+
+ /*
+ * For direct read, the unwritten part will be zeroed in
+ * exfat_direct_IO()
+ */
+ if (!bh_result->b_folio)
+ goto done;
+
+ pos -= sb->s_blocksize;
+ size = ei->valid_size - pos;
+ off = pos & (PAGE_SIZE - 1);
+
+ folio_set_bh(bh_result, bh_result->b_folio, off);
+ err = bh_read(bh_result, 0);
+ if (err < 0)
+ goto unlock_ret;
+
+ folio_zero_segment(bh_result->b_folio, off + size,
+ off + sb->s_blocksize);
+ } else {
+ /*
+ * The range has not been written, clear the mapped flag
+ * to only zero the cache and do not read from disk.
+ */
+ clear_buffer_mapped(bh_result);
+ }
+ }
done:
bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
unlock_ret:
@@ -343,6 +415,17 @@ static int exfat_read_folio(struct file *file, struct folio *folio)
static void exfat_readahead(struct readahead_control *rac)
{
+ struct address_space *mapping = rac->mapping;
+ struct inode *inode = mapping->host;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = readahead_pos(rac);
+
+ /* Range cross valid_size, read it page by page. */
+ if (ei->valid_size < i_size_read(inode) &&
+ pos <= ei->valid_size &&
+ ei->valid_size < pos + readahead_length(rac))
+ return;
+
mpage_readahead(rac, exfat_get_block);
}
@@ -370,9 +453,7 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
- exfat_get_block,
- &EXFAT_I(mapping->host)->i_size_ondisk);
+ ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block);
if (ret < 0)
exfat_write_failed(mapping, pos+len);
@@ -400,6 +481,11 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
exfat_write_failed(mapping, pos+len);
+ if (!(err < 0) && pos + err > ei->valid_size) {
+ ei->valid_size = pos + err;
+ mark_inode_dirty(inode);
+ }
+
if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) {
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ei->attr |= EXFAT_ATTR_ARCHIVE;
@@ -413,7 +499,9 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
- loff_t size = iocb->ki_pos + iov_iter_count(iter);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = iocb->ki_pos;
+ loff_t size = pos + iov_iter_count(iter);
int rw = iov_iter_rw(iter);
ssize_t ret;
@@ -436,8 +524,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* condition of exfat_get_block() and ->truncate().
*/
ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
- if (ret < 0 && (rw & WRITE))
- exfat_write_failed(mapping, size);
+ if (ret < 0) {
+ if (rw == WRITE && ret != -EIOCBQUEUED)
+ exfat_write_failed(mapping, size);
+
+ return ret;
+ } else
+ size = pos + ret;
+
+ /* zero the unwritten part in the partially written block */
+ if (rw == READ && pos < ei->valid_size && ei->valid_size < size) {
+ iov_iter_revert(iter, size - ei->valid_size);
+ iov_iter_zero(size - ei->valid_size, iter);
+ }
+
return ret;
}
@@ -537,6 +637,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->start_clu = info->start_clu;
ei->flags = info->flags;
ei->type = info->type;
+ ei->valid_size = info->valid_size;
ei->version = 0;
ei->hint_stat.eidx = 0;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 5d737e0b639a..9c549fd11fc8 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -406,6 +406,7 @@ static int exfat_find_empty_entry(struct inode *inode,
i_size_write(inode, size);
ei->i_size_ondisk += sbi->cluster_size;
ei->i_size_aligned += sbi->cluster_size;
+ ei->valid_size += sbi->cluster_size;
ei->flags = p_dir->flags;
inode->i_blocks += sbi->cluster_size >> 9;
}
@@ -558,6 +559,8 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->size = clu_size;
info->num_subdirs = EXFAT_MIN_SUBDIR;
}
+ info->valid_size = info->size;
+
memset(&info->crtime, 0, sizeof(info->crtime));
memset(&info->mtime, 0, sizeof(info->mtime));
memset(&info->atime, 0, sizeof(info->atime));
@@ -660,6 +663,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ info->size = le64_to_cpu(ep2->dentry.stream.size);
if (info->size == 0) {
info->flags = ALLOC_NO_FAT_CHAIN;
info->start_clu = EXFAT_EOF_CLUSTER;
@@ -1288,6 +1293,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
}
i_size_write(new_inode, 0);
+ new_ei->valid_size = 0;
new_ei->start_clu = EXFAT_EOF_CLUSTER;
new_ei->flags = ALLOC_NO_FAT_CHAIN;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 65f702b1da5b..8346ab9534c1 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -325,6 +325,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
struct ext2_dir_entry_2 * dir_de = NULL;
struct folio * old_folio;
struct ext2_dir_entry_2 * old_de;
+ bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
if (flags & ~RENAME_NOREPLACE)
@@ -342,7 +343,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (IS_ERR(old_de))
return PTR_ERR(old_de);
- if (S_ISDIR(old_inode->i_mode)) {
+ if (old_is_dir && old_dir != new_dir) {
err = -EIO;
dir_de = ext2_dotdot(old_inode, &dir_folio);
if (!dir_de)
@@ -354,7 +355,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
struct ext2_dir_entry_2 *new_de;
err = -ENOTEMPTY;
- if (dir_de && !ext2_empty_dir (new_inode))
+ if (old_is_dir && !ext2_empty_dir(new_inode))
goto out_dir;
new_de = ext2_find_entry(new_dir, &new_dentry->d_name,
@@ -368,14 +369,14 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
- if (dir_de)
+ if (old_is_dir)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
err = ext2_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
- if (dir_de)
+ if (old_is_dir)
inode_inc_link_count(new_dir);
}
@@ -387,7 +388,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
mark_inode_dirty(old_inode);
err = ext2_delete_entry(old_de, old_folio);
- if (!err && dir_de) {
+ if (!err && old_is_dir) {
if (old_dir != new_dir)
err = ext2_set_link(old_inode, dir_de, dir_folio,
new_dir, false);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d252935f9c8a..05b647e6bc19 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2388,8 +2388,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
sb = dir->i_sb;
blocksize = sb->s_blocksize;
- if (!dentry->d_name.len)
- return -EINVAL;
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
@@ -3591,10 +3589,14 @@ struct ext4_renament {
int dir_inlined;
};
-static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross)
{
int retval;
+ ent->is_dir = true;
+ if (!is_cross)
+ return 0;
+
ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
&retval, &ent->parent_de,
&ent->dir_inlined);
@@ -3612,6 +3614,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
{
int retval;
+ if (!ent->dir_bh)
+ return 0;
+
ent->parent_de->inode = cpu_to_le32(dir_ino);
BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
if (!ent->dir_inlined) {
@@ -3900,7 +3905,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
@@ -3964,7 +3969,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
- if (old.dir_bh) {
+ if (old.is_dir) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
goto end_rename;
@@ -3987,7 +3992,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (unlikely(retval))
goto end_rename;
- if (S_ISDIR(old.inode->i_mode)) {
+ if (old.is_dir) {
/*
* We disable fast commits here that's because the
* replay code is not yet capable of changing dot dot
@@ -4114,14 +4119,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_handle_sync(handle);
if (S_ISDIR(old.inode->i_mode)) {
- old.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
if (S_ISDIR(new.inode->i_mode)) {
- new.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &new);
+ retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir);
if (retval)
goto end_rename;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 6b2af514660d..531517dac079 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1036,8 +1036,10 @@ static void set_cluster_dirty(struct compress_ctx *cc)
int i;
for (i = 0; i < cc->cluster_size; i++)
- if (cc->rpages[i])
+ if (cc->rpages[i]) {
set_page_dirty(cc->rpages[i]);
+ set_page_private_gcing(cc->rpages[i]);
+ }
}
static int prepare_compress_overwrite(struct compress_ctx *cc,
@@ -1369,8 +1371,6 @@ unlock_continue:
add_compr_block_stat(inode, cc->valid_nr_cpages);
set_inode_flag(cc->inode, FI_APPEND_WRITE);
- if (cc->cluster_idx == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
f2fs_put_dnode(&dn);
if (quota_inode)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4e42b5f24deb..26e317696b33 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
}
blkaddr -= FDEV(devi).start_blk;
}
- return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM &&
+ return bdev_is_zoned(FDEV(devi).bdev) &&
f2fs_blkz_is_seq(sbi, devi, blkaddr) &&
(blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1);
}
@@ -1179,18 +1179,12 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
return 0;
}
-static void __set_data_blkaddr(struct dnode_of_data *dn)
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- struct f2fs_node *rn = F2FS_NODE(dn->node_page);
- __le32 *addr_array;
- int base = 0;
+ __le32 *addr = get_dnode_addr(dn->inode, dn->node_page);
- if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
- base = get_extra_isize(dn->inode);
-
- /* Get physical address of data block */
- addr_array = blkaddr_in_node(rn);
- addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+ dn->data_blkaddr = blkaddr;
+ addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
}
/*
@@ -1199,18 +1193,17 @@ static void __set_data_blkaddr(struct dnode_of_data *dn)
* ->node_page
* update block addresses in the node page
*/
-void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
- __set_data_blkaddr(dn);
+ __set_data_blkaddr(dn, blkaddr);
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
}
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- dn->data_blkaddr = blkaddr;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, blkaddr);
f2fs_update_read_extent_cache(dn);
}
@@ -1237,8 +1230,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
block_t blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == NULL_ADDR) {
- dn->data_blkaddr = NEW_ADDR;
- __set_data_blkaddr(dn);
+ __set_data_blkaddr(dn, NEW_ADDR);
count--;
}
}
@@ -1492,11 +1484,9 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
&sum, seg_type, NULL);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(sbi),
- old_blkaddr, old_blkaddr);
- f2fs_invalidate_compress_page(sbi, old_blkaddr);
- }
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ f2fs_invalidate_internal_cache(sbi, old_blkaddr);
+
f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
return 0;
}
@@ -1992,7 +1982,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- inode_lock(inode);
+ inode_lock_shared(inode);
maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
if (start > maxbytes) {
@@ -2112,7 +2102,7 @@ out:
if (ret == 1)
ret = 0;
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return ret;
}
@@ -2566,9 +2556,6 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
page = fio->compressed_page ? fio->compressed_page : fio->page;
- /* wait for GCed page writeback via META_MAPPING */
- f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
-
if (fscrypt_inode_uses_inline_crypto(inode))
return 0;
@@ -2755,6 +2742,10 @@ got_it:
goto out_writepage;
}
+ /* wait for GCed page writeback via META_MAPPING */
+ if (fio->post_read)
+ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
+
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
@@ -2810,8 +2801,6 @@ got_it:
f2fs_outplace_write_data(&dn, fio);
trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
out:
@@ -2894,9 +2883,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
zero_user_segment(page, offset, PAGE_SIZE);
write:
- if (f2fs_is_drop_cache(inode))
- goto out;
-
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || quota_inode) {
/*
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9043cedfa12b..65294e3b0bef 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -374,6 +374,12 @@ enum {
MAX_DPOLICY,
};
+enum {
+ DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */
+ DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */
+ DPOLICY_IO_AWARE_MAX,
+};
+
struct discard_policy {
int type; /* type of discard */
unsigned int min_interval; /* used for candidates exist */
@@ -406,6 +412,7 @@ struct discard_cmd_control {
unsigned int discard_urgent_util; /* utilization which issue discard proactively */
unsigned int discard_granularity; /* discard granularity */
unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */
+ unsigned int discard_io_aware; /* io_aware policy */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
@@ -774,8 +781,6 @@ enum {
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
- FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
- FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
FI_SKIP_WRITES, /* should skip data page writeback */
@@ -3272,22 +3277,13 @@ static inline bool f2fs_is_cow_file(struct inode *inode)
return is_inode_flag_set(inode, FI_COW_FILE);
}
-static inline bool f2fs_is_first_block_written(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
-}
-
-static inline bool f2fs_is_drop_cache(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_DROP_CACHE);
-}
-
+static inline __le32 *get_dnode_addr(struct inode *inode,
+ struct page *node_page);
static inline void *inline_data_addr(struct inode *inode, struct page *page)
{
- struct f2fs_inode *ri = F2FS_INODE(page);
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, page);
- return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]);
+ return (void *)(addr + DEF_INLINE_RESERVED_SIZE);
}
static inline int f2fs_has_inline_dentry(struct inode *inode)
@@ -3432,6 +3428,17 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
return F2FS_I(inode)->i_inline_xattr_size;
}
+static inline __le32 *get_dnode_addr(struct inode *inode,
+ struct page *node_page)
+{
+ int base = 0;
+
+ if (IS_INODE(node_page) && f2fs_has_extra_attr(inode))
+ base = get_extra_isize(inode);
+
+ return blkaddr_in_node(F2FS_NODE(node_page)) + base;
+}
+
#define f2fs_get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -3815,7 +3822,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
block_t blk_addr, sector_t *sector);
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
-void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
@@ -4606,6 +4613,13 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
}
+static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr);
+ f2fs_invalidate_compress_page(sbi, blkaddr);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 4580dfefd5e9..b58ab1157b7e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -42,11 +42,11 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
vm_fault_t ret;
ret = filemap_fault(vmf);
- if (!ret)
+ if (ret & VM_FAULT_LOCKED)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_MAPPED_READ_IO, F2FS_BLKSIZE);
- trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
+ trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret);
return ret;
}
@@ -59,26 +59,29 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct dnode_of_data dn;
bool need_alloc = true;
int err = 0;
+ vm_fault_t ret;
if (unlikely(IS_IMMUTABLE(inode)))
return VM_FAULT_SIGBUS;
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
- return VM_FAULT_SIGBUS;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ err = -EIO;
+ goto out;
+ }
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
- goto err;
+ goto out;
}
if (!f2fs_is_checkpoint_ready(sbi)) {
err = -ENOSPC;
- goto err;
+ goto out;
}
err = f2fs_convert_inline_inode(inode);
if (err)
- goto err;
+ goto out;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
@@ -86,7 +89,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (ret < 0) {
err = ret;
- goto err;
+ goto out;
} else if (ret) {
need_alloc = false;
}
@@ -153,13 +156,15 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE);
f2fs_update_time(sbi, REQ_TIME);
- trace_f2fs_vm_page_mkwrite(page, DATA);
out_sem:
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
-err:
- return vmf_fs_error(err);
+out:
+ ret = vmf_fs_error(err);
+
+ trace_f2fs_vm_page_mkwrite(inode, page->index, vmf->vma->vm_flags, ret);
+ return ret;
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
@@ -418,7 +423,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
loff_t isize;
int err = 0;
- inode_lock(inode);
+ inode_lock_shared(inode);
isize = i_size_read(inode);
if (offset >= isize)
@@ -483,10 +488,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return -ENXIO;
}
@@ -557,20 +562,14 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct f2fs_node *raw_node;
int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
- int base = 0;
bool compressed_cluster = false;
int cluster_index = 0, valid_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks);
- if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
- base = get_extra_isize(dn->inode);
-
- raw_node = F2FS_NODE(dn->node_page);
- addr = blkaddr_in_node(raw_node) + base + ofs;
+ addr = get_dnode_addr(dn->inode, dn->node_page) + ofs;
/* Assumption: truncation starts with cluster */
for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
@@ -588,8 +587,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
if (blkaddr == NULL_ADDR)
continue;
- dn->data_blkaddr = NULL_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NULL_ADDR);
if (__is_valid_data_blkaddr(blkaddr)) {
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
@@ -599,9 +597,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
valid_blocks++;
}
- if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
- clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
-
f2fs_invalidate_blocks(sbi, blkaddr);
if (!released || blkaddr != COMPRESS_ADDR)
@@ -1317,6 +1312,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
+ set_page_private_gcing(pdst);
f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
@@ -1487,8 +1483,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
}
f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
@@ -2818,6 +2813,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out;
}
+ if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
ret = -EINVAL;
if (pos_in + len > src->i_size || pos_in + len < pos_in)
goto out_unlock;
@@ -3463,8 +3463,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (blkaddr != NEW_ADDR)
continue;
- dn->data_blkaddr = NULL_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NULL_ADDR);
}
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false);
@@ -3630,8 +3629,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
continue;
}
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
reserved = cluster_size - compr_blocks;
@@ -4054,6 +4052,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
f2fs_bug_on(F2FS_I_SB(inode), !page);
set_page_dirty(page);
+ set_page_private_gcing(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
}
@@ -4568,7 +4567,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
if (map.m_len > map.m_lblk)
map.m_len -= map.m_lblk;
else
- map.m_len = 0;
+ return 0;
+
map.m_may_create = true;
if (dio) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f550cdeaa663..a079eebfb080 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -46,8 +46,8 @@ static int gc_thread_func(void *data)
do {
bool sync_mode, foreground = false;
- wait_event_interruptible_timeout(*wq,
- kthread_should_stop() || freezing(current) ||
+ wait_event_freezable_timeout(*wq,
+ kthread_should_stop() ||
waitqueue_active(fggc_wq) ||
gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
@@ -59,7 +59,7 @@ static int gc_thread_func(void *data)
if (gc_th->gc_wake)
gc_th->gc_wake = false;
- if (try_to_freeze() || f2fs_readonly(sbi->sb)) {
+ if (f2fs_readonly(sbi->sb)) {
stat_other_skip_bggc_count(sbi);
continue;
}
@@ -1380,9 +1380,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
memcpy(page_address(fio.encrypted_page),
page_address(mpage), PAGE_SIZE);
f2fs_put_page(mpage, 1);
- invalidate_mapping_pages(META_MAPPING(fio.sbi),
- fio.old_blkaddr, fio.old_blkaddr);
- f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);
+
+ f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr);
set_page_dirty(fio.encrypted_page);
if (clear_page_dirty_for_io(fio.encrypted_page))
@@ -1405,8 +1404,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
@@ -1868,6 +1865,9 @@ retry:
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
gc_control->should_migrate_blocks);
+ if (seg_freed < 0)
+ goto stop;
+
total_freed += seg_freed;
if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) {
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index a9eb3891f417..c26effdce9aa 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -61,49 +61,31 @@ void f2fs_set_inode_flags(struct inode *inode)
S_ENCRYPTED|S_VERITY|S_CASEFOLD);
}
-static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+static void __get_inode_rdev(struct inode *inode, struct page *node_page)
{
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, node_page);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
- if (ri->i_addr[extra_size])
- inode->i_rdev = old_decode_dev(
- le32_to_cpu(ri->i_addr[extra_size]));
+ if (addr[0])
+ inode->i_rdev = old_decode_dev(le32_to_cpu(addr[0]));
else
- inode->i_rdev = new_decode_dev(
- le32_to_cpu(ri->i_addr[extra_size + 1]));
+ inode->i_rdev = new_decode_dev(le32_to_cpu(addr[1]));
}
}
-static int __written_first_block(struct f2fs_sb_info *sbi,
- struct f2fs_inode *ri)
+static void __set_inode_rdev(struct inode *inode, struct page *node_page)
{
- block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]);
-
- if (!__is_valid_data_blkaddr(addr))
- return 1;
- if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
- return -EFSCORRUPTED;
- }
- return 0;
-}
-
-static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
-{
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, node_page);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
if (old_valid_dev(inode->i_rdev)) {
- ri->i_addr[extra_size] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- ri->i_addr[extra_size + 1] = 0;
+ addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev));
+ addr[1] = 0;
} else {
- ri->i_addr[extra_size] = 0;
- ri->i_addr[extra_size + 1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- ri->i_addr[extra_size + 2] = 0;
+ addr[0] = 0;
+ addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev));
+ addr[2] = 0;
}
}
}
@@ -398,7 +380,6 @@ static int do_read_inode(struct inode *inode)
struct page *node_page;
struct f2fs_inode *ri;
projid_t i_projid;
- int err;
/* Check if ino is within scope */
if (f2fs_check_nid_range(sbi, inode->i_ino))
@@ -478,17 +459,7 @@ static int do_read_inode(struct inode *inode)
}
/* get rdev by using inline_info */
- __get_inode_rdev(inode, ri);
-
- if (S_ISREG(inode->i_mode)) {
- err = __written_first_block(sbi, ri);
- if (err < 0) {
- f2fs_put_page(node_page, 1);
- return err;
- }
- if (!err)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
- }
+ __get_inode_rdev(inode, node_page);
if (!f2fs_need_inode_block_update(sbi, inode->i_ino))
fi->last_disk_size = inode->i_size;
@@ -761,7 +732,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
}
- __set_inode_rdev(inode, ri);
+ __set_inode_rdev(inode, node_page);
/* deleted inode */
if (inode->i_nlink == 0)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d0053b0284d8..b3bb815fc6aa 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -459,7 +459,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct qstr dot = QSTR_INIT(".", 1);
- struct qstr dotdot = QSTR_INIT("..", 2);
struct f2fs_dir_entry *de;
struct page *page;
int err = 0;
@@ -497,13 +496,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
goto out;
}
- de = f2fs_find_entry(dir, &dotdot, &page);
+ de = f2fs_find_entry(dir, &dotdot_name, &page);
if (de)
f2fs_put_page(page, 0);
else if (IS_ERR(page))
err = PTR_ERR(page);
else
- err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+ err = f2fs_do_add_link(dir, &dotdot_name, NULL, pino, S_IFDIR);
out:
if (!err)
clear_inode_flag(dir, FI_INLINE_DOTS);
@@ -963,6 +962,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
if (unlikely(f2fs_cp_error(sbi)))
@@ -1017,7 +1017,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out;
}
- if (S_ISDIR(old_inode->i_mode)) {
+ if (old_is_dir && old_dir != new_dir) {
old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
if (!old_dir_entry) {
if (IS_ERR(old_dir_page))
@@ -1029,7 +1029,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new_inode) {
err = -ENOTEMPTY;
- if (old_dir_entry && !f2fs_empty_dir(new_inode))
+ if (old_is_dir && !f2fs_empty_dir(new_inode))
goto out_dir;
err = -ENOENT;
@@ -1054,7 +1054,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(new_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
- if (old_dir_entry)
+ if (old_is_dir)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
@@ -1074,12 +1074,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_dir;
}
- if (old_dir_entry)
+ if (old_is_dir)
f2fs_i_links_write(new_dir, true);
}
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
- if (!old_dir_entry || whiteout)
+ if (!old_is_dir || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
@@ -1105,8 +1105,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
iput(whiteout);
}
- if (old_dir_entry) {
- if (old_dir != new_dir && !whiteout)
+ if (old_is_dir) {
+ if (old_dir_entry)
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
else
@@ -1316,21 +1316,27 @@ static int f2fs_rename2(struct mnt_idmap *idmap,
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
+ trace_f2fs_rename_start(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+
err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
flags);
if (err)
return err;
- if (flags & RENAME_EXCHANGE) {
- return f2fs_cross_rename(old_dir, old_dentry,
- new_dir, new_dentry);
- }
+ if (flags & RENAME_EXCHANGE)
+ err = f2fs_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ else
/*
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(idmap, old_dir, old_dentry,
+ err = f2fs_rename(idmap, old_dir, old_dentry,
new_dir, new_dentry, flags);
+
+ trace_f2fs_rename_end(old_dentry, new_dentry, flags, err);
+ return err;
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 6c7f6a649d27..9b546fd21010 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2751,11 +2751,11 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- if (page)
+ if (page) {
memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
VALID_XATTR_BLOCK_SIZE);
-
- set_page_dirty(xpage);
+ set_page_dirty(xpage);
+ }
f2fs_put_page(xpage, 1);
return 0;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index b56d0f1078a7..d0f24ccbd1ac 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -712,7 +712,16 @@ retry_dn:
*/
if (dest == NEW_ADDR) {
f2fs_truncate_data_blocks_range(&dn, 1);
- f2fs_reserve_new_block(&dn);
+ do {
+ err = f2fs_reserve_new_block(&dn);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+ if (err)
+ goto err;
continue;
}
@@ -720,12 +729,14 @@ retry_dn:
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
- err = f2fs_reserve_new_block(&dn);
- while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION))
+ do {
err = f2fs_reserve_new_block(&dn);
- /* We should not get -ENOSPC */
- f2fs_bug_on(sbi, err);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
if (err)
goto err;
}
@@ -906,6 +917,8 @@ skip:
if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
f2fs_sb_has_blkzoned(sbi)) {
err = f2fs_fix_curseg_write_pointer(sbi);
+ if (!err)
+ err = f2fs_check_write_pointer(sbi);
ret = err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 727d016318f9..4c8836ded90f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1172,7 +1172,10 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->min_interval = dcc->min_discard_issue_time;
dpolicy->mid_interval = dcc->mid_discard_issue_time;
dpolicy->max_interval = dcc->max_discard_issue_time;
- dpolicy->io_aware = true;
+ if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE)
+ dpolicy->io_aware = true;
+ else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE)
+ dpolicy->io_aware = false;
dpolicy->sync = false;
dpolicy->ordered = true;
if (utilization(sbi) > dcc->discard_urgent_util) {
@@ -1380,7 +1383,8 @@ static void __insert_discard_cmd(struct f2fs_sb_info *sbi,
p = &(*p)->rb_right;
leftmost = false;
} else {
- f2fs_bug_on(sbi, 1);
+ /* Let's skip to add, if exists */
+ return;
}
}
@@ -1883,9 +1887,8 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
- wait_event_interruptible_timeout(*q,
- kthread_should_stop() || freezing(current) ||
- dcc->discard_wake,
+ wait_event_freezable_timeout(*q,
+ kthread_should_stop() || dcc->discard_wake,
msecs_to_jiffies(wait_ms));
if (sbi->gc_mode == GC_URGENT_HIGH ||
@@ -1903,8 +1906,6 @@ static int issue_discard_thread(void *data)
if (atomic_read(&dcc->queued_discard))
__wait_all_discard_cmd(sbi, NULL);
- if (try_to_freeze())
- continue;
if (f2fs_readonly(sbi->sb))
continue;
if (kthread_should_stop())
@@ -2274,6 +2275,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->discard_io_aware_gran = MAX_PLIST_NUM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
+ dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
@@ -2495,8 +2497,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
return;
- invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
- f2fs_invalidate_compress_page(sbi, addr);
+ f2fs_invalidate_internal_cache(sbi, addr);
/* add it into sit main buffer */
down_write(&sit_i->sentry_lock);
@@ -3557,11 +3558,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
- if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(fio->sbi),
- fio->old_blkaddr, fio->old_blkaddr);
- f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
- }
+ if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
+ f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio);
@@ -3757,9 +3755,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
update_sit_entry(sbi, new_blkaddr, 1);
}
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(sbi),
- old_blkaddr, old_blkaddr);
- f2fs_invalidate_compress_page(sbi, old_blkaddr);
+ f2fs_invalidate_internal_cache(sbi, old_blkaddr);
if (!from_gc)
update_segment_mtime(sbi, old_blkaddr, 0);
update_sit_entry(sbi, old_blkaddr, -1);
@@ -4865,99 +4861,56 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
struct blk_zone *zone)
{
- unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
- block_t zone_block, wp_block, last_valid_block;
+ unsigned int zone_segno;
+ block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
- int i, s, b, ret;
- struct seg_entry *se;
+ int ret;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
- wp_segno = GET_SEGNO(sbi, wp_block);
- wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
zone_segno = GET_SEGNO(sbi, zone_block);
- zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
-
- if (zone_segno >= MAIN_SEGS(sbi))
- return 0;
/*
* Skip check of zones cursegs point to, since
* fix_curseg_write_pointer() checks them.
*/
- for (i = 0; i < NO_CHECK_TYPE; i++)
- if (zone_secno == GET_SEC_FROM_SEG(sbi,
- CURSEG_I(sbi, i)->segno))
- return 0;
+ if (zone_segno >= MAIN_SEGS(sbi) ||
+ IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno)))
+ return 0;
/*
- * Get last valid block of the zone.
+ * Get # of valid block of the zone.
*/
- last_valid_block = zone_block - 1;
- for (s = sbi->segs_per_sec - 1; s >= 0; s--) {
- segno = zone_segno + s;
- se = get_seg_entry(sbi, segno);
- for (b = sbi->blocks_per_seg - 1; b >= 0; b--)
- if (f2fs_test_bit(b, se->cur_valid_map)) {
- last_valid_block = START_BLOCK(sbi, segno) + b;
- break;
- }
- if (last_valid_block >= zone_block)
- break;
- }
+ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
- /*
- * When safely unmounted in the previous mount, we can trust write
- * pointers. Otherwise, finish zones.
- */
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * The write pointer matches with the valid blocks or
- * already points to the end of the zone.
- */
- if ((last_valid_block + 1 == wp_block) ||
- (zone->wp == zone->start + zone->len))
- return 0;
- }
+ if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) ||
+ (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL))
+ return 0;
- if (last_valid_block + 1 == zone_block) {
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * If there is no valid block in the zone and if write
- * pointer is not at zone start, reset the write
- * pointer.
- */
- f2fs_notice(sbi,
- "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: wp[0x%x,0x%x]",
- wp_segno, wp_blkoff);
- }
+ if (!valid_block_cnt) {
+ f2fs_notice(sbi, "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: cond[0x%x]",
+ zone->cond);
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
fdev->path, ret);
-
return ret;
}
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * If there are valid blocks and the write pointer doesn't match
- * with them, we need to report the inconsistency and fill
- * the zone till the end to close the zone. This inconsistency
- * does not cause write error because the zone will not be
- * selected for write operation until it get discarded.
- */
- f2fs_notice(sbi, "Valid blocks are not aligned with write "
- "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]",
- GET_SEGNO(sbi, last_valid_block),
- GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
- wp_segno, wp_blkoff);
- }
+ /*
+ * If there are valid blocks and the write pointer doesn't match
+ * with them, we need to report the inconsistency and fill
+ * the zone till the end to close the zone. This inconsistency
+ * does not cause write error because the zone will not be
+ * selected for write operation until it get discarded.
+ */
+ f2fs_notice(sbi, "Valid blocks are not aligned with write "
+ "pointer: valid block[0x%x,0x%x] cond[0x%x]",
+ zone_segno, valid_block_cnt, zone->cond);
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
zone->start, zone->len, GFP_NOFS);
@@ -5048,15 +5001,18 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
"curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno,
cs->next_blkoff, wp_segno, wp_blkoff);
- } else {
- f2fs_notice(sbi, "Not successfully unmounted in the previous "
- "mount");
}
- f2fs_notice(sbi, "Assign new section to curseg[%d]: "
- "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
+ /* Allocate a new section if it's not new. */
+ if (cs->next_blkoff) {
+ unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff;
- f2fs_allocate_new_section(sbi, type, true);
+ f2fs_allocate_new_section(sbi, type, true);
+ f2fs_notice(sbi, "Assign new section to curseg[%d]: "
+ "[0x%x,0x%x] -> [0x%x,0x%x]",
+ type, old_segno, old_blkoff,
+ cs->segno, cs->next_blkoff);
+ }
/* check consistency of the zone curseg pointed to */
if (check_zone_write_pointer(sbi, zbd, &zone))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d66e0692ac02..d45ab0992ae5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1422,11 +1422,6 @@ default_check:
}
}
- if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "LFS is not compatible with checkpoint=disable");
- return -EINVAL;
- }
-
if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "LFS is not compatible with ATGC");
return -EINVAL;
@@ -3361,6 +3356,14 @@ loff_t max_file_blocks(struct inode *inode)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
+ /*
+ * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with
+ * a 4K crypto data unit, we must restrict the max filesize to what can
+ * fit within U32_MAX + 1 data units.
+ */
+
+ result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS);
+
return result;
}
@@ -4279,24 +4282,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->aligned_blksize = false;
#ifdef CONFIG_BLK_DEV_ZONED
- if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
- !f2fs_sb_has_blkzoned(sbi)) {
- f2fs_err(sbi, "Zoned block device feature not enabled");
- return -EINVAL;
- }
- if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (bdev_is_zoned(FDEV(i).bdev)) {
+ if (!f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_err(sbi, "Zoned block device feature not enabled");
+ return -EINVAL;
+ }
if (init_blkz_info(sbi, i)) {
f2fs_err(sbi, "Failed to initialize F2FS blkzone information");
return -EINVAL;
}
if (max_devices == 1)
break;
- f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)",
i, FDEV(i).path,
FDEV(i).total_segments,
- FDEV(i).start_blk, FDEV(i).end_blk,
- bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
- "Host-aware" : "Host-managed");
+ FDEV(i).start_blk, FDEV(i).end_blk);
continue;
}
#endif
@@ -4743,7 +4743,7 @@ try_onemore:
#ifdef CONFIG_QUOTA
f2fs_recover_quota_end(sbi, quota_enabled);
#endif
-
+reset_checkpoint:
/*
* If the f2fs is not readonly and fsync data recovery succeeds,
* check zoned block devices' write pointer consistency.
@@ -4754,7 +4754,6 @@ try_onemore:
goto free_meta;
}
-reset_checkpoint:
f2fs_init_inmem_curseg(sbi);
/* f2fs_recover_fsync_data() cleared this already */
@@ -4881,6 +4880,7 @@ free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
+ sb->s_fs_info = NULL;
/* give only one another chance */
if (retry_cnt > 0 && skip_recovery) {
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 417fae96890f..a7ec55c7bb20 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -143,6 +143,33 @@ static ssize_t pending_discard_show(struct f2fs_attr *a,
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
+static ssize_t issued_discard_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
+ &SM_I(sbi)->dcc_info->issued_discard));
+}
+
+static ssize_t queued_discard_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
+ &SM_I(sbi)->dcc_info->queued_discard));
+}
+
+static ssize_t undiscard_blks_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%u\n",
+ SM_I(sbi)->dcc_info->undiscard_blks);
+}
+
static ssize_t gc_mode_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -516,6 +543,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "discard_io_aware")) {
+ if (t >= DPOLICY_IO_AWARE_MAX)
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
@@ -734,6 +768,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "dir_level")) {
+ if (t > MAX_DIR_HASH_DEPTH)
+ return -EINVAL;
+ sbi->dir_level = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -926,6 +967,7 @@ DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran);
DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util);
DCC_INFO_GENERAL_RW_ATTR(discard_granularity);
DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard);
+DCC_INFO_GENERAL_RW_ATTR(discard_io_aware);
/* NM_INFO ATTR */
NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks);
@@ -1074,6 +1116,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
ATTR_LIST(max_ordered_discard),
+ ATTR_LIST(discard_io_aware),
ATTR_LIST(pending_discard),
ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
@@ -1197,9 +1240,16 @@ ATTRIBUTE_GROUPS(f2fs_feat);
F2FS_GENERAL_RO_ATTR(sb_status);
F2FS_GENERAL_RO_ATTR(cp_status);
+F2FS_GENERAL_RO_ATTR(issued_discard);
+F2FS_GENERAL_RO_ATTR(queued_discard);
+F2FS_GENERAL_RO_ATTR(undiscard_blks);
+
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
ATTR_LIST(cp_status),
+ ATTR_LIST(issued_discard),
+ ATTR_LIST(queued_discard),
+ ATTR_LIST(undiscard_blks),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_stat);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 47e88b4d4e7d..f290fe9327c4 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -660,11 +660,14 @@ retry:
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
if (!F2FS_I(inode)->i_xattr_nid) {
+ error = f2fs_recover_xattr_data(inode, NULL);
f2fs_notice(F2FS_I_SB(inode),
- "recover xattr in inode (%lu)", inode->i_ino);
- f2fs_recover_xattr_data(inode, NULL);
- kfree(base_addr);
- goto retry;
+ "recover xattr in inode (%lu), error(%d)",
+ inode->i_ino, error);
+ if (!error) {
+ kfree(base_addr);
+ goto retry;
+ }
}
f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr",
inode->i_ino);
@@ -754,6 +757,12 @@ retry:
memcpy(pval, value, size);
last->e_value_size = cpu_to_le16(size);
new_hsize += newsize;
+ /*
+ * Explicitly add the null terminator. The unused xattr space
+ * is supposed to always be zeroed, which would make this
+ * unnecessary, but don't depend on that.
+ */
+ *(u32 *)((u8 *)last + newsize) = 0;
}
error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
diff --git a/fs/file_table.c b/fs/file_table.c
index 3ba764d73fc9..b991f90571b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -130,7 +130,6 @@ static struct ctl_table fs_stat_sysctls[] = {
.extra1 = &sysctl_nr_open_min,
.extra2 = &sysctl_nr_open_max,
},
- { }
};
static int __init init_fs_stat_sysctls(void)
@@ -317,9 +316,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
const struct file_operations *fops)
{
- static const struct dentry_operations anon_ops = {
- .d_dname = simple_dname
- };
struct qstr this = QSTR_INIT(name, strlen(name));
struct path path;
struct file *file;
@@ -327,8 +323,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
if (!path.dentry)
return ERR_PTR(-ENOMEM);
- if (!mnt->mnt_sb->s_d_op)
- d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
file = alloc_file(&path, flags, fops);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1767493dffda..3d84fcc471c6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_FSCACHE_WB;
- wbc->unpinned_fscache_wb = true;
- dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+ inode->i_state &= ~I_PINNING_NETFS_WB;
+ wbc->unpinned_netfs_wb = true;
+ dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
}
@@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (ret == 0)
ret = err;
}
- wbc->unpinned_fscache_wb = false;
+ wbc->unpinned_netfs_wb = false;
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
deleted file mode 100644
index b313a978ae0a..000000000000
--- a/fs/fscache/Kconfig
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config FSCACHE
- tristate "General filesystem local caching manager"
- select NETFS_SUPPORT
- help
- This option enables a generic filesystem caching manager that can be
- used by various network and other filesystems to cache data locally.
- Different sorts of caches can be plugged in, depending on the
- resources available.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_STATS
- bool "Gather statistical information on local caching"
- depends on FSCACHE && PROC_FS
- select NETFS_STATS
- help
- This option causes statistical information to be gathered on local
- caching and exported through file:
-
- /proc/fs/fscache/stats
-
- The gathering of statistics adds a certain amount of overhead to
- execution as there are a quite a few stats gathered, and on a
- multi-CPU system these may be on cachelines that keep bouncing
- between CPUs. On the other hand, the stats are very useful for
- debugging purposes. Saying 'Y' here is recommended.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_DEBUG
- bool "Debug FS-Cache"
- depends on FSCACHE
- help
- This permits debugging to be dynamically enabled in the local caching
- management module. If this is set, the debugging output may be
- enabled by setting bits in /sys/modules/fscache/parameter/debug.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
deleted file mode 100644
index afb090ea16c4..000000000000
--- a/fs/fscache/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for general filesystem caching code
-#
-
-fscache-y := \
- cache.o \
- cookie.o \
- io.o \
- main.o \
- volume.o
-
-fscache-$(CONFIG_PROC_FS) += proc.o
-fscache-$(CONFIG_FSCACHE_STATS) += stats.o
-
-obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
deleted file mode 100644
index 1336f517e9b1..000000000000
--- a/fs/fscache/internal.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Internal definitions for FS-Cache
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) "FS-Cache: " fmt
-
-#include <linux/slab.h>
-#include <linux/fscache-cache.h>
-#include <trace/events/fscache.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-
-/*
- * cache.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_caches_seq_ops;
-#endif
-bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
-void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
-
-static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
-{
- return smp_load_acquire(&cache->state);
-}
-
-static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
-{
- return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
-}
-
-static inline void fscache_set_cache_state(struct fscache_cache *cache,
- enum fscache_cache_state new_state)
-{
- smp_store_release(&cache->state, new_state);
-
-}
-
-static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
- enum fscache_cache_state old_state,
- enum fscache_cache_state new_state)
-{
- return try_cmpxchg_release(&cache->state, &old_state, new_state);
-}
-
-/*
- * cookie.c
- */
-extern struct kmem_cache *fscache_cookie_jar;
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_cookies_seq_ops;
-#endif
-extern struct timer_list fscache_cookie_lru_timer;
-
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-
-static inline void fscache_see_cookie(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
-{
- trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
- where);
-}
-
-/*
- * main.c
- */
-extern unsigned fscache_debug;
-
-extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
-
-/*
- * proc.c
- */
-#ifdef CONFIG_PROC_FS
-extern int __init fscache_proc_init(void);
-extern void fscache_proc_cleanup(void);
-#else
-#define fscache_proc_init() (0)
-#define fscache_proc_cleanup() do {} while (0)
-#endif
-
-/*
- * stats.c
- */
-#ifdef CONFIG_FSCACHE_STATS
-extern atomic_t fscache_n_volumes;
-extern atomic_t fscache_n_volumes_collision;
-extern atomic_t fscache_n_volumes_nomem;
-extern atomic_t fscache_n_cookies;
-extern atomic_t fscache_n_cookies_lru;
-extern atomic_t fscache_n_cookies_lru_expired;
-extern atomic_t fscache_n_cookies_lru_removed;
-extern atomic_t fscache_n_cookies_lru_dropped;
-
-extern atomic_t fscache_n_acquires;
-extern atomic_t fscache_n_acquires_ok;
-extern atomic_t fscache_n_acquires_oom;
-
-extern atomic_t fscache_n_invalidates;
-
-extern atomic_t fscache_n_relinquishes;
-extern atomic_t fscache_n_relinquishes_retire;
-extern atomic_t fscache_n_relinquishes_dropped;
-
-extern atomic_t fscache_n_resizes;
-extern atomic_t fscache_n_resizes_null;
-
-static inline void fscache_stat(atomic_t *stat)
-{
- atomic_inc(stat);
-}
-
-static inline void fscache_stat_d(atomic_t *stat)
-{
- atomic_dec(stat);
-}
-
-#define __fscache_stat(stat) (stat)
-
-int fscache_stats_show(struct seq_file *m, void *v);
-#else
-
-#define __fscache_stat(stat) (NULL)
-#define fscache_stat(stat) do {} while (0)
-#define fscache_stat_d(stat) do {} while (0)
-#endif
-
-/*
- * volume.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_volumes_seq_ops;
-#endif
-
-struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-bool fscache_begin_volume_access(struct fscache_volume *volume,
- struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-void fscache_create_volume(struct fscache_volume *volume, bool wait);
-
-
-/*****************************************************************************/
-/*
- * debug tracing
- */
-#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-
-#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-
-#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_FSCACHE_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (__do_kdebug(ENTER)) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (__do_kdebug(LEAVE)) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (__do_kdebug(DEBUG)) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
-/*
- * determine whether a particular optional debugging point should be logged
- * - we need to go through three steps to persuade cpp to correctly join the
- * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
- */
-#define ____do_kdebug(LEVEL, POINT) \
- unlikely((fscache_debug & \
- (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
-#define ___do_kdebug(LEVEL, POINT) \
- ____do_kdebug(LEVEL, POINT)
-#define __do_kdebug(POINT) \
- ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
-
-#define FSCACHE_DEBUG_CACHE 0
-#define FSCACHE_DEBUG_COOKIE 1
-#define FSCACHE_DEBUG_OBJECT 2
-#define FSCACHE_DEBUG_OPERATION 3
-
-#define FSCACHE_POINT_ENTER 1
-#define FSCACHE_POINT_LEAVE 2
-#define FSCACHE_POINT_DEBUG 4
-
-#ifndef FSCACHE_DEBUG_LEVEL
-#define FSCACHE_DEBUG_LEVEL CACHE
-#endif
-
-/*
- * assertions
- */
-#if 1 /* defined(__KDEBUGALL) */
-
-#define ASSERT(X) \
-do { \
- if (unlikely(!(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTCMP(X, OP, Y) \
-do { \
- if (unlikely(!((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIF(C, X) \
-do { \
- if (unlikely((C) && !(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIFCMP(C, X, OP, Y) \
-do { \
- if (unlikely((C) && !((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#else
-
-#define ASSERT(X) do {} while (0)
-#define ASSERTCMP(X, OP, Y) do {} while (0)
-#define ASSERTIF(C, X) do {} while (0)
-#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
-
-#endif /* assert or not */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 177f1f41f225..2e215e8c3c88 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -32,25 +32,21 @@
static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent = NULL;
+ struct dentry *parent;
struct gfs2_sbd *sdp;
struct gfs2_inode *dip;
- struct inode *dinode, *inode;
+ struct inode *inode;
struct gfs2_holder d_gh;
struct gfs2_inode *ip = NULL;
int error, valid = 0;
int had_lock = 0;
- if (flags & LOOKUP_RCU) {
- dinode = d_inode_rcu(READ_ONCE(dentry->d_parent));
- if (!dinode)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dinode = d_inode(parent);
- }
- sdp = GFS2_SB(dinode);
- dip = GFS2_I(dinode);
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ parent = dget_parent(dentry);
+ sdp = GFS2_SB(d_inode(parent));
+ dip = GFS2_I(d_inode(parent));
inode = d_inode(dentry);
if (inode) {
@@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
if (!had_lock) {
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
- flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
goto out;
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6bfc9383b7b8..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
WARN_ON_ONCE(!may_not_block);
return -ECHILD;
}
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- int noblock = may_not_block ? GL_NOBLOCK : 0;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
- LM_FLAG_ANY | noblock, &i_gh);
+ if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+ if (may_not_block)
+ return -ECHILD;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ea87f24c6c3f..a73d27c4dd58 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -637,12 +637,8 @@ static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
inode = hostfs_iget(ino->i_sb, name);
__putname(name);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -ENOENT)
- inode = NULL;
- else
- return ERR_CAST(inode);
- }
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
return d_splice_alias(inode, dentry);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ea5b8e57d904..671664fed307 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -340,7 +340,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
folio_unlock(folio);
- if (!folio_test_has_hwpoisoned(folio))
+ if (!folio_test_hwpoison(folio))
want = nr;
else {
/*
diff --git a/fs/inode.c b/fs/inode.c
index 99d8754a74a3..91048c4c9c9e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -129,7 +129,6 @@ static struct ctl_table inodes_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
- { }
};
static int __init init_fs_inode_sysctls(void)
@@ -1090,48 +1089,6 @@ void discard_new_inode(struct inode *inode)
EXPORT_SYMBOL(discard_new_inode);
/**
- * lock_two_inodes - lock two inodes (may be regular files but also dirs)
- *
- * Lock any non-NULL argument. The caller must make sure that if he is passing
- * in two directories, one is not ancestor of the other. Zero, one or two
- * objects may be locked by this function.
- *
- * @inode1: first inode to lock
- * @inode2: second inode to lock
- * @subclass1: inode lock subclass for the first lock obtained
- * @subclass2: inode lock subclass for the second lock obtained
- */
-void lock_two_inodes(struct inode *inode1, struct inode *inode2,
- unsigned subclass1, unsigned subclass2)
-{
- if (!inode1 || !inode2) {
- /*
- * Make sure @subclass1 will be used for the acquired lock.
- * This is not strictly necessary (no current caller cares) but
- * let's keep things consistent.
- */
- if (!inode1)
- swap(inode1, inode2);
- goto lock;
- }
-
- /*
- * If one object is directory and the other is not, we must make sure
- * to lock directory first as the other object may be its child.
- */
- if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
- if (inode1 > inode2)
- swap(inode1, inode2);
- } else if (!S_ISDIR(inode1->i_mode))
- swap(inode1, inode2);
-lock:
- if (inode1)
- inode_lock_nested(inode1, subclass1);
- if (inode2 && inode2 != inode1)
- inode_lock_nested(inode2, subclass2);
-}
-
-/**
* lock_two_nondirectories - take two i_mutexes on non-directory objects
*
* Lock any non-NULL argument. Passed objects must not be directories.
@@ -1146,7 +1103,12 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
if (inode2)
WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
- lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
+ if (inode1 > inode2)
+ swap(inode1, inode2);
+ if (inode1)
+ inode_lock(inode1);
+ if (inode2 && inode2 != inode1)
+ inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
diff --git a/fs/internal.h b/fs/internal.h
index bf2ee2e0d45d..b67406435fc0 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -197,8 +197,6 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
const struct inode *inode, vfsgid_t vfsgid);
-void lock_two_inodes(struct inode *inode1, struct inode *inode2,
- unsigned subclass1, unsigned subclass2);
/*
* fs-writeback.c
@@ -216,6 +214,11 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *)
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
+extern void shrink_dcache_for_umount(struct super_block *);
+extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
+extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ const struct qstr *name, unsigned *seq);
+extern void d_genocide(struct dentry *);
/*
* pipe.c
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 8eec84c651bf..cb3cda1390ad 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
* leafno - the number of the leaf to be updated.
* newval - the new value for the leaf.
*
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error
+ * RETURN VALUES: none
*/
static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
@@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
* get the buddy size (number of words covered) of
* the new value.
*/
-
- if ((newval - tp->dmt_budmin) > BUDMIN)
- return -EIO;
-
budsz = BUDSIZE(newval, tp->dmt_budmin);
/* try to join.
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e7..bce1d7ac95ca 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -54,9 +54,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
if (!kn)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
- return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
+ return strscpy(buf, kn->parent ? kn->name : "/", buflen);
}
/* kernfs_node_depth - compute depth from @from to @to */
@@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
*
* [3] when @kn_to is %NULL result will be "(null)"
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
size_t depth_from, depth_to, len = 0;
+ ssize_t copied;
int i, j;
if (!kn_to)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
if (kn_from == kn_to)
- return strlcpy(buf, "/", buflen);
+ return strscpy(buf, "/", buflen);
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
@@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
buf[0] = '\0';
- for (i = 0; i < depth_from; i++)
- len += strlcpy(buf + len, parent_str,
- len < buflen ? buflen - len : 0);
+ for (i = 0; i < depth_from; i++) {
+ copied = strscpy(buf + len, parent_str, buflen - len);
+ if (copied < 0)
+ return copied;
+ len += copied;
+ }
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
for (kn = kn_to, j = 0; j < i; j++)
kn = kn->parent;
- len += strlcpy(buf + len, "/",
- len < buflen ? buflen - len : 0);
- len += strlcpy(buf + len, kn->name,
- len < buflen ? buflen - len : 0);
+
+ len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
}
return len;
@@ -182,12 +184,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy().
+ * similar to strscpy().
*
* Fills buffer with "(null)" if @kn is %NULL.
*
- * Return: the length of @kn's name and if @buf isn't long enough,
- * it's filled up to @buflen-1 and nul terminated.
+ * Return: the resulting length of @buf. If @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG.
*
* This function can be called from any context.
*/
@@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
- pr_cont("(error)");
- goto out;
- }
-
- if (sz >= sizeof(kernfs_pr_cont_buf)) {
- pr_cont("(name too long)");
+ if (sz == -E2BIG)
+ pr_cont("(name too long)");
+ else
+ pr_cont("(error)");
goto out;
}
@@ -676,6 +676,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
{
struct kernfs_node *kn;
+ if (parent->mode & S_ISGID) {
+ /* this code block imitates inode_init_owner() for
+ * kernfs
+ */
+
+ if (parent->iattr)
+ gid = parent->iattr->ia_gid;
+
+ if (flags & KERNFS_DIR)
+ mode |= S_ISGID;
+ }
+
kn = __kernfs_new_node(kernfs_root(parent), parent,
name, mode, uid, gid, flags);
if (kn) {
@@ -850,16 +862,16 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
- size_t len;
+ ssize_t len;
char *p, *name;
lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
spin_lock_irq(&kernfs_pr_cont_lock);
- len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+ len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
- if (len >= sizeof(kernfs_pr_cont_buf)) {
+ if (len < 0) {
spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f0cb729e9a97..ffa4565c275a 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -447,7 +447,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
* warnings and we don't want to add spurious locking dependency
* between the two. Check whether mmap is actually implemented
* without grabbing @of->mutex by testing HAS_MMAP flag. See the
- * comment in kernfs_file_open() for more details.
+ * comment in kernfs_fop_open() for more details.
*/
if (!(of->kn->flags & KERNFS_HAS_MMAP))
return -ENODEV;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 4628edde2e7e..0c93cad0f0ac 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -125,9 +125,6 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
inode = kernfs_get_inode(sb, kn);
kernfs_put(kn);
- if (!inode)
- return ERR_PTR(-ESTALE);
-
return d_obtain_alias(inode);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index c2aa6fd4795c..eec6031b0155 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close);
* If no such element exists, NULL is returned.
*/
static struct dentry *scan_positives(struct dentry *cursor,
- struct list_head *p,
+ struct hlist_node **p,
loff_t count,
struct dentry *last)
{
struct dentry *dentry = cursor->d_parent, *found = NULL;
spin_lock(&dentry->d_lock);
- while ((p = p->next) != &dentry->d_subdirs) {
- struct dentry *d = list_entry(p, struct dentry, d_child);
+ while (*p) {
+ struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+ p = &d->d_sib.next;
// we must at least skip cursors, to avoid livelocks
if (d->d_flags & DCACHE_DENTRY_CURSOR)
continue;
@@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor,
count = 1;
}
if (need_resched()) {
- list_move(&cursor->d_child, p);
- p = &cursor->d_child;
+ if (!hlist_unhashed(&cursor->d_sib))
+ __hlist_del(&cursor->d_sib);
+ hlist_add_behind(&cursor->d_sib, &d->d_sib);
+ p = &cursor->d_sib.next;
spin_unlock(&dentry->d_lock);
cond_resched();
spin_lock(&dentry->d_lock);
@@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
inode_lock_shared(dentry->d_inode);
if (offset > 2)
- to = scan_positives(cursor, &dentry->d_subdirs,
+ to = scan_positives(cursor, &dentry->d_children.first,
offset - 2, NULL);
spin_lock(&dentry->d_lock);
+ hlist_del_init(&cursor->d_sib);
if (to)
- list_move(&cursor->d_child, &to->d_child);
- else
- list_del_init(&cursor->d_child);
+ hlist_add_behind(&cursor->d_sib, &to->d_sib);
spin_unlock(&dentry->d_lock);
dput(to);
@@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *anchor = &dentry->d_subdirs;
struct dentry *next = NULL;
- struct list_head *p;
+ struct hlist_node **p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = anchor;
- else if (!list_empty(&cursor->d_child))
- p = &cursor->d_child;
+ p = &dentry->d_children.first;
else
- return 0;
+ p = &cursor->d_sib.next;
while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
@@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
fs_umode_to_dtype(d_inode(next)->i_mode)))
break;
ctx->pos++;
- p = &next->d_child;
+ p = &next->d_sib.next;
}
spin_lock(&dentry->d_lock);
+ hlist_del_init(&cursor->d_sib);
if (next)
- list_move_tail(&cursor->d_child, &next->d_child);
- else
- list_del_init(&cursor->d_child);
+ hlist_add_before(&cursor->d_sib, &next->d_sib);
spin_unlock(&dentry->d_lock);
dput(next);
@@ -500,12 +498,11 @@ const struct file_operations simple_offset_dir_operations = {
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
- struct dentry *child = NULL;
- struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+ struct dentry *child = NULL, *d;
spin_lock(&parent->d_lock);
- while ((p = p->next) != &parent->d_subdirs) {
- struct dentry *d = container_of(p, struct dentry, d_child);
+ d = prev ? d_next_sibling(prev) : d_first_child(parent);
+ hlist_for_each_entry_from(d, d_sib) {
if (simple_positive(d)) {
spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(d))
@@ -666,7 +663,7 @@ int simple_empty(struct dentry *dentry)
int ret = 0;
spin_lock(&dentry->d_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &dentry->d_children, d_sib) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);
@@ -920,7 +917,6 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
const struct tree_descr *files)
{
struct inode *inode;
- struct dentry *root;
struct dentry *dentry;
int i;
@@ -943,8 +939,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
- root = d_make_root(inode);
- if (!root)
+ s->s_root = d_make_root(inode);
+ if (!s->s_root)
return -ENOMEM;
for (i = 0; !files->name || files->name[0]; i++, files++) {
if (!files->name)
@@ -956,13 +952,13 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
"with an index of 1!\n", __func__,
s->s_type->name);
- dentry = d_alloc_name(root, files->name);
+ dentry = d_alloc_name(s->s_root, files->name);
if (!dentry)
- goto out;
+ return -ENOMEM;
inode = new_inode(s);
if (!inode) {
dput(dentry);
- goto out;
+ return -ENOMEM;
}
inode->i_mode = S_IFREG | files->mode;
simple_inode_init_ts(inode);
@@ -970,13 +966,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_ino = i;
d_add(dentry, inode);
}
- s->s_root = root;
return 0;
-out:
- d_genocide(root);
- shrink_dcache_parent(root);
- dput(root);
- return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0d6cb3fdc0e1..ce5862482097 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -473,7 +473,6 @@ static struct ctl_table nlm_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
#endif /* CONFIG_SYSCTL */
diff --git a/fs/locks.c b/fs/locks.c
index 46d88b9e222c..cc7c117ee192 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -111,7 +111,6 @@ static struct ctl_table locks_sysctls[] = {
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_MMU */
- {}
};
static int __init init_fs_locks_sysctls(void)
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 62c313fc9a49..a224cf222570 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -26,12 +26,6 @@ const struct file_operations minix_dir_operations = {
.fsync = generic_file_fsync,
};
-static inline void dir_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -70,13 +64,14 @@ static int minix_handle_dirsync(struct inode *dir)
return err;
}
-static struct page * dir_get_page(struct inode *dir, unsigned long n)
+static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
{
struct address_space *mapping = dir->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page))
- kmap(page);
- return page;
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ *p = page;
+ return kmap_local_page(page);
}
static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
@@ -104,11 +99,11 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
- struct page *page = dir_get_page(inode, n);
+ struct page *page;
- if (IS_ERR(page))
+ kaddr = dir_get_page(inode, n, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char *)page_address(page);
p = kaddr+offset;
limit = kaddr + minix_last_byte(inode, n) - chunk_size;
for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
@@ -127,13 +122,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
unsigned l = strnlen(name, sbi->s_namelen);
if (!dir_emit(ctx, name, l,
inumber, DT_UNKNOWN)) {
- dir_put_page(page);
+ unmap_and_put_page(page, p);
return 0;
}
}
ctx->pos += chunk_size;
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return 0;
}
@@ -173,11 +168,10 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
for (n = 0; n < npages; n++) {
char *kaddr, *limit;
- page = dir_get_page(dir, n);
- if (IS_ERR(page))
+ kaddr = dir_get_page(dir, n, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char*)page_address(page);
limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
if (sbi->s_version == MINIX_V3) {
@@ -194,7 +188,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
if (namecompare(namelen, sbi->s_namelen, name, namx))
goto found;
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return NULL;
@@ -229,12 +223,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
for (n = 0; n <= npages; n++) {
char *limit, *dir_end;
- page = dir_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
+ kaddr = dir_get_page(dir, n, &page);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
lock_page(page);
- kaddr = (char*)page_address(page);
dir_end = kaddr + minix_last_byte(dir, n);
limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
@@ -262,13 +254,13 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
goto out_unlock;
}
unlock_page(page);
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + p - (char *)page_address(page);
+ pos = page_offset(page) + offset_in_page(p);
err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
if (err)
goto out_unlock;
@@ -285,8 +277,7 @@ got_it:
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
- dir_put_page(page);
-out:
+ unmap_and_put_page(page, kaddr);
return err;
out_unlock:
unlock_page(page);
@@ -296,8 +287,7 @@ out_unlock:
int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
{
struct inode *inode = page->mapping->host;
- char *kaddr = page_address(page);
- loff_t pos = page_offset(page) + (char*)de - kaddr;
+ loff_t pos = page_offset(page) + offset_in_page(de);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
unsigned len = sbi->s_dirsize;
int err;
@@ -333,7 +323,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
goto fail;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
memset(kaddr, 0, PAGE_SIZE);
if (sbi->s_version == MINIX_V3) {
@@ -353,7 +343,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
de->inode = dir->i_ino;
strcpy(de->name, "..");
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
err = minix_handle_dirsync(inode);
@@ -370,17 +360,16 @@ int minix_empty_dir(struct inode * inode)
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
- char *name;
+ char *name, *kaddr;
__u32 inumber;
for (i = 0; i < npages; i++) {
- char *p, *kaddr, *limit;
+ char *p, *limit;
- page = dir_get_page(inode, i);
- if (IS_ERR(page))
+ kaddr = dir_get_page(inode, i, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char *)page_address(page);
limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
if (sbi->s_version == MINIX_V3) {
@@ -406,12 +395,12 @@ int minix_empty_dir(struct inode * inode)
goto not_empty;
}
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return 1;
not_empty:
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
return 0;
}
@@ -421,8 +410,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
{
struct inode *dir = page->mapping->host;
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- loff_t pos = page_offset(page) +
- (char *)de-(char*)page_address(page);
+ loff_t pos = page_offset(page) + offset_in_page(de);
int err;
lock_page(page);
@@ -443,15 +431,12 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
{
- struct page *page = dir_get_page(dir, 0);
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- struct minix_dir_entry *de = NULL;
+ struct minix_dir_entry *de = dir_get_page(dir, 0, p);
- if (!IS_ERR(page)) {
- de = minix_next_entry(page_address(page), sbi);
- *p = page;
- }
- return de;
+ if (!IS_ERR(de))
+ return minix_next_entry(de, sbi);
+ return NULL;
}
ino_t minix_inode_by_name(struct dentry *dentry)
@@ -469,7 +454,7 @@ ino_t minix_inode_by_name(struct dentry *dentry)
res = ((minix3_dirent *) de)->inode;
else
res = de->inode;
- dir_put_page(page);
+ unmap_and_put_page(page, de);
}
return res;
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 114084d5636a..d6031acc34f0 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -149,8 +149,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
if (!de)
return -ENOENT;
err = minix_delete_entry(de, page);
- kunmap(page);
- put_page(page);
+ unmap_and_put_page(page, de);
if (err)
return err;
@@ -242,13 +241,10 @@ static int minix_rename(struct mnt_idmap *idmap,
inode_dec_link_count(old_dir);
}
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ unmap_and_put_page(dir_page, dir_de);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ unmap_and_put_page(old_page, old_de);
out:
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index faae721e4d63..4e0de939fea1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1071,7 +1071,6 @@ static struct ctl_table namei_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
- { }
};
static int __init init_fs_namei_sysctls(void)
@@ -2573,13 +2572,13 @@ static int filename_parentat(int dfd, struct filename *name,
}
/* does lookup, returns the object with parent locked */
-static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
+static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
struct dentry *d;
struct qstr last;
int type, error;
- error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
+ error = filename_parentat(dfd, name, 0, path, &last, &type);
if (error)
return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
@@ -2598,12 +2597,22 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
struct dentry *kern_path_locked(const char *name, struct path *path)
{
struct filename *filename = getname_kernel(name);
- struct dentry *res = __kern_path_locked(filename, path);
+ struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
putname(filename);
return res;
}
+struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+{
+ struct filename *filename = getname(name);
+ struct dentry *res = __kern_path_locked(dfd, filename, path);
+
+ putname(filename);
+ return res;
+}
+EXPORT_SYMBOL(user_path_locked_at);
+
int kern_path(const char *name, unsigned int flags, struct path *path)
{
struct filename *filename = getname_kernel(name);
@@ -3014,27 +3023,37 @@ static inline int may_create(struct mnt_idmap *idmap,
return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
+// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
- struct dentry *p;
+ struct dentry *p = p1, *q = p2, *r;
- p = d_ancestor(p2, p1);
- if (p) {
+ while ((r = p->d_parent) != p2 && r != p)
+ p = r;
+ if (r == p2) {
+ // p is a child of p2 and an ancestor of p1 or p1 itself
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
return p;
}
-
- p = d_ancestor(p1, p2);
- if (p) {
+ // p is the root of connected component that contains p1
+ // p2 does not occur on the path from p to p1
+ while ((r = q->d_parent) != p1 && r != p && r != q)
+ q = r;
+ if (r == p1) {
+ // q is a child of p1 and an ancestor of p2 or p2 itself
inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
- return p;
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ return q;
+ } else if (likely(r == p)) {
+ // both p2 and p1 are descendents of p
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ return NULL;
+ } else { // no common ancestor at the time we'd been called
+ mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+ return ERR_PTR(-EXDEV);
}
-
- lock_two_inodes(p1->d_inode, p2->d_inode,
- I_MUTEX_PARENT, I_MUTEX_PARENT2);
- return NULL;
}
/*
@@ -4713,11 +4732,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
*
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
- * That's where 4.4 screws up. Current fix: serialization on
+ * That's where 4.4BSD screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
- * c) we have to lock _four_ objects - parents and victim (if it exists),
- * and source.
+ * c) we may have to lock up to _four_ objects - parents and victim (if it exists),
+ * and source (if it's a non-directory or a subdirectory that moves to
+ * different parent).
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
@@ -4749,6 +4769,7 @@ int vfs_rename(struct renamedata *rd)
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
struct name_snapshot old_name;
+ bool lock_old_subdir, lock_new_subdir;
if (source == target)
return 0;
@@ -4802,15 +4823,32 @@ int vfs_rename(struct renamedata *rd)
take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
/*
- * Lock all moved children. Moved directories may need to change parent
- * pointer so they need the lock to prevent against concurrent
- * directory changes moving parent pointer. For regular files we've
- * historically always done this. The lockdep locking subclasses are
- * somewhat arbitrary but RENAME_EXCHANGE in particular can swap
- * regular files and directories so it's difficult to tell which
- * subclasses to use.
+ * Lock children.
+ * The source subdirectory needs to be locked on cross-directory
+ * rename or cross-directory exchange since its parent changes.
+ * The target subdirectory needs to be locked on cross-directory
+ * exchange due to parent change and on any rename due to becoming
+ * a victim.
+ * Non-directories need locking in all cases (for NFS reasons);
+ * they get locked after any subdirectories (in inode address order).
+ *
+ * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
+ * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
*/
- lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
+ lock_old_subdir = new_dir != old_dir;
+ lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
+ if (is_dir) {
+ if (lock_old_subdir)
+ inode_lock_nested(source, I_MUTEX_CHILD);
+ if (target && (!new_is_dir || lock_new_subdir))
+ inode_lock(target);
+ } else if (new_is_dir) {
+ if (lock_new_subdir)
+ inode_lock_nested(target, I_MUTEX_CHILD);
+ inode_lock(source);
+ } else {
+ lock_two_nondirectories(source, target);
+ }
error = -EPERM;
if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
@@ -4858,8 +4896,9 @@ int vfs_rename(struct renamedata *rd)
d_exchange(old_dentry, new_dentry);
}
out:
- inode_unlock(source);
- if (target)
+ if (!is_dir || lock_old_subdir)
+ inode_unlock(source);
+ if (target && (!new_is_dir || lock_new_subdir))
inode_unlock(target);
dput(new_dentry);
if (!error) {
@@ -4930,6 +4969,10 @@ retry:
retry_deleg:
trap = lock_rename(new_path.dentry, old_path.dentry);
+ if (IS_ERR(trap)) {
+ error = PTR_ERR(trap);
+ goto exit_lock_rename;
+ }
old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
lookup_flags);
@@ -4997,6 +5040,7 @@ exit4:
dput(old_dentry);
exit3:
unlock_rename(new_path.dentry, old_path.dentry);
+exit_lock_rename:
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/fs/namespace.c b/fs/namespace.c
index 95b2fff91f67..437f60e96d40 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5042,13 +5042,12 @@ static struct mount *listmnt_next(struct mount *curr)
return node_to_mount(rb_next(&curr->mnt_node));
}
-static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
- u64 __user *buf, size_t bufsize,
- const struct path *root)
+static ssize_t do_listmount(struct mount *first, struct path *orig,
+ u64 mnt_parent_id, u64 __user *mnt_ids,
+ size_t nr_mnt_ids, const struct path *root)
{
struct mount *r;
- ssize_t ctr;
- int err;
+ ssize_t ret;
/*
* Don't trigger audit denials. We just want to determine what
@@ -5058,50 +5057,57 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
return -EPERM;
- err = security_sb_statfs(orig->dentry);
- if (err)
- return err;
+ ret = security_sb_statfs(orig->dentry);
+ if (ret)
+ return ret;
- for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) {
- if (r->mnt_id_unique == mnt_id)
+ for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+ if (r->mnt_id_unique == mnt_parent_id)
continue;
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
- ctr = array_index_nospec(ctr, bufsize);
- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, mnt_ids))
return -EFAULT;
- if (check_add_overflow(ctr, 1, &ctr))
- return -ERANGE;
+ mnt_ids++;
+ nr_mnt_ids--;
+ ret++;
}
- return ctr;
+ return ret;
}
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
- u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
+ mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mnt_id_req kreq;
struct mount *first;
struct path root, orig;
- u64 mnt_id, last_mnt_id;
+ u64 mnt_parent_id, last_mnt_id;
+ const size_t maxcount = (size_t)-1 >> 3;
ssize_t ret;
if (flags)
return -EINVAL;
+ if (unlikely(nr_mnt_ids > maxcount))
+ return -EFAULT;
+
+ if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
+ return -EFAULT;
+
ret = copy_mnt_id_req(req, &kreq);
if (ret)
return ret;
- mnt_id = kreq.mnt_id;
+ mnt_parent_id = kreq.mnt_id;
last_mnt_id = kreq.param;
down_read(&namespace_sem);
get_fs_root(current->fs, &root);
- if (mnt_id == LSMT_ROOT) {
+ if (mnt_parent_id == LSMT_ROOT) {
orig = root;
} else {
ret = -ENOENT;
- orig.mnt = lookup_mnt_in_ns(mnt_id, ns);
+ orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
goto err;
orig.dentry = orig.mnt->mnt_root;
@@ -5111,7 +5117,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
else
first = mnt_find_id_at(ns, last_mnt_id + 1);
- ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root);
+ ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
err:
path_put(&root);
up_read(&namespace_sem);
@@ -5447,7 +5453,6 @@ static struct ctl_table fs_namespace_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
- { }
};
static int __init init_fs_namespace_sysctls(void)
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index b4db21022cb4..bec805e0c44c 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -21,3 +21,42 @@ config NETFS_STATS
multi-CPU system these may be on cachelines that keep bouncing
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+
+config FSCACHE
+ bool "General filesystem local caching manager"
+ depends on NETFS_SUPPORT
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ select NETFS_STATS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 386d6fb92793..d4d1d799819e 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,11 +2,29 @@
netfs-y := \
buffered_read.o \
+ buffered_write.o \
+ direct_read.o \
+ direct_write.o \
io.o \
iterator.o \
+ locking.o \
main.o \
- objects.o
+ misc.o \
+ objects.o \
+ output.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
-obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
+netfs-$(CONFIG_FSCACHE) += \
+ fscache_cache.o \
+ fscache_cookie.o \
+ fscache_io.o \
+ fscache_main.o \
+ fscache_volume.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+netfs-$(CONFIG_FSCACHE) += fscache_proc.o
+endif
+netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) += netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c439..3298c29b5548 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -16,6 +16,7 @@
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
@@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
break;
}
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
folio_started = true;
}
@@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!pg_failed) {
flush_dcache_folio(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
+ if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
@@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
}
}
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq))
return;
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+ rreq->start, rreq->len);
+
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
@@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl)
;
netfs_begin_read(rreq, false);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
int ret;
- _enter("%lx", folio_index(folio));
+ _enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio)
goto alloc_error;
}
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto discard;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
- return netfs_begin_read(rreq, true);
+
+ /* Set up the output buffer */
+ if (folio_test_dirty(folio)) {
+ /* Handle someone trying to read from an unflushed streaming
+ * write. We fiddle the buffer so that a gap at the beginning
+ * and/or a gap at the end get copied to, but the middle is
+ * discarded.
+ */
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink)
+ goto discard;
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ } else {
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+ }
+
+ ret = netfs_begin_read(rreq, true);
+ if (sink)
+ folio_put(sink);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
@@ -387,14 +460,12 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio_index(folio);
+ rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
@@ -405,6 +476,10 @@ retry:
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
@@ -413,6 +488,7 @@ retry:
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
@@ -434,3 +510,124 @@ error:
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t flen = folio_size(folio);
+ int ret;
+
+ _enter("%zx @%llx", flen, start);
+
+ ret = -ENOMEM;
+
+ rreq = netfs_alloc_request(mapping, file, start, flen,
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+
+ rreq->no_unlock_folio = folio->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
+ ret = netfs_begin_read(rreq, true);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+ return -EINVAL;
+
+ ret = netfs_start_io_read(inode);
+ if (ret == 0) {
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 000000000000..a3059b3168fd
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,1254 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method. Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
+ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
+ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
+ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
+ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
+ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
+ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
+};
+
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+ if (netfs_group && !folio_get_private(folio))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
+/*
+ * Decide how we should modify a folio. We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it. If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+ struct file *file,
+ struct folio *folio,
+ void *netfs_group,
+ size_t flen,
+ size_t offset,
+ size_t len,
+ bool maybe_trouble)
+{
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ loff_t pos = folio_file_pos(folio);
+
+ _enter("");
+
+ if (netfs_folio_group(folio) != netfs_group)
+ return NETFS_FLUSH_CONTENT;
+
+ if (folio_test_uptodate(folio))
+ return NETFS_FOLIO_IS_UPTODATE;
+
+ if (pos >= ctx->zero_point)
+ return NETFS_MODIFY_AND_CLEAR;
+
+ if (!maybe_trouble && offset == 0 && len >= flen)
+ return NETFS_WHOLE_FOLIO_MODIFY;
+
+ if (file->f_mode & FMODE_READ)
+ goto no_write_streaming;
+ if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ goto no_write_streaming;
+
+ if (netfs_is_cache_enabled(ctx)) {
+ /* We don't want to get a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled.
+ */
+ if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+ goto no_write_streaming;
+ }
+
+ if (!finfo)
+ return NETFS_STREAMING_WRITE;
+
+ /* We can continue a streaming write only if it continues on from the
+ * previous. If it overlaps, we must flush lest we suffer a partial
+ * copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len)
+ return NETFS_STREAMING_WRITE_CONT;
+ return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ return NETFS_FLUSH_CONTENT;
+ }
+ return NETFS_JUST_PREFETCH;
+}
+
+/*
+ * Grab a folio for writing and lock it. Attempt to allocate as large a folio
+ * as possible to hold as much of the remaining length as possible in one go.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+ loff_t pos, size_t part)
+{
+ pgoff_t index = pos / PAGE_SIZE;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+ if (mapping_large_folio_support(mapping))
+ fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct netfs_inode *ctx = netfs_inode(inode);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .for_sync = true,
+ .nr_to_write = LONG_MAX,
+ .range_start = iocb->ki_pos,
+ .range_end = iocb->ki_pos + iter->count,
+ };
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_folio *finfo;
+ struct folio *folio;
+ enum netfs_how_to_modify howto;
+ enum netfs_folio_trace trace;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ ssize_t written = 0, ret;
+ loff_t i_size, pos = iocb->ki_pos, from, to;
+ size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ bool maybe_trouble = false;
+
+ if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+ iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ ) {
+ if (pos < i_size_read(inode)) {
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+ wreq = netfs_begin_writethrough(iocb, iter->count);
+ if (IS_ERR(wreq)) {
+ wbc_detach_inode(&wbc);
+ ret = PTR_ERR(wreq);
+ wreq = NULL;
+ goto out;
+ }
+ if (!is_sync_kiocb(iocb))
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_buffered_write;
+ }
+
+ do {
+ size_t flen;
+ size_t offset; /* Offset into pagecache folio */
+ size_t part; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
+ offset = pos & (max_chunk - 1);
+ part = min(max_chunk - offset, iov_iter_count(iter));
+
+ /* Bring in the user pages that we will copy from _first_ lest
+ * we hit a nasty deadlock on copying from the same page as
+ * we're writing to, without it being marked uptodate.
+ *
+ * Not only is this an optimisation, but it is also required to
+ * check that the address is actually valid, when atomic
+ * usercopies are used below.
+ *
+ * We rely on the page being held onto long enough by the LRU
+ * that we can grab it below if this causes it to be read.
+ */
+ ret = -EFAULT;
+ if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+ break;
+
+ folio = netfs_grab_folio_for_write(mapping, pos, part);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ flen = folio_size(folio);
+ offset = pos & (flen - 1);
+ part = min_t(size_t, flen - offset, part);
+
+ if (signal_pending(current)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
+ /* See if we need to prefetch the area we're going to modify.
+ * We need to do this before we get a lock on the folio in case
+ * there's more than one writer competing for the same cache
+ * block.
+ */
+ howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+ flen, offset, part, maybe_trouble);
+ _debug("howto %u", howto);
+ switch (howto) {
+ case NETFS_JUST_PREFETCH:
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ break;
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ case NETFS_STREAMING_WRITE_CONT:
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, 0, offset);
+ break;
+ case NETFS_STREAMING_WRITE:
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ break;
+ case NETFS_FLUSH_CONTENT:
+ trace_netfs_folio(folio, netfs_flush_content);
+ from = folio_pos(folio);
+ to = from + folio_size(folio) - 1;
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, from, to);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+ flush_dcache_folio(folio);
+
+ /* Deal with a (partially) failed copy */
+ if (copied == 0) {
+ ret = -EFAULT;
+ goto error_folio_unlock;
+ }
+
+ trace = (enum netfs_folio_trace)howto;
+ switch (howto) {
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_JUST_PREFETCH:
+ netfs_set_group(folio, netfs_group);
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, offset + copied, flen);
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ if (unlikely(copied < part)) {
+ maybe_trouble = true;
+ iov_iter_revert(iter, copied);
+ copied = 0;
+ goto retry;
+ }
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_STREAMING_WRITE:
+ if (offset == 0 && copied == flen) {
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace = netfs_streaming_filled_page;
+ break;
+ }
+ finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+ if (!finfo) {
+ iov_iter_revert(iter, copied);
+ ret = -ENOMEM;
+ goto error_folio_unlock;
+ }
+ finfo->netfs_group = netfs_get_group(netfs_group);
+ finfo->dirty_offset = offset;
+ finfo->dirty_len = copied;
+ folio_attach_private(folio, (void *)((unsigned long)finfo |
+ NETFS_FOLIO_INFO));
+ break;
+ case NETFS_STREAMING_WRITE_CONT:
+ finfo = netfs_folio_info(folio);
+ finfo->dirty_len += copied;
+ if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ folio_mark_uptodate(folio);
+ kfree(finfo);
+ trace = netfs_streaming_cont_filled_page;
+ }
+ break;
+ default:
+ WARN(true, "Unexpected modify type %u ix=%lx\n",
+ howto, folio->index);
+ ret = -EIO;
+ goto error_folio_unlock;
+ }
+
+ trace_netfs_folio(folio, trace);
+
+ /* Update the inode size if we moved the EOF marker */
+ i_size = i_size_read(inode);
+ pos += copied;
+ if (pos > i_size) {
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ } else {
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+ }
+ }
+ written += copied;
+
+ if (likely(!wreq)) {
+ folio_mark_dirty(folio);
+ } else {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+ /* We make multiple writes to the folio... */
+ if (!folio_test_writeback(folio)) {
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
+ folio_start_fscache(folio);
+ if (wreq->iter.count == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ }
+ netfs_advance_writethrough(wreq, copied,
+ offset + copied == flen);
+ }
+ retry:
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+
+ cond_resched();
+ } while (iov_iter_count(iter));
+
+out:
+ if (unlikely(wreq)) {
+ ret = netfs_end_writethrough(wreq, iocb);
+ wbc_detach_inode(&wbc);
+ if (ret == -EIOCBQUEUED)
+ return ret;
+ }
+
+ iocb->ki_pos += written;
+ _leave(" = %zd [%zd]", written, ret);
+ return written ? written : ret;
+
+error_folio_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already. The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ trace_netfs_write_iter(iocb, from);
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_write_iter(iocb, from);
+
+ ret = netfs_start_io_write(inode);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+ netfs_end_io_write(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+ vm_fault_t ret = VM_FAULT_RETRY;
+ int err;
+
+ _enter("%lx", folio->index);
+
+ sb_start_pagefault(inode->i_sb);
+
+ if (folio_wait_writeback_killable(folio))
+ goto out;
+
+ if (folio_lock_killable(folio) < 0)
+ goto out;
+
+ /* Can we see a streaming write here? */
+ if (WARN_ON(!folio_test_uptodate(folio))) {
+ ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+ goto out;
+ }
+
+ if (netfs_folio_group(folio) != netfs_group) {
+ folio_unlock(folio);
+ err = filemap_fdatawait_range(inode->i_mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
+ switch (err) {
+ case 0:
+ ret = VM_FAULT_RETRY;
+ goto out;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ goto out;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
+ if (folio_test_dirty(folio))
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+ netfs_set_group(folio, netfs_group);
+ file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("kill %lx (to %lx)", index, last);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+
+ trace_netfs_folio(folio, netfs_folio_trace_kill);
+ folio_clear_uptodate(folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_folio(mapping, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ } while (index = next, index <= last);
+
+ _leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("redirty %llx @%llx", len, start);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+ trace_netfs_folio(folio, netfs_folio_trace_redirty);
+ filemap_dirty_folio(mapping, folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ _leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ struct folio *folio;
+ pgoff_t last;
+ int gcount = 0;
+
+ XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+ rcu_read_lock();
+
+ last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, last) {
+ WARN(!folio_test_writeback(folio),
+ "bad %zx @%llx page %lx %lx\n",
+ wreq->len, wreq->start, folio->index, last);
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under
+ * writeback, so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+ kfree(finfo);
+ } else if ((group = netfs_folio_group(folio))) {
+ /* Need to detach the group pointer if the page didn't
+ * get redirtied. If it has been redirtied, then it
+ * must be within the same group.
+ */
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ goto end_wb;
+ }
+ if (folio_trylock(folio)) {
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ goto end_wb;
+ }
+
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ rcu_read_lock();
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_clear);
+ }
+ end_wb:
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ xas_advance(&xas, folio_next_index(folio) - 1);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+ netfs_put_group_many(group, gcount);
+ _leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+
+ _enter("");
+
+ switch (wreq->error) {
+ case 0:
+ netfs_pages_written_back(wreq);
+ break;
+
+ default:
+ pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+ fallthrough;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ case -ENETRESET:
+ case -EDQUOT:
+ case -ENOSPC:
+ netfs_redirty_pages(mapping, wreq->start, wreq->len);
+ break;
+
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ netfs_kill_pages(mapping, wreq->start, wreq->len);
+ break;
+ }
+
+ if (wreq->error)
+ mapping_set_error(mapping, wreq->error);
+ if (wreq->netfs_ops->done)
+ wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool caching,
+ size_t *_len,
+ size_t *_top)
+{
+ struct netfs_folio *finfo;
+ struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned int i;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
+ void *priv;
+ bool stop = true;
+
+ folio_batch_init(&fbatch);
+
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
+
+ xas_for_each(xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio->index != index) {
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Has the folio moved or been split? */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ stop = false;
+ len = folio_size(folio);
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ stop = true;
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group ||
+ finfo->dirty_offset > 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ len = finfo->dirty_len;
+ }
+
+ *_top += folio_size(folio);
+ index += folio_nr_pages(folio);
+ *_count -= folio_nr_pages(folio);
+ *_len += len;
+ if (*_len >= max_len || *_count <= 0)
+ stop = true;
+
+ if (!folio_batch_add(&fbatch, folio))
+ break;
+ if (stop)
+ break;
+ }
+
+ xas_pause(xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any folios, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!folio_batch_count(&fbatch))
+ break;
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+ folio_unlock(folio);
+ }
+
+ folio_batch_release(&fbatch);
+ cond_resched();
+ } while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ struct folio *folio,
+ unsigned long long start,
+ unsigned long long end)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ size_t len, max_len;
+ bool caching = netfs_is_cache_enabled(ctx);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+ NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ folio_unlock(folio);
+ return PTR_ERR(wreq);
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+
+ count -= folio_nr_pages(folio);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+
+ len = wreq->len;
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ start += finfo->dirty_offset;
+ if (finfo->dirty_offset + finfo->dirty_len != len) {
+ len = finfo->dirty_len;
+ goto cant_expand;
+ }
+ len = finfo->dirty_len;
+ }
+
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len)
+ netfs_extend_writeback(mapping, group, xas, &count, start,
+ max_len, caching, &len, &wreq->upper_len);
+ }
+
+cant_expand:
+ len = min_t(unsigned long long, len, i_size - start);
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+ wreq->start = start;
+ wreq->len = len;
+
+ if (start < i_size) {
+ _debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_buffered_write;
+
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+ wreq->upper_len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+ if (ret == 0 || ret == -EIOCBQUEUED)
+ wbc->nr_to_write -= len / PAGE_SIZE;
+ } else {
+ _debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+ /* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
+ netfs_pages_written_back(wreq);
+ ret = 0;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = 1");
+ return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ const struct netfs_folio *finfo;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
+ void *priv;
+ int skips = 0;
+
+ _enter("%llx,%llx,", start, end);
+
+search_again:
+ /* Find the first dirty page in the group. */
+ rcu_read_lock();
+
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
+ break;
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Skip any dirty folio that's not in the group of interest. */
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group) {
+ folio_put(folio);
+ continue;
+ }
+ }
+
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
+
+ start = folio_pos(folio); /* May regress with THPs */
+
+ _debug("wback %lx", folio->index);
+
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
+
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ goto lock_again;
+ }
+
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched()) {
+ ret = 0;
+ goto out;
+ }
+ skips++;
+ }
+ goto search_again;
+ }
+
+ ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+ folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ _leave(" = %zd [%llx]", ret, *_start);
+ return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+ _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_group *group = NULL;
+ loff_t start, end;
+ int ret;
+
+ _enter("");
+
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+
+ if (wbc->range_cyclic && mapping->writeback_index) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
+ }
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ start = 0;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else {
+ start = wbc->range_start;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, wbc->range_end);
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+ if (wreq->error) {
+ pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+ mapping_set_error(wreq->mapping, wreq->error);
+ }
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down. Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+ struct netfs_io_request *wreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
+ struct bio_vec bvec;
+ unsigned long long i_size = i_size_read(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t offset = 0, len;
+ int ret = 0;
+
+ if (finfo) {
+ offset = finfo->dirty_offset;
+ start += offset;
+ len = finfo->dirty_len;
+ } else {
+ len = folio_size(folio);
+ }
+ len = min_t(unsigned long long, len, i_size - start);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto out;
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ goto out_put;
+
+ trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+ _debug("launder %llx-%llx", start, start + len - 1);
+
+ /* Speculatively write to the cache. We have to fix this up later if
+ * the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_launder_folio;
+
+ bvec_set_folio(&bvec, folio, len, offset);
+ iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+ folio_detach_private(folio);
+ netfs_put_group(group);
+ kfree(finfo);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+ folio_wait_fscache(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 000000000000..ad4370b3935d
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ ssize_t ret;
+ size_t orig_count = iov_iter_count(iter);
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ if (!orig_count)
+ return 0; /* Don't update atime */
+
+ ret = kiocb_write_and_wait(iocb, orig_count);
+ if (ret < 0)
+ return ret;
+ file_accessed(iocb->ki_filp);
+
+ rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, orig_count,
+ NETFS_DIO_READ);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ netfs_stat(&netfs_n_rh_dio_read);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+ /* If this is an async op, we have to keep track of the destination
+ * buffer for ourselves as the caller's iterator will be trashed when
+ * we return.
+ *
+ * In such a case, extract an iterator to represent as much of the the
+ * output buffer as we can manage. Note that the extraction might not
+ * be able to allocate a sufficiently large bvec array and may shorten
+ * the request.
+ */
+ if (user_backed_iter(iter)) {
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ if (ret < 0)
+ goto out;
+ rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv_count = ret;
+ rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ rreq->len = iov_iter_count(&rreq->iter);
+ } else {
+ rreq->iter = *iter;
+ rreq->len = orig_count;
+ rreq->direct_bv_unpin = false;
+ iov_iter_advance(iter, orig_count);
+ }
+
+ // TODO: Set up bounce buffer if needed
+
+ if (async)
+ rreq->iocb = iocb;
+
+ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ if (ret < 0)
+ goto out; /* May be -EIOCBQUEUED */
+ if (!async) {
+ // TODO: Copy from bounce buffer
+ iocb->ki_pos += rreq->transferred;
+ ret = rreq->transferred;
+ }
+
+out:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ if (ret > 0)
+ orig_count -= ret;
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (!iter->count)
+ return 0; /* Don't update atime */
+
+ ret = netfs_start_io_direct(inode);
+ if (ret == 0) {
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ netfs_end_io_direct(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 000000000000..60a40d293c87
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+ struct inode *inode = wreq->inode;
+ unsigned long long end = wreq->start + wreq->len;
+
+ if (!wreq->error &&
+ i_size_read(inode) < end) {
+ if (wreq->netfs_ops->update_i_size)
+ wreq->netfs_ops->update_i_size(inode, end);
+ else
+ i_size_write(inode, end);
+ }
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file. This can also be used for direct I/O writes.
+ */
+static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct netfs_io_request *wreq;
+ unsigned long long start = iocb->ki_pos;
+ unsigned long long end = start + iov_iter_count(iter);
+ ssize_t ret, n;
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ /* We're going to need a bounce buffer if what we transmit is going to
+ * be different in some way to the source buffer, e.g. because it gets
+ * encrypted/compressed or because it needs expanding to a block size.
+ */
+ // TODO
+
+ _debug("uw %llx-%llx", start, end);
+
+ wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ start, end - start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ if (IS_ERR(wreq))
+ return PTR_ERR(wreq);
+
+ {
+ /* If this is an async op and we're not using a bounce buffer,
+ * we have to save the source buffer as the iterator is only
+ * good until we return. In such a case, extract an iterator
+ * to represent as much of the the output buffer as we can
+ * manage. Note that the extraction might not be able to
+ * allocate a sufficiently large bvec array and may shorten the
+ * request.
+ */
+ if (async || user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (n < 0) {
+ ret = n;
+ goto out;
+ }
+ wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv_count = n;
+ wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ wreq->len = iov_iter_count(&wreq->iter);
+ } else {
+ wreq->iter = *iter;
+ }
+
+ wreq->io_iter = wreq->iter;
+ }
+
+ /* Copy the data into the bounce buffer and encrypt it. */
+ // TODO
+
+ /* Dispatch the write. */
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ if (async)
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_dio_write;
+ ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+ iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write);
+ if (ret < 0) {
+ _debug("begin = %zd", ret);
+ goto out;
+ }
+
+ if (!async) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = wreq->error;
+ _debug("waited = %zd", ret);
+ if (ret == 0) {
+ ret = wreq->transferred;
+ iocb->ki_pos += ret;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long end;
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ trace_netfs_write_iter(iocb, from);
+ netfs_stat(&netfs_n_rh_dio_write);
+
+ ret = netfs_start_io_direct(inode);
+ if (ret < 0)
+ return ret;
+ ret = generic_write_checks(iocb, from);
+ if (ret < 0)
+ goto out;
+ ret = file_remove_privs(file);
+ if (ret < 0)
+ goto out;
+ ret = file_update_time(file);
+ if (ret < 0)
+ goto out;
+ ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (ret < 0)
+ goto out;
+ end = iocb->ki_pos + iov_iter_count(from);
+ if (end > ictx->zero_point)
+ ictx->zero_point = end;
+
+ fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+ netfs_end_io_direct(inode);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c
index d645f8b302a2..9397ed39b0b4 100644
--- a/fs/fscache/cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
void fscache_put_cache(struct fscache_cache *cache,
enum fscache_cache_trace where)
{
- unsigned int debug_id = cache->debug_id;
+ unsigned int debug_id;
bool zero;
int ref;
if (IS_ERR_OR_NULL(cache))
return;
+ debug_id = cache->debug_id;
zero = __refcount_dec_and_test(&cache->ref, &ref);
trace_fscache_cache(debug_id, ref - 1, where);
diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/netfs/fscache_cookie.c
diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h
new file mode 100644
index 000000000000..a09b948fcef2
--- /dev/null
+++ b/fs/netfs/fscache_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include "internal.h"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f82..ad572f7ee897 100644
--- a/fs/fscache/io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
}
EXPORT_SYMBOL(__fscache_begin_write_operation);
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
- struct fscache_cookie *cookie)
-{
- struct inode *inode = mapping->host;
- bool need_use = false;
-
- _enter("");
-
- if (!filemap_dirty_folio(mapping, folio))
- return false;
- if (!fscache_cookie_valid(cookie))
- return true;
-
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- inode->i_state |= I_PINNING_FSCACHE_WB;
- need_use = true;
- }
- spin_unlock(&inode->i_lock);
-
- if (need_use)
- fscache_use_cookie(cookie, true);
- }
- return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
struct address_space *mapping;
@@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
fscache_access_io_write) < 0)
goto abandon_free;
- ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
if (ret < 0)
goto abandon_end;
diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c
index dad85fd84f6f..42e98bb523e3 100644
--- a/fs/fscache/main.c
+++ b/fs/netfs/fscache_main.c
@@ -8,18 +8,9 @@
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#define CREATE_TRACE_POINTS
#include "internal.h"
-
-MODULE_DESCRIPTION("FS Cache Manager");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned fscache_debug;
-module_param_named(debug, fscache_debug, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_debug,
- "FS-Cache debugging mask");
+#define CREATE_TRACE_POINTS
+#include <trace/events/fscache.h>
EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
@@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
/*
* initialise the fs caching module
*/
-static int __init fscache_init(void)
+int __init fscache_init(void)
{
int ret = -ENOMEM;
@@ -92,7 +83,7 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- pr_notice("Loaded\n");
+ pr_notice("FS-Cache loaded\n");
return 0;
error_cookie_jar:
@@ -103,19 +94,15 @@ error_wq:
return ret;
}
-fs_initcall(fscache_init);
-
/*
* clean up on module removal
*/
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
{
_enter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
destroy_workqueue(fscache_wq);
- pr_notice("Unloaded\n");
+ pr_notice("FS-Cache unloaded\n");
}
-
-module_exit(fscache_exit);
diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8cce..874d951bc390 100644
--- a/fs/fscache/proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
#include "internal.h"
/*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
*/
int __init fscache_proc_init(void)
{
- if (!proc_mkdir("fs/fscache", NULL))
- goto error_dir;
+ if (!proc_symlink("fs/fscache", NULL, "netfs"))
+ goto error_sym;
- if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
&fscache_caches_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
&fscache_volumes_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error;
-#endif
-
return 0;
error:
remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
return -ENOMEM;
}
/*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
*/
void fscache_proc_cleanup(void)
{
diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f1c..add21abdf713 100644
--- a/fs/fscache/stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
EXPORT_SYMBOL(fscache_n_no_create_space);
atomic_t fscache_n_culled;
EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
/*
* display the general statistics
*/
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
{
- seq_puts(m, "FS-Cache statistics\n");
+ seq_puts(m, "-- FS-Cache statistics --\n");
seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
atomic_read(&fscache_n_cookies),
atomic_read(&fscache_n_volumes),
@@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_no_create_space),
atomic_read(&fscache_n_culled));
- seq_printf(m, "IO : rd=%u wr=%u\n",
+ seq_printf(m, "IO : rd=%u wr=%u mis=%u\n",
atomic_read(&fscache_n_read),
- atomic_read(&fscache_n_write));
-
- netfs_stats_show(m);
+ atomic_read(&fscache_n_write),
+ atomic_read(&fscache_n_dio_misfit));
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..cdf991bdd9de 100644
--- a/fs/fscache/volume.c
+++ b/fs/netfs/fscache_volume.c
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43fac1b14e40..ec7045d24400 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,9 +5,13 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/fscache-cache.h>
#include <trace/events/netfs.h>
+#include <trace/events/fscache.h>
#ifdef pr_fmt
#undef pr_fmt
@@ -19,6 +23,8 @@
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len);
/*
* io.c
@@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
* main.c
*/
extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+ spin_lock(&netfs_proc_lock);
+ list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+ spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+ if (!list_empty(&rreq->proc_link)) {
+ spin_lock(&netfs_proc_lock);
+ list_del_rcu(&rreq->proc_link);
+ spin_unlock(&netfs_proc_lock);
+ }
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
+
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
+extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_readpage;
extern atomic_t netfs_n_rh_rreq;
@@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
+int netfs_stats_show(struct seq_file *m, void *v);
static inline void netfs_stat(atomic_t *stat)
{
@@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
#endif
}
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group)
+ refcount_inc(&netfs_group->ref);
+ return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+ if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * fscache-cache.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
+
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
+
+/*
+ * fscache-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
+
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
+/*
+ * fscache-main.c
+ */
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
+
+/*
+ * fscache-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fscache-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
+
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+int fscache_stats_show(struct seq_file *m);
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
+#endif
+
+/*
+ * fscache-volume.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
+
/*****************************************************************************/
/*
* debug tracing
@@ -143,3 +373,57 @@ do { \
#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e047..e8ff1e61ce79 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
@@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
+
+ if (rreq->origin != NETFS_DIO_READ &&
+ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
@@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && folio_index(folio) <= unlocked)
+ if (have_unlocked && folio->index <= unlocked)
continue;
- unlocked = folio_index(folio);
+ unlocked = folio_next_index(folio) - 1;
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_fscache(folio);
have_unlocked = true;
}
@@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
+ subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
@@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
}
/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ size_t remaining = subreq->len - subreq->transferred;
+ size_t count = iov_iter_count(&subreq->io_iter);
+
+ if (count == remaining)
+ return;
+
+ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->transferred,
+ subreq->len, rreq->i_size,
+ subreq->io_iter.iter_type);
+
+ if (count < remaining)
+ iov_iter_revert(&subreq->io_iter, remaining - count);
+ else
+ iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
+/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
@@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
+ netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
netfs_rreq_short_read(rreq, subreq);
@@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
}
/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+ size_t transferred = 0;
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ rreq->transferred = transferred;
+ task_io_account_read(transferred);
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ inode_dio_end(rreq->inode);
+}
+
+/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
@@ -340,8 +400,12 @@ again:
return;
}
- netfs_rreq_unlock_folios(rreq);
+ if (rreq->origin != NETFS_DIO_READ)
+ netfs_rreq_unlock_folios(rreq);
+ else
+ netfs_rreq_assess_dio(rreq);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
+ _enter("R=%x[%x]{%llx,%lx},%zd",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *io_iter)
{
- enum netfs_io_source source;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
+ if (rreq->origin != NETFS_DIO_READ) {
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+ }
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
@@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
+ if (rreq->origin != NETFS_DIO_READ) {
+ if (subreq->start >= ictx->zero_point) {
+ source = NETFS_FILL_WITH_ZEROES;
+ goto set;
+ }
+ if (subreq->len > ictx->zero_point - subreq->start)
+ subreq->len = ictx->zero_point - subreq->start;
+ }
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
+ if (rreq->rsize && subreq->len > rreq->rsize)
+ subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
+
+ if (subreq->max_nr_segs) {
+ lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+ subreq->max_nr_segs);
+ if (subreq->len > lsize) {
+ subreq->len = lsize;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
}
- if (WARN_ON(subreq->len == 0))
+set:
+ if (subreq->len > rreq->len)
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, rreq->len);
+
+ if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
+ goto out;
+ }
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = *io_iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +643,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ struct iov_iter *io_iter,
unsigned int *_debug_index)
{
struct netfs_io_subrequest *subreq;
@@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
+ subreq->len = io_iter->count;
_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
- source = netfs_rreq_prepare_read(rreq, subreq);
+ source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
@@ -603,6 +706,7 @@ subreq_failed:
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
+ struct iov_iter io_iter;
unsigned int debug_index = 0;
int ret;
@@ -611,50 +715,71 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
return -EIO;
}
- INIT_WORK(&rreq->work, netfs_rreq_work);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_begin(rreq->inode);
- if (sync)
- netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+ // TODO: Use bounce buffer if requested
+ rreq->io_iter = rreq->iter;
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
+ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
+ io_iter = rreq->io_iter;
do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ _debug("submit %llx + %zx >= %llx",
+ rreq->start, rreq->submitted, rreq->i_size);
+ if (rreq->origin == NETFS_DIO_READ &&
+ rreq->start + rreq->submitted >= rreq->i_size)
+ break;
+ if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ ret = 0;
+ goto out;
+ }
+
if (sync) {
- /* Keep nr_outstanding incremented so that the ref always belongs to
- * us, and the service code isn't punted off to a random thread pool to
- * process.
+ /* Keep nr_outstanding incremented so that the ref always
+ * belongs to us, and the service code isn't punted off to a
+ * random thread pool to process. Note that this might start
+ * further work, such as writing to the cache.
*/
- for (;;) {
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
- netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- ret = 0;
+ ret = -EIOCBQUEUED;
}
+
+out:
return ret;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655a0..b781bbbf1d8d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments. Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ struct folio *folio;
+ unsigned int nsegs = 0;
+ loff_t pos = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t span = 0, n = iter->count;
+
+ XA_STATE(xas, iter->xarray, index);
+
+ if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = min(max_size, n - start_offset);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ size_t offset, flen, len;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ flen = folio_size(folio);
+ offset = offset_in_folio(folio, pos);
+ len = min(max_size, flen - offset);
+ span += len;
+ nsegs++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ }
+
+ rcu_read_unlock();
+ return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_xarray(iter))
+ return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 000000000000..75dc52a49b3a
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+ if (!atomic_read(&inode->i_dio_count))
+ return 0;
+
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ for (;;) {
+ prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+ if (!atomic_read(&inode->i_dio_count))
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+ finish_wait(wq, &q.wq_entry);
+
+ return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+ return 0;
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+ set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ if (inode->i_mapping->nrpages != 0) {
+ unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0) {
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ ret = netfs_block_buffered(inode);
+ if (ret < 0) {
+ up_write(&inode->i_rwsem);
+ return ret;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 068568702957..5e77618a7940 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/netfs.h>
@@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
+
unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[nr__netfs_io_origin] = {
+ [NETFS_READAHEAD] = "RA",
+ [NETFS_READPAGE] = "RP",
+ [NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITETHROUGH] = "WT",
+ [NETFS_LAUNDER_WRITE] = "LW",
+ [NETFS_UNBUFFERED_WRITE] = "UW",
+ [NETFS_DIO_READ] = "DR",
+ [NETFS_DIO_WRITE] = "DW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+ struct netfs_io_request *rreq;
+
+ if (v == &netfs_io_requests) {
+ seq_puts(m,
+ "REQUEST OR REF FL ERR OPS COVERAGE\n"
+ "======== == === == ==== === =========\n"
+ );
+ return 0;
+ }
+
+ rreq = list_entry(v, struct netfs_io_request, proc_link);
+ seq_printf(m,
+ "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ rreq->debug_id,
+ netfs_origins[rreq->origin],
+ refcount_read(&rreq->ref),
+ rreq->flags,
+ rreq->error,
+ atomic_read(&rreq->nr_outstanding),
+ rreq->start, rreq->submitted, rreq->len);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+ .start = netfs_requests_seq_start,
+ .next = netfs_requests_seq_next,
+ .stop = netfs_requests_seq_stop,
+ .show = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __init netfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (!proc_mkdir("fs/netfs", NULL))
+ goto error;
+ if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+ &netfs_requests_seq_ops))
+ goto error_proc;
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+ netfs_stats_show))
+ goto error_proc;
+#endif
+
+ ret = fscache_init();
+ if (ret < 0)
+ goto error_proc;
+ return 0;
+
+error_proc:
+ remove_proc_entry("fs/netfs", NULL);
+error:
+ return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+ fscache_exit();
+ remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 000000000000..90051ced8e2a
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask)
+{
+ XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+ xas_lock(&xas);
+ for (;;) {
+ xas_store(&xas, folio);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock(&xas);
+ if (!xas_nomem(&xas, gfp_mask))
+ return xas_error(&xas);
+ goto retry;
+ }
+
+ if (flags & NETFS_FLAG_PUT_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+ if (flags & NETFS_FLAG_PAGECACHE_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+ xas_unlock(&xas);
+ return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+ struct folio *folio;
+ int ret;
+
+ if (to + 1 == index) /* Page range is inclusive */
+ return 0;
+
+ do {
+ /* TODO: Figure out what order folio can be allocated here */
+ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+ folio->index = index;
+ ret = netfs_xa_store_and_mark(buffer, index, folio,
+ NETFS_FLAG_PUT_MARK, gfp_mask);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+
+ index += folio_nr_pages(folio);
+ } while (index <= to && index != 0);
+
+ return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it. This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio(). This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (wbc->unpinned_netfs_wb)
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (inode->i_state & I_PINNING_NETFS_WB) {
+ loff_t i_size = i_size_read(inode);
+ fscache_unuse_cookie(cookie, aux, &i_size);
+ }
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem. The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ struct netfs_folio *finfo = NULL;
+ size_t flen = folio_size(folio);
+
+ _enter("{%lx},%zx,%zx", folio->index, offset, length);
+
+ folio_wait_fscache(folio);
+
+ if (!folio_test_private(folio))
+ return;
+
+ finfo = netfs_folio_info(folio);
+
+ if (offset == 0 && length >= flen)
+ goto erase_completely;
+
+ if (finfo) {
+ /* We have a partially uptodate page from a streaming write. */
+ unsigned int fstart = finfo->dirty_offset;
+ unsigned int fend = fstart + finfo->dirty_len;
+ unsigned int end = offset + length;
+
+ if (offset >= fend)
+ return;
+ if (end <= fstart)
+ return;
+ if (offset <= fstart && end >= fend)
+ goto erase_completely;
+ if (offset <= fstart && end > fstart)
+ goto reduce_len;
+ if (offset > fstart && end >= fend)
+ goto move_start;
+ /* A partial write was split. The caller has already zeroed
+ * it, so just absorb the hole.
+ */
+ }
+ return;
+
+erase_completely:
+ netfs_put_group(netfs_folio_group(folio));
+ folio_detach_private(folio);
+ folio_clear_uptodate(folio);
+ kfree(finfo);
+ return;
+reduce_len:
+ finfo->dirty_len = offset + length - finfo->dirty_offset;
+ return;
+move_start:
+ finfo->dirty_len -= offset - finfo->dirty_offset;
+ finfo->dirty_offset = offset;
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ unsigned long long end;
+
+ end = folio_pos(folio) + folio_size(folio);
+ if (end > ctx->zero_point)
+ ctx->zero_point = end;
+
+ if (folio_test_private(folio))
+ return false;
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+
+ fscache_note_page_release(netfs_i_cookie(ctx));
+ return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6a7..610ceb5bd86c 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+ origin == NETFS_DIO_READ ||
+ origin == NETFS_DIO_WRITE);
+ bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+ GFP_KERNEL);
if (!rreq)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
+ rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (cached)
+ __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (file && file->f_flags & O_NONBLOCK)
+ __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
@@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ if (rreq->direct_bv) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ if (rreq->direct_bv[i].bv_page) {
+ if (rreq->direct_bv_unpin)
+ unpin_user_page(rreq->direct_bv[i].bv_page);
+ }
+ }
+ kvfree(rreq->direct_bv);
+ }
+ kfree_rcu(rreq, rcu);
netfs_stat_d(&netfs_n_rh_rreq);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
- unsigned int debug_id = rreq->debug_id;
+ unsigned int debug_id;
bool dead;
int r;
- dead = __refcount_dec_and_test(&rreq->ref, &r);
- trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_request(&rreq->work);
+ if (rreq) {
+ debug_id = rreq->debug_id;
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
}
}
}
@@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
{
struct netfs_io_subrequest *subreq;
- subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+ sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
if (subreq) {
+ INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
@@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ if (rreq->netfs_ops->free_subrequest)
+ rreq->netfs_ops->free_subrequest(subreq);
kfree(subreq);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 000000000000..625eb68f3e5a
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+ enum netfs_io_source dest,
+ loff_t start, size_t len,
+ work_func_t worker)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ if (subreq) {
+ INIT_WORK(&subreq->work, worker);
+ subreq->source = dest;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->debug_index = wreq->subreq_counter++;
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ BUG();
+ }
+
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ atomic_inc(&wreq->nr_outstanding);
+ list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ size_t transferred = 0;
+
+ _enter("R=%x[]", wreq->debug_id);
+
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+ wreq->transferred = transferred;
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (!subreq->error)
+ continue;
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ /* Depending on the type of failure, this may prevent
+ * writeback completion unless we're in disconnected
+ * mode.
+ */
+ if (!wreq->error)
+ wreq->error = subreq->error;
+ break;
+
+ case NETFS_WRITE_TO_CACHE:
+ /* Failure doesn't prevent writeback completion unless
+ * we're in disconnected mode.
+ */
+ if (subreq->error != -ENOBUFS)
+ ctx->ops->invalidate_cache(wreq);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ if (!wreq->error)
+ wreq->error = -EIO;
+ return;
+ }
+ }
+
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : transferred);
+ }
+
+ netfs_clear_subrequests(wreq, was_async);
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ unsigned int u;
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(wreq, subreq, transferred_or_error,
+ netfs_fail_write);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+ wreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->io_iter.iter_type);
+
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&wreq->nr_outstanding);
+ if (u == 0)
+ netfs_write_terminated(wreq, was_async);
+ else if (u == 1)
+ wake_up_var(&wreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ goto out;
+
+failed:
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+ wreq->error = subreq->error;
+ break;
+ default:
+ break;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+ cres->ops->write(cres, subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread. We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+ loff_t start = wreq->start;
+ size_t len = wreq->len;
+ int ret;
+
+ if (!fscache_cookie_enabled(cookie)) {
+ clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+ return;
+ }
+
+ _debug("write to cache");
+ ret = fscache_begin_write_operation(cres, cookie);
+ if (ret < 0)
+ return;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+ i_size_read(wreq->inode), true);
+ if (ret < 0)
+ return;
+
+ subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+ netfs_write_to_cache_op_worker);
+ if (!subreq)
+ return;
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present. The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ * data into it, otherwise the output buffer will point to the original
+ * folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ * buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ * (a) If the data is to be compressed, create a compression buffer and
+ * compress the data into it.
+ *
+ * (b) For each destination we want to upload to, set up write ops to write
+ * to that destination. We may need multiple writes if the data is not
+ * contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what)
+{
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+ _enter("R=%x %llx-%llx f=%lx",
+ wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+ wreq->flags);
+
+ trace_netfs_write(wreq, what);
+ if (wreq->len == 0 || wreq->iter.count == 0) {
+ pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+ return -EIO;
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+
+ /* Start the encryption/compression going. We can do that in the
+ * background whilst we generate a list of write ops that we want to
+ * perform.
+ */
+ // TODO: Encrypt or compress the region as appropriate
+
+ /* We need to write all of the region to the cache */
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ netfs_set_up_write_to_cache(wreq);
+
+ /* However, we don't necessarily write all of the region to the server.
+ * Caching of reads is being managed this way also.
+ */
+ if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (!may_wait)
+ return -EIOCBQUEUED;
+
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ return wreq->error;
+}
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq;
+ struct file *file = iocb->ki_filp;
+
+ wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+ NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+ return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ unsigned long long start;
+ size_t len;
+
+ if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ return;
+
+ start = wreq->start + wreq->submitted;
+ len = wreq->iter.count - wreq->submitted;
+ if (!final) {
+ len /= wreq->wsize; /* Round to number of maximum packets */
+ len *= wreq->wsize;
+ }
+
+ ictx->ops->create_write_requests(wreq, start, len);
+ wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+ _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+ wreq->iter.count += copied;
+ wreq->io_iter.count += copied;
+ if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+ netfs_submit_writethrough(wreq, false);
+
+ return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+ int ret = -EIOCBQUEUED;
+
+ _enter("ic=%zu sb=%zu ws=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize);
+
+ if (wreq->submitted < wreq->io_iter.count)
+ netfs_submit_writethrough(wreq, true);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (is_sync_kiocb(iocb)) {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a40..deeba9f9dcf5 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,8 @@
#include <linux/seq_file.h>
#include "internal.h"
+atomic_t netfs_n_rh_dio_read;
+atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_readpage;
atomic_t netfs_n_rh_rreq;
@@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ atomic_read(&netfs_n_rh_dio_read),
+ atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_readpage),
atomic_read(&netfs_n_rh_write_begin),
- atomic_read(&netfs_n_rh_write_zskip),
- atomic_read(&netfs_n_rh_rreq),
- atomic_read(&netfs_n_rh_sreq));
- seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
- atomic_read(&netfs_n_rh_write),
- atomic_read(&netfs_n_rh_write_done),
- atomic_read(&netfs_n_rh_write_failed));
+ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ atomic_read(&netfs_n_wh_upload),
+ atomic_read(&netfs_n_wh_upload_done),
+ atomic_read(&netfs_n_wh_upload_failed));
+ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_wh_write),
+ atomic_read(&netfs_n_wh_write_done),
+ atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_wh_wstream_conflict));
+ return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 01ac733a6320..f7e32d76e34d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -169,8 +169,8 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
- depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
- select NETFS_SUPPORT
+ depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y
+ select FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 943aeea1eb16..6be13e0ec170 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -580,6 +580,8 @@ retry:
nfs4_delete_deviceid(node->ld, node->nfs_client, id);
goto retry;
}
+
+ nfs4_put_deviceid_node(node);
return ERR_PTR(-ENODEV);
}
@@ -893,10 +895,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
}
if (pgio->pg_dreq == NULL)
- wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
- req->wb_index);
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req));
pnfs_generic_pg_init_write(pgio, req, wb_size);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index f318a05a80e1..c97ebc42ec0f 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -351,6 +351,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
+ if (d->len == 0)
+ return -ENODEV;
+
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 6c977288cc28..d8d50a88de04 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -75,7 +75,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
msg->len = sizeof(*bl_msg) + b->simple.len;
msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
- goto out_free_data;
+ goto out_unlock;
bl_msg = msg->data;
bl_msg->type = BL_DEVICE_MOUNT;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 0279b78b5fc9..650758ee0d5f 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -21,11 +21,12 @@ enum nfs4_callback_procnum {
struct nfs4_slot;
struct cb_process_state {
- __be32 drc_status;
struct nfs_client *clp;
struct nfs4_slot *slot;
- u32 minorversion;
struct net *net;
+ u32 minorversion;
+ __be32 drc_status;
+ unsigned int referring_calls;
};
struct cb_compound_hdr_arg {
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 96a4923080ae..76cea34477ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -207,7 +207,8 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/
static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new)
+ const nfs4_stateid *new,
+ struct cb_process_state *cps)
{
u32 oldseq, newseq;
@@ -221,28 +222,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
newseq = be32_to_cpu(new->seqid);
/* Are we already in a layout recall situation? */
- if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
- lo->plh_return_seq != 0) {
- if (newseq < lo->plh_return_seq)
- return NFS4ERR_OLD_STATEID;
- if (newseq > lo->plh_return_seq)
- return NFS4ERR_DELAY;
- goto out;
- }
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return NFS4ERR_DELAY;
- /* Check that the stateid matches what we think it should be. */
+ /*
+ * Check that the stateid matches what we think it should be.
+ * Note that if the server sent us a list of referring calls,
+ * and we know that those have completed, then we trust the
+ * stateid argument is correct.
+ */
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
- if (newseq > oldseq + 1)
+ if (newseq > oldseq + 1 && !cps->referring_calls)
return NFS4ERR_DELAY;
+
/* Crazy server! */
if (newseq <= oldseq)
return NFS4ERR_OLD_STATEID;
-out:
+
return NFS_OK;
}
static u32 initiate_file_draining(struct nfs_client *clp,
- struct cb_layoutrecallargs *args)
+ struct cb_layoutrecallargs *args,
+ struct cb_process_state *cps)
{
struct inode *ino;
struct pnfs_layout_hdr *lo;
@@ -266,7 +268,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto out;
}
pnfs_get_layout_hdr(lo);
- rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps);
if (rv != NFS_OK)
goto unlock;
@@ -326,10 +328,11 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
}
static u32 do_callback_layoutrecall(struct nfs_client *clp,
- struct cb_layoutrecallargs *args)
+ struct cb_layoutrecallargs *args,
+ struct cb_process_state *cps)
{
if (args->cbl_recall_type == RETURN_FILE)
- return initiate_file_draining(clp, args);
+ return initiate_file_draining(clp, args, cps);
return initiate_bulk_draining(clp, args);
}
@@ -340,11 +343,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp,
u32 res = NFS4ERR_OP_NOT_IN_SESSION;
if (cps->clp)
- res = do_callback_layoutrecall(cps->clp, args);
+ res = do_callback_layoutrecall(cps->clp, args, cps);
return cpu_to_be32(res);
}
-static void pnfs_recall_all_layouts(struct nfs_client *clp)
+static void pnfs_recall_all_layouts(struct nfs_client *clp,
+ struct cb_process_state *cps)
{
struct cb_layoutrecallargs args;
@@ -352,7 +356,7 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
memset(&args, 0, sizeof(args));
args.cbl_recall_type = RETURN_ALL;
/* FIXME we ignore errors, what should we do? */
- do_callback_layoutrecall(clp, &args);
+ do_callback_layoutrecall(clp, &args, cps);
}
__be32 nfs4_callback_devicenotify(void *argp, void *resp,
@@ -450,6 +454,7 @@ static int referring_call_exists(struct nfs_client *clp,
__acquires(lock)
{
int status = 0;
+ int found = 0;
int i, j;
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
@@ -478,11 +483,12 @@ static int referring_call_exists(struct nfs_client *clp,
spin_lock(lock);
if (status)
goto out;
+ found++;
}
}
out:
- return status;
+ return status < 0 ? status : found;
}
__be32 nfs4_callback_sequence(void *argp, void *resp,
@@ -493,6 +499,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp,
struct nfs4_slot_table *tbl;
struct nfs4_slot *slot;
struct nfs_client *clp;
+ int ret;
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
@@ -552,11 +559,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp,
* related callback was received before the response to the original
* call.
*/
- if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
- &tbl->slot_tbl_lock) < 0) {
+ ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
+ &tbl->slot_tbl_lock);
+ if (ret < 0) {
status = htonl(NFS4ERR_DELAY);
goto out_unlock;
}
+ cps->referring_calls = ret;
/*
* RFC5661 20.9.3
@@ -617,7 +626,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
nfs_expire_unused_delegation_types(cps->clp, flags);
if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
- pnfs_recall_all_layouts(cps->clp);
+ pnfs_recall_all_layouts(cps->clp, cps);
if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 321af81c456e..9369488f2ed4 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -967,6 +967,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
nops--;
}
+ if (svc_is_backchannel(rqstp) && cps.clp) {
+ rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval;
+ rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries;
+ }
+
*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
nfs4_cb_free_slot(&cps);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 13dffe4201e6..c8ecbe999059 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2194,6 +2194,8 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
{
struct inode *inode;
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
goto full_reval;
if (d_mountpoint(dentry))
@@ -2963,7 +2965,7 @@ static u64 nfs_access_login_time(const struct task_struct *task,
rcu_read_lock();
for (;;) {
parent = rcu_dereference(task->real_parent);
- pcred = rcu_dereference(parent->cred);
+ pcred = __task_cred(parent);
if (parent == task || cred_fscmp(pcred, cred) != 0)
break;
task = parent;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f6c74f424691..c03926a1cc73 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
kref_put(&dreq->kref, nfs_direct_req_free);
}
-ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
- return dreq->bytes_left;
+ loff_t start = offset - dreq->io_start;
+ return dreq->max_count - start;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
@@ -368,7 +369,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
- dreq->bytes_left -= req_len;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
@@ -440,7 +440,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = count;
+ dreq->max_count = count;
dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -873,7 +873,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
- dreq->bytes_left -= req_len;
if (defer) {
nfs_mark_request_commit(req, NULL, &cinfo, 0);
@@ -980,7 +979,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = count;
+ dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e8cccb94b927..8577ccf621f5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -558,7 +558,6 @@ const struct address_space_operations nfs_file_aops = {
.read_folio = nfs_read_folio,
.readahead = nfs_readahead,
.dirty_folio = filemap_dirty_folio,
- .writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index b05717fe0d4e..2d1bfee225c3 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
put_nfs_open_context(rreq->netfs_priv);
}
-static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return fscache_begin_read_operation(&rreq->cache_resources,
- netfs_i_cookie(netfs_inode(rreq->inode)));
-}
-
static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
- .begin_cache_operation = nfs_netfs_begin_cache_operation,
.issue_read = nfs_netfs_issue_read,
.clamp_length = nfs_netfs_clamp_length
};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 5407ab8c8783..e3cb4923316b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
}
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
- netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9c9cf764f600..e3722ce6722e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
-extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset);
/* nfs4proc.c */
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
@@ -936,7 +936,6 @@ struct nfs_direct_req {
loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
- bytes_left, /* bytes left to be sent */
error; /* any reported error */
struct completion completion; /* wait for i/o completion */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8a943fffaad5..23819a756508 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -170,6 +170,7 @@ static int nfs4_map_errors(int err)
case -NFS4ERR_RESOURCE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
case -NFS4ERR_WRONG_CRED:
@@ -558,6 +559,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_GRACE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
exception->delay = 1;
return 0;
@@ -9691,6 +9693,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EBUSY;
break;
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
status = -ERECALLCONFLICT;
break;
case -NFS4ERR_DELEG_REVOKED:
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index e776200e9a11..886a7c4c60b3 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -34,7 +34,6 @@ static struct ctl_table nfs4_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
int nfs4_register_sysctl(void)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index deec76cf5afe..69406e60f391 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1602,7 +1602,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
uint32_t attrs[3] = {
- FATTR4_WORD0_RDATTR_ERROR,
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
uint32_t dircount = readdir->count;
@@ -1612,12 +1613,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
unsigned int i;
if (readdir->plus) {
- attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
- FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
- attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
- FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
- FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
- FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ attrs[0] |= FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEHANDLE
+ | FATTR4_WORD0_FILEID;
+ attrs[1] |= FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY;
attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
}
/* Use mounted_on_fileid only if the server supports it */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 4e90ca531176..afedb449b54f 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -400,6 +400,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__field(unsigned long, flags)
__field(dev_t, dev)
__field(u64, dir)
+ __field(u64, fileid)
__string(name, dentry->d_name.name)
),
@@ -407,16 +408,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
+ __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
__assign_str(name, dentry->d_name.name);
),
TP_printk(
- "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
__entry->flags,
show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->fileid
)
);
@@ -444,6 +447,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__field(unsigned long, flags)
__field(dev_t, dev)
__field(u64, dir)
+ __field(u64, fileid)
__string(name, dentry->d_name.name)
),
@@ -452,17 +456,19 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
__entry->flags = flags;
+ __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
__assign_str(name, dentry->d_name.name);
),
TP_printk(
- "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
-__entry->error, show_nfs_status(__entry->error),
__entry->flags,
show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->fileid
)
);
@@ -893,7 +899,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
-DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done);
TRACE_EVENT(nfs_sillyrename_unlink,
TP_PROTO(
@@ -1539,7 +1545,6 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class,
__field(u32, fhandle)
__field(loff_t, offset)
__field(ssize_t, count)
- __field(ssize_t, bytes_left)
__field(ssize_t, error)
__field(int, flags)
),
@@ -1554,19 +1559,18 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class,
__entry->fhandle = nfs_fhandle_hash(fh);
__entry->offset = dreq->io_start;
__entry->count = dreq->count;
- __entry->bytes_left = dreq->bytes_left;
__entry->error = dreq->error;
__entry->flags = dreq->flags;
),
TP_printk(
"error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zd bytes_left=%zd flags=%s",
+ "offset=%lld count=%zd flags=%s",
__entry->error, MAJOR(__entry->dev),
MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->offset,
- __entry->count, __entry->bytes_left,
+ __entry->count,
nfs_show_direct_req_flags(__entry->flags)
)
);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 21a365357629..0c0fed1ecd0b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
else
- rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+ rd_size = nfs_dreq_bytes_left(pgio->pg_dreq,
+ req_offset(req));
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index f39e2089bc4c..e645be1a3381 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -29,7 +29,6 @@ static struct ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
int nfs_register_sysctl(void)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 150a953a8be9..0110299643a2 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -267,7 +267,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct inode *new_dir = data->new_dir;
struct dentry *old_dentry = data->old_dentry;
- trace_nfs_sillyrename_rename(old_dir, old_dentry,
+ trace_nfs_async_rename_done(old_dir, old_dentry,
new_dir, data->new_dentry, task->tk_status);
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
rpc_restart_call_prepare(task);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7248705faef4..bb79d3a886ae 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -680,17 +680,6 @@ static int nfs_writepage_locked(struct folio *folio,
return err;
}
-int nfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int ret;
-
- ret = nfs_writepage_locked(folio, wbc);
- if (ret != AOP_WRITEPAGE_ACTIVATE)
- unlock_page(page);
- return ret;
-}
-
static int nfs_writepages_callback(struct folio *folio,
struct writeback_control *wbc, void *data)
{
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2fa54cfd4882..6dc6340e2852 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7911,14 +7911,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock *fl;
int status = false;
- struct nfsd_file *nf = find_any_file(fp);
+ struct nfsd_file *nf;
struct inode *inode;
struct file_lock_context *flctx;
+ spin_lock(&fp->fi_lock);
+ nf = find_any_file_locked(fp);
if (!nf) {
/* Any valid lock stateid should have some sort of access */
WARN_ON_ONCE(1);
- return status;
+ goto out;
}
inode = file_inode(nf->nf_file);
@@ -7934,7 +7936,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
spin_unlock(&flctx->flc_lock);
}
- nfsd_file_put(nf);
+out:
+ spin_unlock(&fp->fi_lock);
return status;
}
@@ -7944,10 +7947,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
* @cstate: NFSv4 COMPOUND state
* @u: RELEASE_LOCKOWNER arguments
*
- * The lockowner's so_count is bumped when a lock record is added
- * or when copying a conflicting lock. The latter case is brief,
- * but can lead to fleeting false positives when looking for
- * locks-in-use.
+ * Check if theree are any locks still held and if not - free the lockowner
+ * and any lock state that is owned.
*
* Return values:
* %nfs_ok: lockowner released or not found
@@ -7983,10 +7984,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
spin_unlock(&clp->cl_lock);
return nfs_ok;
}
- if (atomic_read(&lo->lo_owner.so_count) != 2) {
- spin_unlock(&clp->cl_lock);
- nfs4_put_stateowner(&lo->lo_owner);
- return nfserr_locks_held;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
+ }
}
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 8e6dbe9e0b65..f206ca32e7f5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -48,10 +48,6 @@ enum {
NFSD_MaxBlkSize,
NFSD_MaxConnections,
NFSD_Filecache,
- /*
- * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
- * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
- */
#ifdef CONFIG_NFSD_V4
NFSD_Leasetime,
NFSD_Gracetime,
@@ -1242,63 +1238,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name,
#endif
-static void clear_ncl(struct inode *inode)
+static void clear_ncl(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct nfsdfs_client *ncl = inode->i_private;
+ spin_lock(&inode->i_lock);
inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
kref_put(&ncl->cl_ref, ncl->cl_release);
}
-static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
-{
- struct nfsdfs_client *nc = inode->i_private;
-
- if (nc)
- kref_get(&nc->cl_ref);
- return nc;
-}
-
struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
{
struct nfsdfs_client *nc;
- inode_lock_shared(inode);
- nc = __get_nfsdfs_client(inode);
- inode_unlock_shared(inode);
+ spin_lock(&inode->i_lock);
+ nc = inode->i_private;
+ if (nc)
+ kref_get(&nc->cl_ref);
+ spin_unlock(&inode->i_lock);
return nc;
}
-/* from __rpc_unlink */
-static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
-{
- int ret;
-
- clear_ncl(d_inode(dentry));
- dget(dentry);
- ret = simple_unlink(dir, dentry);
- d_drop(dentry);
- fsnotify_unlink(dir, dentry);
- dput(dentry);
- WARN_ON_ONCE(ret);
-}
-
-static void nfsdfs_remove_files(struct dentry *root)
-{
- struct dentry *dentry, *tmp;
-
- list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) {
- if (!simple_positive(dentry)) {
- WARN_ON_ONCE(1); /* I think this can't happen? */
- continue;
- }
- nfsdfs_remove_file(d_inode(root), dentry);
- }
-}
/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
* code instead. */
static int nfsdfs_create_files(struct dentry *root,
const struct tree_descr *files,
+ struct nfsdfs_client *ncl,
struct dentry **fdentries)
{
struct inode *dir = d_inode(root);
@@ -1317,8 +1284,9 @@ static int nfsdfs_create_files(struct dentry *root,
dput(dentry);
goto out;
}
+ kref_get(&ncl->cl_ref);
inode->i_fop = files->ops;
- inode->i_private = __get_nfsdfs_client(dir);
+ inode->i_private = ncl;
d_add(dentry, inode);
fsnotify_create(dir, dentry);
if (fdentries)
@@ -1327,7 +1295,6 @@ static int nfsdfs_create_files(struct dentry *root,
inode_unlock(dir);
return 0;
out:
- nfsdfs_remove_files(root);
inode_unlock(dir);
return -ENOMEM;
}
@@ -1347,7 +1314,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
if (IS_ERR(dentry)) /* XXX: tossing errors? */
return NULL;
- ret = nfsdfs_create_files(dentry, files, fdentries);
+ ret = nfsdfs_create_files(dentry, files, ncl, fdentries);
if (ret) {
nfsd_client_rmdir(dentry);
return NULL;
@@ -1358,20 +1325,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
/* Taken from __rpc_rmdir: */
void nfsd_client_rmdir(struct dentry *dentry)
{
- struct inode *dir = d_inode(dentry->d_parent);
- struct inode *inode = d_inode(dentry);
- int ret;
-
- inode_lock(dir);
- nfsdfs_remove_files(dentry);
- clear_ncl(inode);
- dget(dentry);
- ret = simple_rmdir(dir, dentry);
- WARN_ON_ONCE(ret);
- d_drop(dentry);
- fsnotify_rmdir(dir, dentry);
- dput(dentry);
- inode_unlock(dir);
+ simple_recursive_removal(dentry, clear_ncl);
}
static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e7e37192461..b7c7a9273ea0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1831,6 +1831,10 @@ retry:
}
trap = lock_rename(tdentry, fdentry);
+ if (IS_ERR(trap)) {
+ err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ goto out;
+ }
err = fh_fill_pre_attrs(ffhp);
if (err != nfs_ok)
goto out_unlock;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 959bd9fb3d81..c950139db6ef 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -441,7 +441,6 @@ out:
static struct dentry *nilfs_get_parent(struct dentry *child)
{
unsigned long ino;
- struct inode *inode;
struct nilfs_root *root;
ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
@@ -450,11 +449,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
root = NILFS_I(d_inode(child))->i_root;
- inode = nilfs_iget(child->d_sb, root, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- return d_obtain_alias(inode);
+ return d_obtain_alias(nilfs_iget(child->d_sb, root, ino));
}
static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1cb9ad7e884e..3464fa7e8538 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,7 +29,6 @@ static struct ctl_table dnotify_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {}
};
static void __init dnotify_sysctl_init(void)
{
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f83e7cc5ccf2..fbdc63cc10d9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -86,7 +86,6 @@ static struct ctl_table fanotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
- { }
};
static void __init fanotify_sysctls_init(void)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..8bfd690e9f10 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
* d_flags to indicate parental interest (their parent is the
* original inode) */
spin_lock(&alias->d_lock);
- list_for_each_entry(child, &alias->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &alias->d_children, d_sib) {
if (!child->d_inode)
continue;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a3809ae92170..85d8fdd55329 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -85,7 +85,6 @@ static struct ctl_table inotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
- { }
};
static void __init inotify_sysctls_init(void)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 9a4b228d42fa..34e1e3e36733 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -90,12 +90,9 @@ slow:
inode->i_fop = &ns_file_operations;
inode->i_private = ns;
- dentry = d_alloc_anon(mnt->mnt_sb);
- if (!dentry) {
- iput(inode);
+ dentry = d_make_root(inode); /* not the normal use, but... */
+ if (!dentry)
return -ENOMEM;
- }
- d_instantiate(dentry, inode);
dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) {
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 174fe536a1c0..4e980170d86a 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -28,7 +28,6 @@ static struct ctl_table ntfs_sysctls[] = {
.mode = 0644, /* Mode, proc handler. */
.proc_handler = proc_dointvec
},
- {}
};
/* Storage for the sysctls header. */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 04fc8344063a..a9b8688aaf30 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -124,17 +124,10 @@ static int ocfs2_match_dentry(struct dentry *dentry,
if (!dentry->d_fsdata)
return 0;
- if (!dentry->d_parent)
- return 0;
-
if (skip_unhashed && d_unhashed(dentry))
return 0;
parent = d_inode(dentry->d_parent);
- /* Negative parent dentry? */
- if (!parent)
- return 0;
-
/* Name is in a different directory. */
if (OCFS2_I(parent)->ip_blkno != parent_blkno)
return 0;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index a14c8fee6ee5..d620d4c53c6f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1593,9 +1593,6 @@ int __ocfs2_add_entry(handle_t *handle,
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
- if (!namelen)
- return -EINVAL;
-
if (ocfs2_dir_indexed(dir)) {
struct buffer_head *bh;
@@ -4245,12 +4242,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
trace_ocfs2_prepare_dir_for_insert(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
- if (!namelen) {
- ret = -EINVAL;
- mlog_errno(ret);
- goto out;
- }
-
/*
* Do this up front to reduce confusion.
*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 814733ba2f4b..9221a33f917b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1336,7 +1336,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
goto bail;
}
- if (S_ISDIR(old_inode->i_mode)) {
+ if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) {
u64 old_inode_parent;
update_dot_dot = 1;
@@ -1353,8 +1353,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
goto bail;
}
- if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= ocfs2_link_max(osb)) {
+ if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) {
status = -EMLINK;
goto bail;
}
@@ -1601,6 +1600,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
mlog_errno(status);
goto bail;
}
+ }
+
+ if (S_ISDIR(old_inode->i_mode)) {
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index a8d5ca98fa57..20aa37b67cfb 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -658,7 +658,6 @@ static struct ctl_table ocfs2_nm_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
- { }
};
static struct ctl_table_header *ocfs2_table_header;
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 9cacce5d55c1..6d1fbeca9d81 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -58,10 +58,10 @@ struct orangefs_dir {
* first part of the part list.
*/
-static int do_readdir(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry,
+static int do_readdir(struct orangefs_dir *od, struct inode *inode,
struct orangefs_kernel_op_s *op)
{
+ struct orangefs_inode_s *oi = ORANGEFS_I(inode);
struct orangefs_readdir_response_s *resp;
int bufi, r;
@@ -87,7 +87,7 @@ again:
op->upcall.req.readdir.buf_index = bufi;
r = service_operation(op, "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ get_interruptible_flag(inode));
orangefs_readdir_index_put(bufi);
@@ -158,8 +158,7 @@ static int parse_readdir(struct orangefs_dir *od,
return 0;
}
-static int orangefs_dir_more(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry)
+static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode)
{
struct orangefs_kernel_op_s *op;
int r;
@@ -169,7 +168,7 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
od->error = -ENOMEM;
return -ENOMEM;
}
- r = do_readdir(oi, od, dentry, op);
+ r = do_readdir(od, inode, op);
if (r) {
od->error = r;
goto out;
@@ -238,9 +237,7 @@ next:
return 1;
}
-static int orangefs_dir_fill(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry,
- struct dir_context *ctx)
+static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx)
{
struct orangefs_dir_part *part;
size_t count;
@@ -304,15 +301,10 @@ static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
static int orangefs_dir_iterate(struct file *file,
struct dir_context *ctx)
{
- struct orangefs_inode_s *oi;
- struct orangefs_dir *od;
- struct dentry *dentry;
+ struct orangefs_dir *od = file->private_data;
+ struct inode *inode = file_inode(file);
int r;
- dentry = file->f_path.dentry;
- oi = ORANGEFS_I(dentry->d_inode);
- od = file->private_data;
-
if (od->error)
return od->error;
@@ -342,7 +334,7 @@ static int orangefs_dir_iterate(struct file *file,
*/
while (od->token != ORANGEFS_ITERATE_END &&
ctx->pos > od->end) {
- r = orangefs_dir_more(oi, od, dentry);
+ r = orangefs_dir_more(od, inode);
if (r)
return r;
}
@@ -351,17 +343,17 @@ static int orangefs_dir_iterate(struct file *file,
/* Then try to fill if there's any left in the buffer. */
if (ctx->pos < od->end) {
- r = orangefs_dir_fill(oi, od, dentry, ctx);
+ r = orangefs_dir_fill(od, ctx);
if (r)
return r;
}
/* Finally get some more and try to fill. */
if (od->token != ORANGEFS_ITERATE_END) {
- r = orangefs_dir_more(oi, od, dentry);
+ r = orangefs_dir_more(od, inode);
if (r)
return r;
- r = orangefs_dir_fill(oi, od, dentry, ctx);
+ r = orangefs_dir_fill(od, ctx);
}
return r;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 696478f09cc1..b8e25ca51016 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -744,7 +744,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct path path = { .mnt = ovl_upper_mnt(ofs) };
- struct dentry *temp, *upper;
+ struct dentry *temp, *upper, *trap;
struct ovl_cu_creds cc;
int err;
struct ovl_cattr cattr = {
@@ -781,11 +781,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
* temp wasn't moved before copy up completion or cleanup.
*/
ovl_start_write(c->dentry);
- if (lock_rename(c->workdir, c->destdir) != NULL ||
- temp->d_parent != c->workdir) {
+ trap = lock_rename(c->workdir, c->destdir);
+ if (trap || temp->d_parent != c->workdir) {
/* temp or workdir moved underneath us? abort without cleanup */
dput(temp);
err = -EIO;
+ if (IS_ERR(trap))
+ goto out;
goto unlock;
} else if (err) {
goto cleanup;
@@ -826,6 +828,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
unlock_rename(c->workdir, c->destdir);
+out:
ovl_end_write(c->dentry);
return err;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index aab3f5d93556..0f8b4a719237 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1180,6 +1180,10 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
trap = lock_rename(new_upperdir, old_upperdir);
+ if (IS_ERR(trap)) {
+ err = PTR_ERR(trap);
+ goto out_revert_creds;
+ }
olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
old->d_name.len);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 6909c4a5da56..063409069f56 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -289,7 +289,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
{
struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
struct dentry *upper = upper_alias ?: index;
- struct dentry *dentry;
struct inode *inode = NULL;
struct ovl_entry *oe;
struct ovl_inode_params oip = {
@@ -320,27 +319,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
if (upper)
ovl_set_flag(OVL_UPPERDATA, inode);
- dentry = d_find_any_alias(inode);
- if (dentry)
- goto out_iput;
-
- dentry = d_alloc_anon(inode->i_sb);
- if (unlikely(!dentry))
- goto nomem;
-
- if (upper_alias)
- ovl_dentry_set_upper_alias(dentry);
-
- ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode));
-
- return d_instantiate_anon(dentry, inode);
-
-nomem:
- dput(dentry);
- dentry = ERR_PTR(-ENOMEM);
-out_iput:
- iput(inode);
- return dentry;
+ return d_obtain_alias(inode);
}
/* Get the upper or lower dentry in stack whose on layer @idx */
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 984ffdaeed6c..5764f91d283e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -18,10 +18,11 @@
struct ovl_lookup_data {
struct super_block *sb;
- struct vfsmount *mnt;
+ const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
bool opaque;
+ bool xwhiteouts;
bool stop;
bool last;
char *redirect;
@@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
-{
- return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
-}
-
static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
+ base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
size_t prelen, const char *post,
struct dentry **ret, bool drop_negative)
{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
struct dentry *this;
struct path path;
int err;
bool last_element = !post[0];
+ bool is_upper = d->layer->idx == 0;
+ char val;
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
@@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
}
path.dentry = this;
- path.mnt = d->mnt;
- if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
+ path.mnt = d->layer->mnt;
+ if (ovl_path_is_whiteout(ofs, &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
+ err = ovl_check_metacopy_xattr(ofs, &path, NULL);
if (err < 0)
goto out_err;
@@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+ /* overlay.opaque=x means xwhiteouts directory */
+ val = ovl_get_opaquedir_val(ofs, &path);
+ if (last_element && !is_upper && val == 'x') {
+ d->xwhiteouts = true;
+ ovl_layer_set_xwhiteouts(ofs, d->layer);
+ } else if (val == 'y') {
d->stop = true;
if (last_element)
d->opaque = true;
@@ -863,7 +868,8 @@ fail:
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
*/
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer)
{
struct ovl_entry *oe = OVL_E(dentry);
struct ovl_path *lowerstack = ovl_lowerstack(oe);
@@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
BUG_ON(idx < 0);
if (idx == 0) {
ovl_path_upper(dentry, path);
- if (path->dentry)
+ if (path->dentry) {
+ *layer = &OVL_FS(dentry->d_sb)->layers[0];
return ovl_numlower(oe) ? 1 : -1;
+ }
idx++;
}
BUG_ON(idx > ovl_numlower(oe));
path->dentry = lowerstack[idx - 1].dentry;
- path->mnt = lowerstack[idx - 1].layer->mnt;
+ *layer = lowerstack[idx - 1].layer;
+ path->mnt = (*layer)->mnt;
return (idx < ovl_numlower(oe)) ? idx + 1 : -1;
}
@@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- d.mnt = ovl_upper_mnt(ofs);
+ d.layer = &ofs->layers[0];
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
- d.mnt = lower.layer->mnt;
+ d.layer = lower.layer;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperopaque)
ovl_dentry_set_opaque(dentry);
+ if (d.xwhiteouts)
+ ovl_dentry_set_xwhiteouts(dentry);
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5ba11eb43767..ee949f3e7c77 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -50,7 +50,6 @@ enum ovl_xattr {
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
OVL_XATTR_XWHITEOUT,
- OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -70,6 +69,8 @@ enum ovl_entry_flag {
OVL_E_UPPER_ALIAS,
OVL_E_OPAQUE,
OVL_E_CONNECTED,
+ /* Lower stack may contain xwhiteout entries */
+ OVL_E_XWHITEOUTS,
};
enum {
@@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer);
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
void ovl_dentry_set_upper_alias(struct dentry *dentry);
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
@@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox);
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
@@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb,
.mnt = ovl_upper_mnt(ofs),
};
- return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+ return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y';
+}
+
+static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
+ const struct path *path)
+{
+ return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);
}
static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
@@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer);
int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 5fa9c58af65f..cb449ab310a7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -40,6 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
+ /* xwhiteouts were found on this layer */
+ bool has_xwhiteouts;
};
struct ovl_path {
@@ -59,7 +61,7 @@ struct ovl_fs {
unsigned int numfs;
/* Number of data-only lower layers */
unsigned int numdatalayer;
- const struct ovl_layer *layers;
+ struct ovl_layer *layers;
struct ovl_sb *fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e71156baa7bc..0ca8af060b0c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- rdd->in_xwhiteouts_dir = rdd->dentry &&
- ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.is_lowest = false,
};
int idx, next;
+ const struct ovl_layer *layer;
for (idx = 0; idx != -1; idx = next) {
- next = ovl_path_next(idx, dentry, &realpath);
+ next = ovl_path_next(idx, dentry, &realpath, &layer);
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+ rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
+ ovl_dentry_has_xwhiteouts(dentry);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0bbbe4818f67..2eef6c70b2ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -439,8 +439,10 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
bool ok = false;
if (workdir != upperdir) {
- ok = (lock_rename(workdir, upperdir) == NULL);
- unlock_rename(workdir, upperdir);
+ struct dentry *trap = lock_rename(workdir, upperdir);
+ if (!IS_ERR(trap))
+ unlock_rename(workdir, upperdir);
+ ok = (trap == NULL);
}
return ok;
}
@@ -1247,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
struct ovl_entry *oe)
{
struct dentry *root;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct ovl_path *lowerpath = ovl_lowerstack(oe);
unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
int fsid = lowerpath->layer->fsid;
@@ -1268,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_set_flag(OVL_IMPURE, d_inode(root));
}
+ /* Look for xwhiteouts marker except in the lowermost layer */
+ for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+ struct path path = {
+ .mnt = lowerpath->layer->mnt,
+ .dentry = lowerpath->dentry,
+ };
+
+ /* overlay.opaque=x means xwhiteouts directory */
+ if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+ ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+ ovl_dentry_set_xwhiteouts(root);
+ }
+ }
+
/* Root is always merge -> can have whiteouts */
ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 22b519763267..a8e17f14d7a2 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry)
ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
}
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+/*
+ * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
+ * dentry to dcache, while readdir of that same directory happens after
+ * the overlay dir dentry is in dcache, so if some cpu observes that
+ * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts
+ * for the layers where xwhiteouts marker was found in that merge dir.
+ */
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer)
+{
+ if (layer->has_xwhiteouts)
+ return;
+
+ /* Write once to read-mostly layer properties */
+ ofs->layers[layer->idx].has_xwhiteouts = true;
+}
+
/*
* For hard links and decoded file handles, it's possible for ovl_dentry_upper()
* to return positive, while there's no actual upper alias for the inode.
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
return res >= 0;
}
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
-{
- struct dentry *dentry = path->dentry;
- int res;
-
- /* xattr.whiteouts must be a directory */
- if (!d_is_dir(dentry))
- return false;
-
- res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
- return res >= 0;
-}
-
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -811,20 +825,17 @@ fail:
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox)
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
if (!d_is_dir(path->dentry))
- return false;
+ return 0;
res = ovl_path_getxattr(ofs, path, ox, &val, 1);
- if (res == 1 && val == 'y')
- return true;
-
- return false;
+ return res == 1 ? val : 0;
}
#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
@@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
-#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
- OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -1198,12 +1207,17 @@ void ovl_nlink_end(struct dentry *dentry)
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
{
+ struct dentry *trap;
+
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
- if (lock_rename(workdir, upperdir) != NULL)
+ trap = lock_rename(workdir, upperdir);
+ if (IS_ERR(trap))
+ goto err;
+ if (trap)
goto err_unlock;
return 0;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8d9286a1f2e8..f1adbfe743d4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1507,7 +1507,6 @@ static struct ctl_table fs_pipe_sysctls[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
- { }
};
#endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8064ea76f80b..37cde0efee57 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -44,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = {
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl_sz(path, sysctl_mount_point, 0);
+ return register_sysctl(path, sysctl_mount_point);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
@@ -71,7 +71,6 @@ static struct ctl_table root_table[] = {
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
- { }
};
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
@@ -233,7 +232,8 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
return -EROFS;
/* Am I creating a permanently empty directory? */
- if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) {
+ if (header->ctl_table_size > 0 &&
+ sysctl_is_perm_empty_ctl_table(header->ctl_table)) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
sysctl_set_perm_empty_ctl_header(dir_h);
@@ -534,13 +534,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (IS_ERR(inode)) {
- err = ERR_CAST(inode);
- goto out;
- }
-
d_set_d_op(dentry, &proc_sys_dentry_operations);
+ inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
err = d_splice_alias(inode, dentry);
out:
@@ -698,13 +693,8 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
if (d_in_lookup(child)) {
struct dentry *res;
- inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (IS_ERR(inode)) {
- d_lookup_done(child);
- dput(child);
- return false;
- }
d_set_d_op(child, &proc_sys_dentry_operations);
+ inode = proc_sys_make_inode(dir->d_sb, head, table);
res = d_splice_alias(inode, child);
d_lookup_done(child);
if (unlikely(res)) {
@@ -1213,6 +1203,10 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table_header *tmp_head;
struct ctl_table *entry, *link;
+ if (header->ctl_table_size == 0 ||
+ sysctl_is_perm_empty_ctl_table(header->ctl_table))
+ return true;
+
/* Are there links available for every entry in table? */
list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 62b16f42d5d2..3f78ebbb795f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2432,7 +2432,6 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
- struct mmu_notifier_range range;
struct pagemap_scan_private p = {0};
unsigned long walk_start;
size_t n_ranges_out = 0;
@@ -2448,15 +2447,9 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (ret)
return ret;
- /* Protection change for the range is going to happen. */
- if (p.arg.flags & PM_SCAN_WP_MATCHING) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
- mm, p.arg.start, p.arg.end);
- mmu_notifier_invalidate_range_start(&range);
- }
-
for (walk_start = p.arg.start; walk_start < p.arg.end;
walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
long n_out;
if (fatal_signal_pending(current)) {
@@ -2467,8 +2460,20 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
ret = mmap_read_lock_killable(mm);
if (ret)
break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
ret = walk_page_range(mm, walk_start, p.arg.end,
&pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
mmap_read_unlock(mm);
n_out = pagemap_scan_flush_buffer(&p);
@@ -2494,9 +2499,6 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (pagemap_scan_writeback_args(&p.arg, uarg))
ret = -EFAULT;
- if (p.arg.flags & PM_SCAN_WP_MATCHING)
- mmu_notifier_invalidate_range_end(&range);
-
kfree(p.vec_buf);
return ret;
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 44ff2813ae51..1f0c754416b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2969,7 +2969,6 @@ static struct ctl_table fs_dqstats_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- { },
};
static int __init dquot_init(void)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 994d6e6995ab..7e7b531fcc49 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -451,13 +451,6 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- /* cannot allow items to be added into a busy deleted directory */
- if (!namelen)
- return -EINVAL;
-
- if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
- return -ENAMETOOLONG;
-
/* each entry has unique key. compose it */
make_cpu_key(&entry_key, dir,
get_third_component(dir->i_sb, name, namelen),
@@ -1324,8 +1317,8 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
struct inode *old_inode, *new_dentry_inode;
struct reiserfs_transaction_handle th;
int jbegin_count;
- umode_t old_inode_mode;
unsigned long savelink = 1;
+ bool update_dir_parent = false;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1375,8 +1368,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
return -ENOENT;
}
- old_inode_mode = old_inode->i_mode;
- if (S_ISDIR(old_inode_mode)) {
+ if (S_ISDIR(old_inode->i_mode)) {
/*
* make sure that directory being renamed has correct ".."
* and that its new parent directory has not too many links
@@ -1389,24 +1381,28 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
}
}
- /*
- * directory is renamed, its parent directory will be changed,
- * so find ".." entry
- */
- dot_dot_de.de_gen_number_bit_string = NULL;
- retval =
- reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path,
+ if (old_dir != new_dir) {
+ /*
+ * directory is renamed, its parent directory will be
+ * changed, so find ".." entry
+ */
+ dot_dot_de.de_gen_number_bit_string = NULL;
+ retval =
+ reiserfs_find_entry(old_inode, "..", 2,
+ &dot_dot_entry_path,
&dot_dot_de);
- pathrelse(&dot_dot_entry_path);
- if (retval != NAME_FOUND) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
+ pathrelse(&dot_dot_entry_path);
+ if (retval != NAME_FOUND) {
+ reiserfs_write_unlock(old_dir->i_sb);
+ return -EIO;
+ }
- /* inode number of .. must equal old_dir->i_ino */
- if (dot_dot_de.de_objectid != old_dir->i_ino) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
+ /* inode number of .. must equal old_dir->i_ino */
+ if (dot_dot_de.de_objectid != old_dir->i_ino) {
+ reiserfs_write_unlock(old_dir->i_sb);
+ return -EIO;
+ }
+ update_dir_parent = true;
}
}
@@ -1486,7 +1482,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
- if (S_ISDIR(old_inode->i_mode)) {
+ if (update_dir_parent) {
if ((retval =
search_by_entry_key(new_dir->i_sb,
&dot_dot_de.de_entry_key,
@@ -1534,14 +1530,14 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
new_de.de_bh);
reiserfs_restore_prepared_buffer(old_inode->i_sb,
old_de.de_bh);
- if (S_ISDIR(old_inode_mode))
+ if (update_dir_parent)
reiserfs_restore_prepared_buffer(old_inode->
i_sb,
dot_dot_de.
de_bh);
continue;
}
- if (S_ISDIR(old_inode_mode)) {
+ if (update_dir_parent) {
if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
!entry_points_to_object("..", 2, &dot_dot_de,
old_dir)) {
@@ -1559,7 +1555,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
}
}
- RFALSE(S_ISDIR(old_inode_mode) &&
+ RFALSE(update_dir_parent &&
!buffer_journal_prepared(dot_dot_de.de_bh), "");
break;
@@ -1592,11 +1588,12 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
savelink = new_dentry_inode->i_nlink;
}
- if (S_ISDIR(old_inode_mode)) {
+ if (update_dir_parent) {
/* adjust ".." of renamed directory */
set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
journal_mark_dirty(&th, dot_dot_de.de_bh);
-
+ }
+ if (S_ISDIR(old_inode->i_mode)) {
/*
* there (in new_dir) was no directory, so it got new link
* (".." of renamed directory)
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index d64a306a414b..1daeb5714faa 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cached_fid *cfid;
struct cached_fids *cfids;
const char *npath;
+ int retries = 0, cur_sleep = 1;
if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0))
return -EOPNOTSUPP;
ses = tcon->ses;
- server = ses->server;
cfids = tcon->cfids;
- if (!server->ops->new_lease_key)
- return -EIO;
-
if (cifs_sb->root == NULL)
return -ENOENT;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_II;
+ server = cifs_pick_channel(ses);
+
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
@@ -268,6 +274,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
*/
cfid->has_lease = true;
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -367,6 +378,11 @@ out:
atomic_inc(&tcon->num_remote_opens);
}
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 60027f5aebe8..3e4209f41c18 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
tcon->bytes_written = 0;
+ tcon->stats_from_time = ktime_get_real_seconds();
spin_unlock(&tcon->stat_lock);
if (server->ops->clear_stats)
server->ops->clear_stats(tcon);
@@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
- seq_printf(m, "\nSMBs: %d",
- atomic_read(&tcon->num_smbs_sent));
+ seq_printf(m, "\nSMBs: %d since %ptTs UTC",
+ atomic_read(&tcon->num_smbs_sent),
+ &tcon->stats_from_time);
if (server->ops->print_stats)
server->ops->print_stats(m, tcon);
}
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index ef4c2e3c9fa6..6322f0f68a17 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
UniStrupr(user);
} else {
- memset(user, '\0', 2);
+ *(u16 *)user = 0;
}
rc = crypto_shash_update(ses->server->secmech.hmacmd5,
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 99b0ade833aa..2a4a4e3a8751 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
- cifs_inode->server_eof = 0;
+ cifs_inode->netfs.remote_i_size = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
@@ -430,7 +430,7 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
+ if (tcon->ses->server->retrans)
+ seq_printf(s, ",retrans=%u", tcon->ses->server->retrans);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
@@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int cifs_drop_inode(struct inode *inode)
@@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s
if (rc < 0)
goto set_failed;
- netfs_resize_file(&src_cifsi->netfs, src_end);
+ netfs_resize_file(&src_cifsi->netfs, src_end, true);
fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
return 0;
@@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
smb_file_src, smb_file_target, off, len, destoff);
if (rc == 0 && new_size > i_size_read(target_inode)) {
truncate_setsize(target_inode, new_size);
- netfs_resize_file(&target_cifsi->netfs, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size, true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
@@ -1379,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+ struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
@@ -1427,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
- if (src_cifsi->server_eof < off + len) {
+ if (src_cifsi->netfs.remote_i_size < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
@@ -1451,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+ i_size_read(target_inode), 0);
+
rc = file_modified(dst_file);
if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
- if (rc > 0 && destoff + rc > i_size_read(target_inode))
+ if (rc > 0 && destoff + rc > i_size_read(target_inode)) {
truncate_setsize(target_inode, destoff + rc);
+ netfs_resize_file(&target_cifsi->netfs,
+ i_size_read(target_inode), true);
+ fscache_resize_cookie(cifs_inode_cookie(target_inode),
+ i_size_read(target_inode));
+ }
+ if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = destoff + rc;
}
file_accessed(src_file);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 3adea10aa9da..685f7d1139c6 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 46
-#define CIFS_VERSION "2.46"
+#define SMB3_PRODUCT_BUILD 47
+#define CIFS_VERSION "2.47"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 5e32c79f03a7..c86a72c9d9ec 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -50,6 +50,11 @@
#define CIFS_DEF_ACTIMEO (1 * HZ)
/*
+ * max sleep time before retry to server
+ */
+#define CIFS_MAX_SLEEP 2000
+
+/*
* max attribute cache timeout (jiffies) - 2^30
*/
#define CIFS_MAX_ACTIMEO (1 << 30)
@@ -82,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 5
+#define MAX_COMPOUND 7
/*
* Default number of credits to keep available for SMB3.
@@ -192,6 +197,11 @@ struct cifs_open_info_data {
bool symlink;
};
struct {
+ /* ioctl response buffer */
+ struct {
+ int buftype;
+ struct kvec iov;
+ } io;
__u32 tag;
union {
struct reparse_data_buffer *buf;
@@ -199,19 +209,25 @@ struct cifs_open_info_data {
};
} reparse;
char *symlink_target;
+ struct cifs_sid posix_owner;
+ struct cifs_sid posix_group;
union {
struct smb2_file_all_info fi;
struct smb311_posix_qinfo posix_fi;
};
};
-#define cifs_open_data_reparse(d) \
- ((d)->reparse_point || \
- (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE))
-
-static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
{
- kfree(data->symlink_target);
+ struct smb2_file_all_info *fi = &data->fi;
+ u32 attrs = le32_to_cpu(fi->Attributes);
+ bool ret;
+
+ ret = data->reparse_point || (attrs & ATTR_REPARSE);
+ if (ret)
+ attrs |= ATTR_REPARSE;
+ fi->Attributes = cpu_to_le32(attrs);
+ return ret;
}
/*
@@ -390,12 +406,17 @@ struct smb_version_operations {
int (*rename_pending_delete)(const char *, struct dentry *,
const unsigned int);
/* send rename request */
- int (*rename)(const unsigned int, struct cifs_tcon *, const char *,
- const char *, struct cifs_sb_info *);
+ int (*rename)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
/* send create hardlink request */
- int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
- const char *, const char *,
- struct cifs_sb_info *);
+ int (*create_hardlink)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
/* query symlink target */
int (*query_symlink)(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -560,6 +581,12 @@ struct smb_version_operations {
int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
struct kvec *rsp_iov,
struct cifs_open_info_data *data);
+ int (*create_reparse_symlink)(const unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ const char *symname);
};
struct smb_version_values {
@@ -731,6 +758,7 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ unsigned int retrans;
__le16 compress_algorithm;
__u16 signing_algorithm;
__le16 cipher_type;
@@ -1004,6 +1032,8 @@ struct cifs_chan {
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
+#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+
/*
* Session structure. One of these for each uid session with a particular host
*/
@@ -1036,6 +1066,7 @@ struct cifs_ses {
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
bool domainAuto:1;
+ unsigned int flags;
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
@@ -1187,6 +1218,7 @@ struct cifs_tcon {
__u64 bytes_read;
__u64 bytes_written;
spinlock_t stat_lock; /* protects the two fields above */
+ time64_t stats_from_time;
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
FILE_SYSTEM_UNIX_INFO fsUnixInfo;
@@ -1477,6 +1509,7 @@ struct cifs_writedata {
struct smbd_mr *mr;
#endif
struct cifs_credits credits;
+ bool replay;
};
/*
@@ -1537,7 +1570,6 @@ struct cifsInodeInfo {
spinlock_t writers_lock;
unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
- u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
@@ -1545,6 +1577,7 @@ struct cifsInodeInfo {
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
char *symlink_target;
+ __u32 reparse_tag;
};
static inline struct cifsInodeInfo *
@@ -1806,6 +1839,13 @@ static inline bool is_retryable_error(int error)
return false;
}
+static inline bool is_replayable_error(int error)
+{
+ if (error == -EAGAIN || error == -ECONNABORTED)
+ return true;
+ return false;
+}
+
/* cifs_get_writable_file() flags */
#define FIND_WR_ANY 0
@@ -2238,8 +2278,8 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
struct smb2_compound_vars {
struct cifs_open_parms oparms;
- struct kvec rsp_iov[3];
- struct smb_rqst rqst[3];
+ struct kvec rsp_iov[MAX_COMPOUND];
+ struct smb_rqst rqst[MAX_COMPOUND];
struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 46feaa0880bd..a841bf4967fa 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -211,8 +211,12 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data);
-extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
- struct super_block *sb, unsigned int xid);
+
+extern int smb311_posix_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, unsigned int xid);
@@ -435,16 +439,19 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
int remap_special_chars);
extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
-extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon,
int netfid, const char *target_name,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int CIFSCreateHardLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSUnixCreateHardLink(const unsigned int xid,
struct cifs_tcon *tcon,
const char *fromName, const char *toName,
@@ -649,7 +656,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
cifs_disable_secondary_channels(struct cifs_ses *ses);
-int
+void
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount);
@@ -760,4 +767,11 @@ static inline void release_mid(struct mid_q_entry *mid)
kref_put(&mid->refcount, __release_mid);
}
+static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+{
+ kfree(data->symlink_target);
+ free_rsp_buf(data->reparse.io.buftype, data->reparse.io.iov.iov_base);
+ memset(data, 0, sizeof(*data));
+}
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 9ee348e6d106..01e89070df5a 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -2149,10 +2149,10 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
return rc;
}
-int
-CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
RENAME_REQ *pSMB = NULL;
@@ -2530,10 +2530,11 @@ createHardLinkRetry:
return rc;
}
-int
-CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int CIFSCreateHardLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
NT_RENAME_REQ *pSMB = NULL;
@@ -2699,11 +2700,12 @@ int cifs_query_reparse_point(const unsigned int xid,
u32 *tag, struct kvec *rsp,
int *rsp_buftype)
{
+ struct reparse_data_buffer *buf;
struct cifs_open_parms oparms;
TRANSACT_IOCTL_REQ *io_req = NULL;
TRANSACT_IOCTL_RSP *io_rsp = NULL;
struct cifs_fid fid;
- __u32 data_offset, data_count;
+ __u32 data_offset, data_count, len;
__u8 *start, *end;
int io_rsp_len;
int oplock = 0;
@@ -2773,7 +2775,16 @@ int cifs_query_reparse_point(const unsigned int xid,
goto error;
}
- *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag);
+ data_count = le16_to_cpu(io_rsp->ByteCount);
+ buf = (struct reparse_data_buffer *)start;
+ len = sizeof(*buf);
+ if (data_count < len ||
+ data_count < le16_to_cpu(buf->ReparseDataLength) + len) {
+ rc = -EIO;
+ goto error;
+ }
+
+ *tag = le32_to_cpu(buf->ReparseTag);
rsp->iov_base = io_rsp;
rsp->iov_len = io_rsp_len;
*rsp_buftype = CIFS_LARGE_BUFFER;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dc9b95ca71e6..bfd568f89710 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -483,6 +483,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_
static int reconnect_dfs_server(struct TCP_Server_Info *server)
{
struct dfs_cache_tgt_iterator *target_hint = NULL;
+
DFS_CACHE_TGT_LIST(tl);
int num_targets = 0;
int rc = 0;
@@ -745,6 +746,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
+
iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
@@ -1400,11 +1402,13 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs)
case AF_INET: {
struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+
return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
+
return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr)
&& saddr6->sin6_scope_id == vaddr6->sin6_scope_id);
}
@@ -1570,6 +1574,9 @@ static int match_server(struct TCP_Server_Info *server,
if (server->min_offload != ctx->min_offload)
return 0;
+ if (server->retrans != ctx->retrans)
+ return 0;
+
return 1;
}
@@ -1794,6 +1801,7 @@ smbd_connected:
goto out_err_crypto_release;
}
tcp_ses->min_offload = ctx->min_offload;
+ tcp_ses->retrans = ctx->retrans;
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2599,8 +2607,8 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
rc = -EOPNOTSUPP;
goto out_fail;
} else {
- cifs_dbg(VFS, "Check vers= mount option. SMB3.11 "
- "disabled but required for POSIX extensions\n");
+ cifs_dbg(VFS,
+ "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n");
rc = -EOPNOTSUPP;
goto out_fail;
}
@@ -2743,7 +2751,6 @@ cifs_put_tlink(struct tcon_link *tlink)
if (!IS_ERR(tlink_tcon(tlink)))
cifs_put_tcon(tlink_tcon(tlink));
kfree(tlink);
- return;
}
static int
@@ -2884,6 +2891,7 @@ static inline void
cifs_reclassify_socket4(struct socket *sock)
{
struct sock *sk = sock->sk;
+
BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
@@ -2893,6 +2901,7 @@ static inline void
cifs_reclassify_socket6(struct socket *sock)
{
struct sock *sk = sock->sk;
+
BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
@@ -2927,15 +2936,18 @@ static int
bind_socket(struct TCP_Server_Info *server)
{
int rc = 0;
+
if (server->srcaddr.ss_family != AF_UNSPEC) {
/* Bind to the specified local IP address */
struct socket *socket = server->ssocket;
+
rc = kernel_bind(socket,
(struct sockaddr *) &server->srcaddr,
sizeof(server->srcaddr));
if (rc < 0) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
+
saddr4 = (struct sockaddr_in *)&server->srcaddr;
saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
if (saddr6->sin6_family == AF_INET6)
@@ -3165,6 +3177,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
/*
* check for reconnect case in which we do not
@@ -3668,7 +3681,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
smb_buffer_response = smb_buffer;
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
- NULL /*no tid */ , 4 /*wct */ );
+ NULL /*no tid */, 4 /*wct */);
smb_buffer->Mid = get_next_mid(ses->server);
smb_buffer->Uid = ses->Suid;
@@ -3687,12 +3700,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- if (ses->capabilities & CAP_STATUS32) {
+ if (ses->capabilities & CAP_STATUS32)
smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
- }
- if (ses->capabilities & CAP_DFS) {
+
+ if (ses->capabilities & CAP_DFS)
smb_buffer->Flags2 |= SMBFLG2_DFS;
- }
+
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 580a27a3a7e6..89333d9bce36 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -680,9 +680,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
full_path, d_inode(direntry));
again:
- if (pTcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid);
- else if (pTcon->unix_ext) {
+ if (pTcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&newInode, full_path, NULL,
+ parent_dir_inode->i_sb, xid);
+ } else if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newInode, full_path,
parent_dir_inode->i_sb, xid);
} else {
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 4e84e88b47e3..b75282c204da 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le
xas_for_each(&xas, folio, end) {
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -1020,14 +1020,16 @@ reopen_success:
if (!is_interrupt_error(rc))
mapping_set_error(inode->i_mapping, rc);
- if (tcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, inode->i_sb, xid);
- else if (tcon->unix_ext)
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, inode->i_sb, xid);
+ } else if (tcon->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path,
inode->i_sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL,
inode->i_sb, xid, NULL);
+ }
}
/*
* Else we are writing out data to server already and could deadlock if
@@ -2118,8 +2120,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
{
loff_t end_of_write = offset + bytes_written;
- if (end_of_write > cifsi->server_eof)
- cifsi->server_eof = end_of_write;
+ if (end_of_write > cifsi->netfs.remote_i_size)
+ netfs_resize_file(&cifsi->netfs, end_of_write, true);
}
static ssize_t
@@ -2649,7 +2651,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
continue;
if (xa_is_value(folio))
break;
- if (folio_index(folio) != index)
+ if (folio->index != index)
break;
if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
@@ -2897,7 +2899,7 @@ redo_folio:
goto skip_write;
}
- if (folio_mapping(folio) != mapping ||
+ if (folio->mapping != mapping ||
!folio_test_dirty(folio)) {
start += folio_size(folio);
folio_unlock(folio);
@@ -3245,8 +3247,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_lock(&inode->i_lock);
cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
- if (cifsi->server_eof > inode->i_size)
- i_size_write(inode, cifsi->server_eof);
+ if (cifsi->netfs.remote_i_size > inode->i_size)
+ i_size_write(inode, cifsi->netfs.remote_i_size);
spin_unlock(&inode->i_lock);
complete(&wdata->done);
@@ -3298,6 +3300,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
if (wdata->cfile->invalidHandle)
rc = -EAGAIN;
else {
+ wdata->replay = true;
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr) {
wdata->mr->need_invalidate = true;
@@ -5041,27 +5044,13 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-#ifdef CONFIG_CIFS_FSCACHE
-static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- cifs_inode_cookie(mapping->host));
-}
-#else
-#define cifs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
@@ -5085,7 +5074,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index a3493da12ad1..52cbef2eeb28 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("dir_mode", Opt_dirmode),
fsparam_u32("port", Opt_port),
fsparam_u32("min_enc_offload", Opt_min_enc_offload),
+ fsparam_u32("retrans", Opt_retrans),
fsparam_u32("esize", Opt_min_enc_offload),
fsparam_u32("bsize", Opt_blocksize),
fsparam_u32("rasize", Opt_rasize),
@@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_min_enc_offload:
ctx->min_offload = result.uint_32;
break;
+ case Opt_retrans:
+ ctx->retrans = result.uint_32;
+ break;
case Opt_blocksize:
/*
* inode blocksize realistically should never need to be
@@ -1619,6 +1623,8 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->backupuid_specified = false; /* no backup intent for a user */
ctx->backupgid_specified = false; /* no backup intent for a group */
+ ctx->retrans = 1;
+
/*
* short int override_uid = -1;
* short int override_gid = -1;
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index cf46916286d0..182ce11cbe93 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -118,6 +118,7 @@ enum cifs_param {
Opt_file_mode,
Opt_dirmode,
Opt_min_enc_offload,
+ Opt_retrans,
Opt_blocksize,
Opt_rasize,
Opt_rsize,
@@ -245,6 +246,7 @@ struct smb3_fs_context {
unsigned int rsize;
unsigned int wsize;
unsigned int min_offload;
+ unsigned int retrans;
bool sockopt_tcp_nodelay:1;
/* attribute cache timemout for files and directories in jiffies */
unsigned long acregmax;
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index e5cad149f5a2..c4a3cb736881 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_
if (ret < 0)
return ret;
- ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
no_space_allocated_yet);
if (ret == 0)
ret = fscache_write(&cres, start, &iter, NULL, NULL);
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 09c5c0f5c96e..d02f8ba29cb5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
mtime = inode_get_mtime(inode);
if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
- cifs_i->server_eof == fattr->cf_eof) {
+ cifs_i->netfs.remote_i_size == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
return;
@@ -182,6 +182,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_mode = fattr->cf_mode;
cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+ cifs_i->reparse_tag = fattr->cf_cifstag;
if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
cifs_i->time = 0;
@@ -193,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
- cifs_i->server_eof = fattr->cf_eof;
+ cifs_i->netfs.remote_i_size = fattr->cf_eof;
/*
* Can't safely change the file size here if the client is writing to
* it due to potential races.
@@ -209,7 +210,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
}
- if (S_ISLNK(fattr->cf_mode)) {
+ if (S_ISLNK(fattr->cf_mode) && fattr->cf_symlink_target) {
kfree(cifs_i->symlink_target);
cifs_i->symlink_target = fattr->cf_symlink_target;
fattr->cf_symlink_target = NULL;
@@ -664,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group,
struct super_block *sb)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -691,31 +690,38 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
}
+ /*
+ * The srv fs device id is overridden on network mount so setting
+ * @fattr->cf_rdev isn't needed here.
+ */
fattr->cf_eof = le64_to_cpu(info->EndOfFile);
fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
-
fattr->cf_nlink = le32_to_cpu(info->HardLinks);
fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode);
- /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */
- /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */
- if (data->symlink) {
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- fattr->cf_symlink_target = data->symlink_target;
- data->symlink_target = NULL;
- } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+ if (cifs_open_data_reparse(data) &&
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
+ goto out_reparse;
+
+ fattr->cf_mode &= ~S_IFMT;
+ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode |= S_IFDIR;
fattr->cf_dtype = DT_DIR;
} else { /* file */
fattr->cf_mode |= S_IFREG;
fattr->cf_dtype = DT_REG;
}
- /* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
- sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
- sid_to_id(cifs_sb, group, fattr, SIDGROUP);
+out_reparse:
+ if (S_ISLNK(fattr->cf_mode)) {
+ if (likely(data->symlink_target))
+ fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX);
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
+ }
+ sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -738,25 +744,25 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
if (tag == IO_REPARSE_TAG_NFS && buf) {
switch (le64_to_cpu(buf->InodeType)) {
case NFS_SPECFILE_CHR:
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
fattr->cf_rdev = nfs_mkdev(buf);
break;
case NFS_SPECFILE_BLK:
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
fattr->cf_rdev = nfs_mkdev(buf);
break;
case NFS_SPECFILE_FIFO:
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFIFO;
fattr->cf_dtype = DT_FIFO;
break;
case NFS_SPECFILE_SOCK:
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFSOCK;
fattr->cf_dtype = DT_SOCK;
break;
case NFS_SPECFILE_LNK:
- fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -768,29 +774,29 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
switch (tag) {
case IO_REPARSE_TAG_LX_SYMLINK:
- fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
case IO_REPARSE_TAG_LX_FIFO:
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFIFO;
fattr->cf_dtype = DT_FIFO;
break;
case IO_REPARSE_TAG_AF_UNIX:
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFSOCK;
fattr->cf_dtype = DT_SOCK;
break;
case IO_REPARSE_TAG_LX_CHR:
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
break;
case IO_REPARSE_TAG_LX_BLK:
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
break;
case 0: /* SMB1 symlink */
case IO_REPARSE_TAG_SYMLINK:
case IO_REPARSE_TAG_NFS:
- fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -830,6 +836,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ fattr->cf_mode = cifs_sb->ctx->file_mode;
if (cifs_open_data_reparse(data) &&
cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
goto out_reparse;
@@ -1076,6 +1083,9 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
&rsp_iov, &rsp_buftype);
if (!rc)
iov = &rsp_iov;
+ } else if (data->reparse.io.buftype != CIFS_NO_BUFFER &&
+ data->reparse.io.iov.iov_base) {
+ iov = &data->reparse.io.iov;
}
rc = -EOPNOTSUPP;
@@ -1092,17 +1102,22 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
rc = 0;
goto out;
default:
- if (data->symlink_target) {
+ /* Check for cached reparse point data */
+ if (data->symlink_target || data->reparse.buf) {
rc = 0;
- } else if (server->ops->parse_reparse_point) {
+ } else if (iov && server->ops->parse_reparse_point) {
rc = server->ops->parse_reparse_point(cifs_sb,
iov, data);
}
break;
}
- cifs_open_info_to_fattr(fattr, data, sb);
+ if (tcon->posix_extensions)
+ smb311_posix_info_to_fattr(fattr, data, sb);
+ else
+ cifs_open_info_to_fattr(fattr, data, sb);
out:
+ fattr->cf_cifstag = data->reparse.tag;
free_rsp_buf(rsp_buftype, rsp_iov.iov_base);
return rc;
}
@@ -1290,31 +1305,34 @@ out:
return rc;
}
-static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
+static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
+ struct cifs_fattr *fattr,
const char *full_path,
struct super_block *sb,
const unsigned int xid)
{
- struct cifs_open_info_data data = {};
+ struct cifs_open_info_data tmp_data = {};
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifs_sid owner, group;
int tmprc;
- int rc;
+ int rc = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/*
- * 1. Fetch file metadata
+ * 1. Fetch file metadata if not provided (data)
*/
-
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, &data,
- &owner, &group);
+ if (!data) {
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, &tmp_data);
+ data = &tmp_data;
+ }
/*
* 2. Convert it to internal cifs metadata (fattr)
@@ -1322,7 +1340,12 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb);
+ if (cifs_open_data_reparse(data)) {
+ rc = reparse_info_to_fattr(data, sb, xid, tcon,
+ full_path, fattr);
+ } else {
+ smb311_posix_info_to_fattr(fattr, data, sb);
+ }
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1353,12 +1376,15 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
out:
cifs_put_tlink(tlink);
- cifs_free_open_info(&data);
+ cifs_free_open_info(data);
return rc;
}
-int smb311_posix_get_inode_info(struct inode **inode, const char *full_path,
- struct super_block *sb, const unsigned int xid)
+int smb311_posix_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid)
{
struct cifs_fattr fattr = {};
int rc;
@@ -1368,7 +1394,7 @@ int smb311_posix_get_inode_info(struct inode **inode, const char *full_path,
return 0;
}
- rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid);
+ rc = smb311_posix_get_fattr(data, &fattr, full_path, sb, xid);
if (rc)
goto out;
@@ -1516,7 +1542,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
if (tcon->posix_extensions)
- rc = smb311_posix_get_fattr(&fattr, path, sb, xid);
+ rc = smb311_posix_get_fattr(NULL, &fattr, path, sb, xid);
else
rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path);
@@ -1889,16 +1915,18 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
int rc = 0;
struct inode *inode = NULL;
- if (tcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid);
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, parent->i_sb, xid);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- else if (tcon->unix_ext)
+ } else if (tcon->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
xid);
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
xid, NULL);
+ }
if (rc)
return rc;
@@ -2219,7 +2247,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
return -ENOSYS;
/* try path-based rename first */
- rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb);
+ rc = server->ops->rename(xid, tcon, from_dentry,
+ from_path, to_path, cifs_sb);
/*
* Don't bother with rename by filehandle unless file is busy and
@@ -2579,13 +2608,15 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
dentry, cifs_get_time(dentry), jiffies);
again:
- if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, sb, xid);
- else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
+ if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, sb, xid);
+ } else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
+ }
if (rc == -EAGAIN && count++ < 10)
goto again;
out:
@@ -2827,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
set_size_out:
if (rc == 0) {
- cifsInode->server_eof = attrs->ia_size;
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
cifs_setsize(inode, attrs->ia_size);
/*
* i_blocks is not related to (i_size / i_blksize), but instead
@@ -2980,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
@@ -3179,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index a1da50e66fbb..d86da949a919 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
rc = -ENOSYS;
goto cifs_hl_exit;
}
- rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
- cifs_sb);
+ rc = server->ops->create_hardlink(xid, tcon, old_file,
+ from_name, to_name, cifs_sb);
if ((rc == -EIO) || (rc == -EINVAL))
rc = -EOPNOTSUPP;
}
@@ -569,6 +569,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
int rc = -EOPNOTSUPP;
unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
const char *full_path;
@@ -590,6 +591,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
goto symlink_exit;
}
pTcon = tlink_tcon(tlink);
+ server = cifs_pick_channel(pTcon->ses);
full_path = build_path_from_dentry(direntry, page);
if (IS_ERR(full_path)) {
@@ -601,27 +603,32 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- else if (pTcon->unix_ext)
+ } else if (pTcon->unix_ext) {
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- /* else
- rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
- cifs_sb_target->local_nls); */
+ } else if (server->ops->create_reparse_symlink) {
+ rc = server->ops->create_reparse_symlink(xid, inode, direntry,
+ pTcon, full_path,
+ symname);
+ goto symlink_exit;
+ }
if (rc == 0) {
- if (pTcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&newinode, full_path, inode->i_sb, xid);
- else if (pTcon->unix_ext)
+ if (pTcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&newinode, full_path,
+ NULL, inode->i_sb, xid);
+ } else if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&newinode, full_path, NULL,
inode->i_sb, xid, NULL);
+ }
if (rc != 0) {
cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index c2137ea3c253..0748d7b757b9 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled)
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
+ ret_buf->stats_from_time = ktime_get_real_seconds();
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index d30ea2005eb3..3b1b01d10f7d 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -56,6 +56,23 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
#endif /* DEBUG2 */
/*
+ * Match a reparse point inode if reparse tag and ctime haven't changed.
+ *
+ * Windows Server updates ctime of reparse points when their data have changed.
+ * The server doesn't allow changing reparse tags from existing reparse points,
+ * though it's worth checking.
+ */
+static inline bool reparse_inode_match(struct inode *inode,
+ struct cifs_fattr *fattr)
+{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
+ return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
+ CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
+ timespec64_equal(&ctime, &fattr->cf_ctime);
+}
+
+/*
* Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
*
* Find the dentry that matches "name". If there isn't one, create one. If it's
@@ -71,6 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct super_block *sb = parent->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ int rc;
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
@@ -82,9 +100,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
* We'll end up doing an on the wire call either way and
* this spares us an invalidation.
*/
- if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
- return;
retry:
+ if ((fattr->cf_cifsattrs & ATTR_REPARSE) ||
+ (fattr->cf_flags & CIFS_FATTR_NEED_REVAL))
+ return;
+
dentry = d_alloc_parallel(parent, name, &wq);
}
if (IS_ERR(dentry))
@@ -104,12 +124,34 @@ retry:
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
- /* update inode in place
- * if both i_ino and i_mode didn't change */
- if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
- cifs_fattr_to_inode(inode, fattr) == 0) {
- dput(dentry);
- return;
+ /*
+ * Update inode in place if both i_ino and i_mode didn't
+ * change.
+ */
+ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ /*
+ * Query dir responses don't provide enough
+ * information about reparse points other than
+ * their reparse tags. Save an invalidation by
+ * not clobbering some existing attributes when
+ * reparse tag and ctime haven't changed.
+ */
+ rc = 0;
+ if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+ if (likely(reparse_inode_match(inode, fattr))) {
+ fattr->cf_mode = inode->i_mode;
+ fattr->cf_rdev = inode->i_rdev;
+ fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
+ fattr->cf_symlink_target = NULL;
+ } else {
+ CIFS_I(inode)->time = 0;
+ rc = -ESTALE;
+ }
+ }
+ if (!rc && !cifs_fattr_to_inode(inode, fattr)) {
+ dput(dentry);
+ return;
+ }
}
}
d_invalidate(dentry);
@@ -127,29 +169,6 @@ retry:
dput(dentry);
}
-static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
-{
- if (!(fattr->cf_cifsattrs & ATTR_REPARSE))
- return false;
- /*
- * The DFS tags should be only intepreted by server side as per
- * MS-FSCC 2.1.2.1, but let's include them anyway.
- *
- * Besides, if cf_cifstag is unset (0), then we still need it to be
- * revalidated to know exactly what reparse point it is.
- */
- switch (fattr->cf_cifstag) {
- case IO_REPARSE_TAG_DFS:
- case IO_REPARSE_TAG_DFSR:
- case IO_REPARSE_TAG_SYMLINK:
- case IO_REPARSE_TAG_NFS:
- case IO_REPARSE_TAG_MOUNT_POINT:
- case 0:
- return true;
- }
- return false;
-}
-
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -181,14 +200,6 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
out_reparse:
- /*
- * We need to revalidate it further to make a decision about whether it
- * is a symbolic link, DFS referral or a reparse point with a direct
- * access like junctions, deduplicated files, NFS symlinks.
- */
- if (reparse_file_needs_reval(fattr))
- fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-
/* non-unix readdir doesn't provide nlink */
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
@@ -269,9 +280,6 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
fattr->cf_dtype = DT_REG;
}
- if (reparse_file_needs_reval(fattr))
- fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-
sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER);
sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP);
}
@@ -331,38 +339,6 @@ cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
cifs_fill_common_info(fattr, cifs_sb);
}
-/* BB eventually need to add the following helper function to
- resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
- we try to do FindFirst on (NTFS) directory symlinks */
-/*
-int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
- unsigned int xid)
-{
- __u16 fid;
- int len;
- int oplock = 0;
- int rc;
- struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
- char *tmpbuffer;
-
- rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
- OPEN_REPARSE_POINT, &fid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb);
- if (!rc) {
- tmpbuffer = kmalloc(maxpath);
- rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
- tmpbuffer,
- maxpath -1,
- fid,
- cifs_sb->local_nls);
- if (CIFSSMBClose(xid, ptcon, fid)) {
- cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
- }
- }
-}
- */
-
static int
_initiate_cifs_search(const unsigned int xid, struct file *file,
const char *full_path)
@@ -431,13 +407,10 @@ ffirst_retry:
&cifsFile->fid, search_flags,
&cifsFile->srch_inf);
- if (rc == 0)
+ if (rc == 0) {
cifsFile->invalidHandle = false;
- /* BB add following call to handle readdir on new NTFS symlink errors
- else if STATUS_STOPPED_ON_SYMLINK
- call get_symlink_reparse_path and retry with new path */
- else if ((rc == -EOPNOTSUPP) &&
- (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+ } else if ((rc == -EOPNOTSUPP) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
goto ffirst_retry;
}
@@ -672,10 +645,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
static int is_dir_changed(struct file *file)
{
struct inode *inode = file_inode(file);
- struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+ struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode);
- if (cifsInfo->time == 0)
- return 1; /* directory was changed, perhaps due to unlink */
+ if (cifs_inode_info->time == 0)
+ return 1; /* directory was changed, e.g. unlink or new file */
else
return 0;
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 2d3b332a79a1..ed4bd88dd528 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
{
unsigned int i;
+ /* if the channel is waiting for termination */
+ if (server->terminate)
+ return CIFS_INVAL_CHAN_INDEX;
+
for (i = 0; i < ses->chan_count; i++) {
if (ses->chans[i].server == server)
return i;
@@ -269,6 +273,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
&iface->sockaddr,
rc);
kref_put(&iface->refcount, release_iface);
+ /* failure to add chan should increase weight */
+ iface->weight_fulfilled++;
continue;
}
@@ -356,10 +362,9 @@ done:
/*
* update the iface for the channel if necessary.
- * will return 0 when iface is updated, 1 if removed, 2 otherwise
* Must be called with chan_lock held.
*/
-int
+void
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
unsigned int chan_index;
@@ -368,20 +373,19 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
struct cifs_server_iface *old_iface = NULL;
struct cifs_server_iface *last_iface = NULL;
struct sockaddr_storage ss;
- int rc = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
- return 0;
+ return;
}
if (ses->chans[chan_index].iface) {
old_iface = ses->chans[chan_index].iface;
if (old_iface->is_active) {
spin_unlock(&ses->chan_lock);
- return 1;
+ return;
}
}
spin_unlock(&ses->chan_lock);
@@ -394,7 +398,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
- return 0;
+ return;
}
last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
@@ -434,16 +438,21 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
- rc = 1;
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
if (!iface) {
- cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
- &ss);
+ if (!chan_index)
+ cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
+ &ss);
+ else {
+ cifs_dbg(FYI, "unable to find another interface to replace: %pIS\n",
+ &old_iface->sockaddr);
+ }
+
spin_unlock(&ses->iface_lock);
- return 0;
+ return;
}
/* now drop the ref to the current iface */
@@ -459,34 +468,24 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
iface->weight_fulfilled++;
kref_put(&old_iface->refcount, release_iface);
- } else if (old_iface) {
- /* if a new candidate is not found, keep things as is */
- cifs_dbg(FYI, "could not replace iface: %pIS\n",
- &old_iface->sockaddr);
} else if (!chan_index) {
/* special case: update interface for primary channel */
- if (iface) {
- cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
- &iface->sockaddr);
- iface->num_channels++;
- iface->weight_fulfilled++;
- }
+ cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
+ &iface->sockaddr);
+ iface->num_channels++;
+ iface->weight_fulfilled++;
}
spin_unlock(&ses->iface_lock);
- if (iface) {
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- return 0;
- }
-
- ses->chans[chan_index].iface = iface;
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
+ return;
}
- return rc;
+ ses->chans[chan_index].iface = iface;
+ spin_unlock(&ses->chan_lock);
}
/*
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 82e916ad167c..a0c156996fc5 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -23,17 +23,21 @@
* Identifiers for functions that use the open, operation, close pattern
* in smb2inode.c:smb2_compound_op()
*/
-#define SMB2_OP_SET_DELETE 1
-#define SMB2_OP_SET_INFO 2
-#define SMB2_OP_QUERY_INFO 3
-#define SMB2_OP_QUERY_DIR 4
-#define SMB2_OP_MKDIR 5
-#define SMB2_OP_RENAME 6
-#define SMB2_OP_DELETE 7
-#define SMB2_OP_HARDLINK 8
-#define SMB2_OP_SET_EOF 9
-#define SMB2_OP_RMDIR 10
-#define SMB2_OP_POSIX_QUERY_INFO 11
+enum smb2_compound_ops {
+ SMB2_OP_SET_DELETE = 1,
+ SMB2_OP_SET_INFO,
+ SMB2_OP_QUERY_INFO,
+ SMB2_OP_QUERY_DIR,
+ SMB2_OP_MKDIR,
+ SMB2_OP_RENAME,
+ SMB2_OP_DELETE,
+ SMB2_OP_HARDLINK,
+ SMB2_OP_SET_EOF,
+ SMB2_OP_RMDIR,
+ SMB2_OP_POSIX_QUERY_INFO,
+ SMB2_OP_SET_REPARSE,
+ SMB2_OP_GET_REPARSE
+};
/* Used when constructing chained read requests. */
#define CHAINED_REQUEST 1
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index c94940af5d4b..05818cd6d932 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -26,13 +26,63 @@
#include "cached_dir.h"
#include "smb2status.h"
-static void
-free_set_inf_compound(struct smb_rqst *rqst)
+static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov)
{
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ struct reparse_data_buffer *buf;
+ struct smb2_ioctl_rsp *io = iov->iov_base;
+ u32 off, count, len;
+
+ count = le32_to_cpu(io->OutputCount);
+ off = le32_to_cpu(io->OutputOffset);
+ if (check_add_overflow(off, count, &len) || len > iov->iov_len)
+ return ERR_PTR(-EIO);
+
+ buf = (struct reparse_data_buffer *)((u8 *)io + off);
+ len = sizeof(*buf);
+ if (count < len || count < le16_to_cpu(buf->ReparseDataLength) + len)
+ return ERR_PTR(-EIO);
+ return buf;
+}
+
+static inline __u32 file_create_options(struct dentry *dentry)
+{
+ struct cifsInodeInfo *ci;
+
+ if (dentry) {
+ ci = CIFS_I(d_inode(dentry));
+ if (ci->cifsAttrs & ATTR_REPARSE)
+ return OPEN_REPARSE_POINT;
+ }
+ return 0;
+}
+
+/* Parse owner and group from SMB3.1.1 POSIX query info */
+static int parse_posix_sids(struct cifs_open_info_data *data,
+ struct kvec *rsp_iov)
+{
+ struct smb2_query_info_rsp *qi = rsp_iov->iov_base;
+ unsigned int out_len = le32_to_cpu(qi->OutputBufferLength);
+ unsigned int qi_len = sizeof(data->posix_fi);
+ int owner_len, group_len;
+ u8 *sidsbuf, *sidsbuf_end;
+
+ if (out_len <= qi_len)
+ return -EINVAL;
+
+ sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len;
+ sidsbuf_end = sidsbuf + out_len - qi_len;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_owner, sidsbuf, owner_len);
+ group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_group, sidsbuf + owner_len, group_len);
+ return 0;
}
/*
@@ -45,13 +95,15 @@ free_set_inf_compound(struct smb_rqst *rqst)
*/
static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition, __u32 create_options,
- umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
- __u8 **extbuf, size_t *extbuflen,
+ __u32 desired_access, __u32 create_disposition,
+ __u32 create_options, umode_t mode, struct kvec *in_iov,
+ int *cmds, int num_cmds, struct cifsFileInfo *cfile,
struct kvec *out_iov, int *out_buftype)
{
+
+ struct reparse_data_buffer *rbuf;
struct smb2_compound_vars *vars = NULL;
- struct kvec *rsp_iov;
+ struct kvec *rsp_iov, *iov;
struct smb_rqst *rqst;
int rc;
__le16 *utf16_path = NULL;
@@ -59,8 +111,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server;
- int num_rqst = 0;
- int resp_buftype[3];
+ int num_rqst = 0, i;
+ int resp_buftype[MAX_COMPOUND];
struct smb2_query_info_rsp *qi_rsp = NULL;
struct cifs_open_info_data *idata;
int flags = 0;
@@ -68,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int size[2];
void *data[2];
int len;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ num_rqst = 0;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -75,12 +135,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
- server = cifs_pick_channel(ses);
-
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ for (i = 0; i < ARRAY_SIZE(resp_buftype); i++)
+ resp_buftype[i] = CIFS_NO_BUFFER;
/* We already have a handle so we can skip the open */
if (cfile)
@@ -118,242 +177,277 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
rc = 0;
- /* Operation */
- switch (command) {
- case SMB2_OP_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov;
- rqst[num_rqst].rq_nvec = 1;
-
- if (cfile)
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- else {
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ for (i = 0; i < num_cmds; i++) {
+ /* Operation */
+ switch (cmds[i]) {
+ case SMB2_OP_QUERY_INFO:
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile) {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
- full_path);
- break;
- case SMB2_OP_POSIX_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov;
- rqst[num_rqst].rq_nvec = 1;
-
- if (cfile)
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- SMB_FIND_FILE_POSIX_INFO,
- SMB2_O_INFO_FILE, 0,
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_query_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile) {
/* TBD: fix following to allow for longer SIDs */
- sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
- else {
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- SMB_FIND_FILE_POSIX_INFO,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb311_posix_qinfo *) +
+ (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb311_posix_qinfo *) +
+ (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_MKDIR:
- /*
- * Directories are created through parameters in the
- * SMB2_open() call.
- */
- trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
-
- size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
- data[0] = &delete_pending[0];
-
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
- goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_SET_EOF:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_posix_query_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_MKDIR:
+ /*
+ * Directories are created through parameters in the
+ * SMB2_open() call.
+ */
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_RMDIR:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
- size[0] = 8; /* sizeof __le64 */
- data[0] = ptr;
+ size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
+ data[0] = &delete_pending[0];
- if (cfile) {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- data, size);
- } else {
rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_EOF:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = in_iov[i].iov_len;
+ data[0] = in_iov[i].iov_base;
+
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_SET_INFO:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
-
-
- size[0] = sizeof(FILE_BASIC_INFO);
- data[0] = ptr;
-
- if (cfile)
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, current->tgid,
- FILE_BASIC_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- else {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_BASIC_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_INFO:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = in_iov[i].iov_len;
+ data[0] = in_iov[i].iov_base;
+
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
- full_path);
- break;
- case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 2;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_set_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_RENAME:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 2;
- len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+ len = in_iov[i].iov_len;
- vars->rename_info.ReplaceIfExists = 1;
- vars->rename_info.RootDirectory = 0;
- vars->rename_info.FileNameLength = cpu_to_le32(len);
+ vars->rename_info.ReplaceIfExists = 1;
+ vars->rename_info.RootDirectory = 0;
+ vars->rename_info.FileNameLength = cpu_to_le32(len);
- size[0] = sizeof(struct smb2_file_rename_info);
- data[0] = &vars->rename_info;
+ size[0] = sizeof(struct smb2_file_rename_info);
+ data[0] = &vars->rename_info;
- size[1] = len + 2 /* null */;
- data[1] = (__le16 *)ptr;
+ size[1] = len + 2 /* null */;
+ data[1] = in_iov[i].iov_base;
- if (cfile)
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- current->tgid, FILE_RENAME_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- else {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID, COMPOUND_FID,
- current->tgid, FILE_RENAME_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid, FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ current->tgid, FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_HARDLINK:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 2;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_HARDLINK:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 2;
- len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+ len = in_iov[i].iov_len;
- vars->link_info.ReplaceIfExists = 0;
- vars->link_info.RootDirectory = 0;
- vars->link_info.FileNameLength = cpu_to_le32(len);
+ vars->link_info.ReplaceIfExists = 0;
+ vars->link_info.RootDirectory = 0;
+ vars->link_info.FileNameLength = cpu_to_le32(len);
- size[0] = sizeof(struct smb2_file_link_info);
- data[0] = &vars->link_info;
+ size[0] = sizeof(struct smb2_file_link_info);
+ data[0] = &vars->link_info;
- size[1] = len + 2 /* null */;
- data[1] = (__le16 *)ptr;
+ size[1] = len + 2 /* null */;
+ data[1] = in_iov[i].iov_base;
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_LINK_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
- goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- default:
- cifs_dbg(VFS, "Invalid command\n");
- rc = -EINVAL;
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_LINK_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_REPARSE:
+ rqst[num_rqst].rq_iov = vars->io_iov;
+ rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_SET_REPARSE_POINT,
+ in_iov[i].iov_base,
+ in_iov[i].iov_len, 0);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_reparse_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_GET_REPARSE:
+ rqst[num_rqst].rq_iov = vars->io_iov;
+ rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_GET_REPARSE_POINT,
+ NULL, 0, CIFSMaxBufSize);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ default:
+ cifs_dbg(VFS, "Invalid command\n");
+ rc = -EINVAL;
+ }
}
if (rc)
goto finished;
@@ -375,157 +469,191 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
if (cfile) {
+ if (retries)
+ for (i = 1; i < num_rqst - 2; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
&rsp_iov[1]);
- } else
+ } else {
+ if (retries)
+ for (i = 0; i < num_rqst; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst,
rqst, resp_buftype,
rsp_iov);
+ }
- finished:
- SMB2_open_free(&rqst[0]);
+finished:
+ num_rqst = 0;
+ SMB2_open_free(&rqst[num_rqst++]);
if (rc == -EREMCHG) {
pr_warn_once("server share %s deleted\n", tcon->tree_name);
tcon->need_reconnect = true;
}
- switch (command) {
- case SMB2_OP_QUERY_INFO:
- idata = ptr;
- if (rc == 0 && cfile && cfile->symlink_target) {
- idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
- if (!idata->symlink_target)
- rc = -ENOMEM;
- }
- if (rc == 0) {
- qi_rsp = (struct smb2_query_info_rsp *)
- rsp_iov[1].iov_base;
- rc = smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi);
- }
- if (rqst[1].rq_iov)
- SMB2_query_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
- if (rc)
- trace_smb3_query_info_compound_err(xid, ses->Suid,
- tcon->tid, rc);
- else
- trace_smb3_query_info_compound_done(xid, ses->Suid,
- tcon->tid);
- break;
- case SMB2_OP_POSIX_QUERY_INFO:
- idata = ptr;
- if (rc == 0 && cfile && cfile->symlink_target) {
- idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
- if (!idata->symlink_target)
- rc = -ENOMEM;
- }
- if (rc == 0) {
- qi_rsp = (struct smb2_query_info_rsp *)
- rsp_iov[1].iov_base;
- rc = smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
- (char *)&idata->posix_fi);
- }
- if (rc == 0) {
- unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
-
- if (length > sizeof(idata->posix_fi)) {
- char *base = (char *)rsp_iov[1].iov_base +
- le16_to_cpu(qi_rsp->OutputBufferOffset) +
- sizeof(idata->posix_fi);
- *extbuflen = length - sizeof(idata->posix_fi);
- *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
- if (!*extbuf)
+ for (i = 0; i < num_cmds; i++) {
+ switch (cmds[i]) {
+ case SMB2_OP_QUERY_INFO:
+ idata = in_iov[i].iov_base;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
rc = -ENOMEM;
+ }
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[i + 1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[i + 1], sizeof(idata->fi), (char *)&idata->fi);
+ }
+ SMB2_query_info_free(&rqst[num_rqst++]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ idata = in_iov[i].iov_base;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[i + 1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */,
+ (char *)&idata->posix_fi);
+ }
+ if (rc == 0)
+ rc = parse_posix_sids(idata, &rsp_iov[i + 1]);
+
+ SMB2_query_info_free(&rqst[num_rqst++]);
+ if (rc)
+ trace_smb3_posix_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_posix_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ break;
+ case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ break;
+ case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
+ break;
+ case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_INFO:
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_REPARSE:
+ if (rc) {
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ } else {
+ trace_smb3_set_reparse_compound_done(xid, ses->Suid,
+ tcon->tid);
+ }
+ SMB2_ioctl_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_GET_REPARSE:
+ if (!rc) {
+ iov = &rsp_iov[i + 1];
+ idata = in_iov[i].iov_base;
+ idata->reparse.io.iov = *iov;
+ idata->reparse.io.buftype = resp_buftype[i + 1];
+ rbuf = reparse_buf_ptr(iov);
+ if (IS_ERR(rbuf)) {
+ rc = PTR_ERR(rbuf);
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ } else {
+ idata->reparse.tag = le32_to_cpu(rbuf->ReparseTag);
+ trace_smb3_set_reparse_compound_done(xid, ses->Suid,
+ tcon->tid);
+ }
+ memset(iov, 0, sizeof(*iov));
+ resp_buftype[i + 1] = CIFS_NO_BUFFER;
} else {
- rc = -EINVAL;
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
}
+ SMB2_ioctl_free(&rqst[num_rqst++]);
+ break;
}
- if (rqst[1].rq_iov)
- SMB2_query_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
- if (rc)
- trace_smb3_posix_query_info_compound_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_posix_query_info_compound_done(xid, ses->Suid, tcon->tid);
- break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
- if (rqst[1].rq_iov)
- SMB2_close_free(&rqst[1]);
- break;
- case SMB2_OP_MKDIR:
- if (rc)
- trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
- if (rqst[1].rq_iov)
- SMB2_close_free(&rqst[1]);
- break;
- case SMB2_OP_HARDLINK:
- if (rc)
- trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_RENAME:
- if (rc)
- trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_SET_EOF:
- if (rc)
- trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_SET_INFO:
- if (rc)
- trace_smb3_set_info_compound_err(xid, ses->Suid,
- tcon->tid, rc);
- else
- trace_smb3_set_info_compound_done(xid, ses->Suid,
- tcon->tid);
- free_set_inf_compound(rqst);
- break;
}
+ SMB2_close_free(&rqst[num_rqst]);
- if (cfile)
- cifsFileInfo_put(cfile);
-
+ num_cmds += 2;
if (out_iov && out_buftype) {
- memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov));
- memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype));
+ memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov));
+ memcpy(out_buftype, resp_buftype,
+ num_cmds * sizeof(*out_buftype));
} else {
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ for (i = 0; i < num_cmds; i++)
+ free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base);
}
+ num_cmds -= 2; /* correct num_cmds as there could be a retry */
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
return rc;
}
@@ -569,34 +697,57 @@ int smb2_query_path_info(const unsigned int xid,
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
struct smb2_hdr *hdr;
- struct kvec out_iov[3] = {};
+ struct kvec in_iov[2], out_iov[3] = {};
int out_buftype[3] = {};
+ int cmds[2];
bool islink;
+ int i, num_cmds;
int rc, rc2;
data->adjust_tz = false;
data->reparse_point = false;
- if (strcmp(full_path, ""))
- rc = -ENOENT;
- else
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
- /* If it is a root and its handle is cached then use it */
- if (!rc) {
- if (cfid->file_all_info_is_valid) {
- memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
+ /*
+ * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX.
+ * Create SMB2_query_posix_info worker function to do non-compounded
+ * query when we already have an open file handle for this. For now this
+ * is fast enough (always using the compounded version).
+ */
+ if (!tcon->posix_extensions) {
+ if (*full_path) {
+ rc = -ENOENT;
} else {
- rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, &data->fi);
+ rc = open_cached_dir(xid, tcon, full_path,
+ cifs_sb, false, &cfid);
}
- close_cached_dir(cfid);
- return rc;
+ /* If it is a root and its handle is cached then use it */
+ if (!rc) {
+ if (cfid->file_all_info_is_valid) {
+ memcpy(&data->fi, &cfid->file_all_info,
+ sizeof(data->fi));
+ } else {
+ rc = SMB2_query_info(xid, tcon,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
+ &data->fi);
+ }
+ close_cached_dir(cfid);
+ return rc;
+ }
+ cmds[0] = SMB2_OP_QUERY_INFO;
+ } else {
+ cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
}
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ in_iov[1] = in_iov[0];
+
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
- NULL, NULL, out_iov, out_buftype);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, 1, cfile, out_iov, out_buftype);
hdr = out_iov[0].iov_base;
/*
* If first iov is unset, then SMB session was dropped or we've got a
@@ -608,18 +759,27 @@ int smb2_query_path_info(const unsigned int xid,
switch (rc) {
case 0:
case -EOPNOTSUPP:
+ /*
+ * BB TODO: When support for special files added to Samba
+ * re-verify this path.
+ */
rc = parse_create_response(data, cifs_sb, &out_iov[0]);
if (rc || !data->reparse_point)
goto out;
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
+ /* symlink already parsed in create response */
+ num_cmds = 1;
+ } else {
+ cmds[1] = SMB2_OP_GET_REPARSE;
+ num_cmds = 2;
+ }
create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data,
- SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
- NULL, NULL);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, num_cmds, cfile, NULL, NULL);
break;
case -EREMOTE:
break;
@@ -637,93 +797,8 @@ int smb2_query_path_info(const unsigned int xid,
}
out:
- free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
- free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
- free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
- return rc;
-}
-
-int smb311_posix_query_path_info(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group)
-{
- int rc;
- __u32 create_options = 0;
- struct cifsFileInfo *cfile;
- struct kvec out_iov[3] = {};
- int out_buftype[3] = {};
- __u8 *sidsbuf = NULL;
- __u8 *sidsbuf_end = NULL;
- size_t sidsbuflen = 0;
- size_t owner_len, group_len;
-
- data->adjust_tz = false;
- data->reparse_point = false;
-
- /*
- * BB TODO: Add support for using the cached root handle.
- * Create SMB2_query_posix_info worker function to do non-compounded query
- * when we already have an open file handle for this. For now this is fast enough
- * (always using the compounded version).
- */
-
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
- &sidsbuf, &sidsbuflen, out_iov, out_buftype);
- /*
- * If first iov is unset, then SMB session was dropped or we've got a
- * cached open file (@cfile).
- */
- if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER)
- goto out;
-
- switch (rc) {
- case 0:
- case -EOPNOTSUPP:
- /* BB TODO: When support for special files added to Samba re-verify this path */
- rc = parse_create_response(data, cifs_sb, &out_iov[0]);
- if (rc || !data->reparse_point)
- goto out;
-
- create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
- FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_POSIX_QUERY_INFO, cfile,
- &sidsbuf, &sidsbuflen, NULL, NULL);
- break;
- }
-
-out:
- if (rc == 0) {
- sidsbuf_end = sidsbuf + sidsbuflen;
-
- owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
- if (owner_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(owner, sidsbuf, owner_len);
-
- group_len = posix_info_sid_size(
- sidsbuf + owner_len, sidsbuf_end);
- if (group_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(group, sidsbuf + owner_len, group_len);
- }
-
- kfree(sidsbuf);
- free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
- free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
- free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
+ for (i = 0; i < ARRAY_SIZE(out_buftype); i++)
+ free_rsp_buf(out_buftype[i], out_iov[i].iov_base);
return rc;
}
@@ -734,8 +809,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
{
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, mode,
+ NULL, &(int){SMB2_OP_MKDIR}, 1,
+ NULL, NULL, NULL);
}
void
@@ -743,21 +819,24 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
const unsigned int xid)
{
- FILE_BASIC_INFO data;
+ FILE_BASIC_INFO data = {};
struct cifsInodeInfo *cifs_i;
struct cifsFileInfo *cfile;
+ struct kvec in_iov;
u32 dosattrs;
int tmprc;
- memset(&data, 0, sizeof(data));
+ in_iov.iov_base = &data;
+ in_iov.iov_len = sizeof(data);
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -767,9 +846,11 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
- return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL);
+ return smb2_compound_op(xid, tcon, cifs_sb, name,
+ DELETE, FILE_OPEN, CREATE_NOT_FILE,
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_RMDIR}, 1,
+ NULL, NULL, NULL);
}
int
@@ -778,15 +859,18 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL);
}
-static int
-smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb, __u32 access, int command,
- struct cifsFileInfo *cfile)
+static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb,
+ __u32 create_options, __u32 access,
+ int command, struct cifsFileInfo *cfile)
{
+ struct kvec in_iov;
__le16 *smb2_to_name = NULL;
int rc;
@@ -795,36 +879,43 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
rc = -ENOMEM;
goto smb2_rename_path;
}
+ in_iov.iov_base = smb2_to_name;
+ in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile, NULL, NULL, NULL, NULL);
+ FILE_OPEN, create_options, ACL_NO_MODE,
+ &in_iov, &command, 1, cfile, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
}
-int
-smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int smb2_rename_path(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
struct cifsFileInfo *cfile;
+ __u32 co = file_create_options(source_dentry);
drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
- return smb2_set_path_attr(xid, tcon, from_name, to_name,
- cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
+ return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ co, DELETE, SMB2_OP_RENAME, cfile);
}
-int
-smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int smb2_create_hardlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
- return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
- FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK,
- NULL);
+ __u32 co = file_create_options(source_dentry);
+
+ return smb2_set_path_attr(xid, tcon, from_name, to_name,
+ cifs_sb, co, FILE_READ_ATTRIBUTES,
+ SMB2_OP_HARDLINK, NULL);
}
int
@@ -832,13 +923,18 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc)
{
- __le64 eof = cpu_to_le64(size);
struct cifsFileInfo *cfile;
+ struct kvec in_iov;
+ __le64 eof = cpu_to_le64(size);
+ in_iov.iov_base = &eof;
+ in_iov.iov_len = sizeof(eof);
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL);
+ FILE_WRITE_DATA, FILE_OPEN,
+ 0, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_EOF}, 1,
+ cfile, NULL, NULL);
}
int
@@ -849,6 +945,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct cifsFileInfo *cfile;
+ struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), };
int rc;
if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) &&
@@ -864,8 +961,91 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
- NULL, NULL, NULL, NULL);
+ 0, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
+
+struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ struct kvec *iov)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifsFileInfo *cfile;
+ struct inode *new = NULL;
+ struct kvec in_iov[2];
+ int cmds[2];
+ int da, co, cd;
+ int rc;
+
+ da = SYNCHRONIZE | DELETE |
+ FILE_READ_ATTRIBUTES |
+ FILE_WRITE_ATTRIBUTES;
+ co = CREATE_NOT_DIR | OPEN_REPARSE_POINT;
+ cd = FILE_CREATE;
+ cmds[0] = SMB2_OP_SET_REPARSE;
+ in_iov[0] = *iov;
+ in_iov[1].iov_base = data;
+ in_iov[1].iov_len = sizeof(*data);
+
+ if (tcon->posix_extensions) {
+ cmds[1] = SMB2_OP_POSIX_QUERY_INFO;
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ da, cd, co, ACL_NO_MODE, in_iov,
+ cmds, 2, cfile, NULL, NULL);
+ if (!rc) {
+ rc = smb311_posix_get_inode_info(&new, full_path,
+ data, sb, xid);
+ }
+ } else {
+ cmds[1] = SMB2_OP_QUERY_INFO;
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ da, cd, co, ACL_NO_MODE, in_iov,
+ cmds, 2, cfile, NULL, NULL);
+ if (!rc) {
+ rc = cifs_get_inode_info(&new, full_path,
+ data, sb, xid, NULL);
+ }
+ }
+ return rc ? ERR_PTR(rc) : new;
+}
+
+int smb2_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype)
+{
+ struct cifs_open_info_data data = {};
+ struct cifsFileInfo *cfile;
+ struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), };
+ int rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_GET_REPARSE}, 1,
+ cfile, NULL, NULL);
+ if (rc)
+ goto out;
+
+ *tag = data.reparse.tag;
+ *rsp = data.reparse.io.iov;
+ *rsp_buftype = data.reparse.io.buftype;
+ memset(&data.reparse.io.iov, 0, sizeof(data.reparse.io.iov));
+ data.reparse.io.buftype = CIFS_NO_BUFFER;
+out:
+ cifs_free_open_info(&data);
+ return rc;
+}
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 1a90dd78b238..ac1895358908 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
{STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
{STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+ {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"},
+ {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"},
{STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
{STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
{STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 14bc745de199..83c898afc835 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -614,7 +614,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
"multichannel not available\n"
"Empty network interface list returned by server %s\n",
ses->server->hostname);
- rc = -EINVAL;
+ rc = -EOPNOTSUPP;
+ ses->iface_last_update = jiffies;
goto out;
}
@@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
ses->iface_count++;
spin_unlock(&ses->iface_lock);
- ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
next = le32_to_cpu(p->Next);
@@ -734,11 +734,7 @@ next_iface:
if ((bytes_left > 8) || p->Next)
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
- if (!ses->iface_count) {
- rc = -EINVAL;
- goto out;
- }
+ ses->iface_last_update = jiffies;
out:
/*
@@ -1112,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst *rqst;
struct kvec *rsp_iov;
__le16 *utf16_path = NULL;
@@ -1128,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_file_full_ea_info *ea = NULL;
struct smb2_query_info_rsp *rsp;
int rc, used_len = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1248,6 +1251,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
goto sea_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1264,6 +1273,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
#endif
@@ -1488,7 +1502,7 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_rqst *rqst;
struct kvec *rsp_iov;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
char __user *arg = (char __user *)p;
struct smb_query_info qi;
struct smb_query_info __user *pqi;
@@ -1505,6 +1519,13 @@ smb2_ioctl_query_info(const unsigned int xid,
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
void (*free_req1_func)(struct smb_rqst *r);
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1645,6 +1666,12 @@ smb2_ioctl_query_info(const unsigned int xid,
goto free_req_1;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1705,6 +1732,11 @@ free_output_buffer:
kfree(buffer);
free_vars:
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -1935,7 +1967,6 @@ static int
smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
{
- __le64 eof = cpu_to_le64(size);
struct inode *inode;
/*
@@ -1952,7 +1983,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
}
return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, size);
}
static int
@@ -2232,8 +2263,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct smb2_query_directory_rsp *qd_rsp = NULL;
struct smb2_create_rsp *op_rsp = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int retry_count = 0;
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
@@ -2283,14 +2320,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
-again:
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, tcon->ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
- if (rc == -EAGAIN && retry_count++ < 10)
- goto again;
-
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
@@ -2338,6 +2376,11 @@ again:
SMB2_query_directory_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2463,6 +2506,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid,
}
void
+smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ struct smb2_hdr *shdr;
+
+ if (server->dialect < SMB30_PROT_ID)
+ return;
+
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
+ if (shdr == NULL) {
+ cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
+ return;
+ }
+ shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION;
+}
+
+void
smb2_set_related(struct smb_rqst *rqst)
{
struct smb2_hdr *shdr;
@@ -2535,6 +2594,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/*
+ * helper function for exponential backoff and check if replayable
+ */
+bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep)
+{
+ if (!pretries || !pcur_sleep)
+ return false;
+
+ if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) {
+ msleep(*pcur_sleep);
+ (*pcur_sleep) = ((*pcur_sleep) << 1);
+ if ((*pcur_sleep) > CIFS_MAX_SLEEP)
+ (*pcur_sleep) = CIFS_MAX_SLEEP;
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Passes the query info response back to the caller on success.
* Caller need to free this with free_rsp_buf().
*/
@@ -2547,7 +2627,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst *rqst;
int resp_buftype[3];
@@ -2558,6 +2638,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
__le16 *utf16_path;
struct cached_fid *cfid = NULL;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (!path)
path = "";
@@ -2638,6 +2725,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ if (!cfid) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+ smb2_set_replay(server, &rqst[1]);
+ }
+
if (cfid) {
rc = compound_send_recv(xid, ses, server,
flags, 1, &rqst[1],
@@ -2670,6 +2765,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2948,18 +3048,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
bool unicode, struct cifs_open_info_data *data)
{
- if (plen < sizeof(*buf)) {
- cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n",
- __func__, plen);
- return -EIO;
- }
-
- if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) {
- cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n",
- __func__, plen);
- return -EIO;
- }
-
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
@@ -2997,145 +3085,6 @@ static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
return parse_reparse_point(buf, plen, cifs_sb, true, data);
}
-static int smb2_query_reparse_point(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- u32 *tag, struct kvec *rsp,
- int *rsp_buftype)
-{
- struct smb2_compound_vars *vars;
- int rc;
- __le16 *utf16_path = NULL;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- struct cifs_open_parms oparms;
- struct cifs_fid fid;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int flags = CIFS_CP_CREATE_CLOSE_OP;
- struct smb_rqst *rqst;
- int resp_buftype[3];
- struct kvec *rsp_iov;
- struct smb2_ioctl_rsp *ioctl_rsp;
- struct reparse_data_buffer *reparse_buf;
- u32 off, count, len;
-
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- vars = kzalloc(sizeof(*vars), GFP_KERNEL);
- if (!vars) {
- rc = -ENOMEM;
- goto out_free_path;
- }
- rqst = vars->rqst;
- rsp_iov = vars->rsp_iov;
-
- /*
- * setup smb2open - TODO add optimization to call cifs_get_readable_path
- * to see if there is a handle already open that we can use
- */
- rqst[0].rq_iov = vars->open_iov;
- rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .path = full_path,
- .desired_access = FILE_READ_ATTRIBUTES,
- .disposition = FILE_OPEN,
- .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT),
- .fid = &fid,
- };
-
- rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, utf16_path);
- if (rc)
- goto query_rp_exit;
- smb2_set_next_command(tcon, &rqst[0]);
-
-
- /* IOCTL */
- rqst[1].rq_iov = vars->io_iov;
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1], COMPOUND_FID,
- COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
- if (rc)
- goto query_rp_exit;
-
- smb2_set_next_command(tcon, &rqst[1]);
- smb2_set_related(&rqst[1]);
-
- /* Close */
- rqst[2].rq_iov = &vars->close_iov;
- rqst[2].rq_nvec = 1;
-
- rc = SMB2_close_init(tcon, server,
- &rqst[2], COMPOUND_FID, COMPOUND_FID, false);
- if (rc)
- goto query_rp_exit;
-
- smb2_set_related(&rqst[2]);
-
- rc = compound_send_recv(xid, tcon->ses, server,
- flags, 3, rqst,
- resp_buftype, rsp_iov);
-
- ioctl_rsp = rsp_iov[1].iov_base;
-
- /*
- * Open was successful and we got an ioctl response.
- */
- if (rc == 0) {
- /* See MS-FSCC 2.3.23 */
- off = le32_to_cpu(ioctl_rsp->OutputOffset);
- count = le32_to_cpu(ioctl_rsp->OutputCount);
- if (check_add_overflow(off, count, &len) ||
- len > rsp_iov[1].iov_len) {
- cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
- __func__, off, count);
- rc = -EIO;
- goto query_rp_exit;
- }
-
- reparse_buf = (void *)((u8 *)ioctl_rsp + off);
- len = sizeof(*reparse_buf);
- if (count < len ||
- count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) {
- cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
- __func__, off, count);
- rc = -EIO;
- goto query_rp_exit;
- }
- *tag = le32_to_cpu(reparse_buf->ReparseTag);
- *rsp = rsp_iov[1];
- *rsp_buftype = resp_buftype[1];
- resp_buftype[1] = CIFS_NO_BUFFER;
- }
-
- query_rp_exit:
- SMB2_open_free(&rqst[0]);
- SMB2_ioctl_free(&rqst[1]);
- SMB2_close_free(&rqst[2]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- kfree(vars);
-out_free_path:
- kfree(utf16_path);
- return rc;
-}
-
static struct cifs_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
@@ -3336,7 +3285,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
unsigned long long new_size;
long rc;
unsigned int xid;
- __le64 eof;
xid = get_xid();
@@ -3366,11 +3314,13 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
new_size = offset + len;
if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) {
- eof = cpu_to_le64(new_size);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_size);
if (rc >= 0) {
truncate_setsize(inode, new_size);
+ netfs_resize_file(&cifsi->netfs, new_size, true);
+ if (offset < cifsi->netfs.zero_point)
+ cifsi->netfs.zero_point = offset;
fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
}
}
@@ -3561,7 +3511,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
- __le64 eof;
+ loff_t new_eof;
xid = get_xid();
@@ -3590,14 +3540,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)
smb2_set_sparse(xid, tcon, cfile, inode, false);
- eof = cpu_to_le64(off + len);
+ new_eof = off + len;
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc == 0) {
- cifsi->server_eof = off + len;
- cifs_setsize(inode, off + len);
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
+ cifs_setsize(inode, new_eof);
cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, off + len);
+ truncate_setsize(inode, new_eof);
}
goto out;
}
@@ -3686,10 +3636,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
int rc;
unsigned int xid;
struct inode *inode = file_inode(file);
- struct cifsFileInfo *cfile = file->private_data;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- __le64 eof;
- loff_t old_eof;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct netfs_inode *ictx = &cifsi->netfs;
+ loff_t old_eof, new_eof;
xid = get_xid();
@@ -3708,23 +3658,25 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
+ ictx->zero_point = old_eof;
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
old_eof - off - len, off);
if (rc < 0)
goto out_2;
- eof = cpu_to_le64(old_eof - len);
+ new_eof = old_eof - len;
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc < 0)
goto out_2;
rc = 0;
- cifsi->server_eof = i_size_read(inode) - len;
- truncate_setsize(inode, cifsi->server_eof);
- fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+ truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
+ ictx->zero_point = new_eof;
+ fscache_resize_cookie(cifs_inode_cookie(inode), new_eof);
out_2:
filemap_invalidate_unlock(inode->i_mapping);
out:
@@ -3740,8 +3692,8 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
unsigned int xid;
struct cifsFileInfo *cfile = file->private_data;
struct inode *inode = file_inode(file);
- __le64 eof;
- __u64 count, old_eof;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ __u64 count, old_eof, new_eof;
xid = get_xid();
@@ -3754,20 +3706,21 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
}
count = old_eof - off;
- eof = cpu_to_le64(old_eof + len);
+ new_eof = old_eof + len;
filemap_invalidate_lock(inode->i_mapping);
- rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, new_eof - 1);
if (rc < 0)
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc < 0)
goto out_2;
- truncate_setsize(inode, old_eof + len);
+ truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, i_size_read(inode), true);
fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
@@ -5171,11 +5124,154 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
return rc;
}
+static inline u64 mode_nfs_type(mode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFBLK: return NFS_SPECFILE_BLK;
+ case S_IFCHR: return NFS_SPECFILE_CHR;
+ case S_IFIFO: return NFS_SPECFILE_FIFO;
+ case S_IFSOCK: return NFS_SPECFILE_SOCK;
+ }
+ return 0;
+}
+
+static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+ mode_t mode, dev_t dev,
+ struct kvec *iov)
+{
+ u64 type;
+ u16 len, dlen;
+
+ len = sizeof(*buf);
+
+ switch ((type = mode_nfs_type(mode))) {
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_CHR:
+ dlen = sizeof(__le64);
+ break;
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ dlen = 0;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
+ buf->Reserved = 0;
+ buf->InodeType = cpu_to_le64(type);
+ buf->ReparseDataLength = cpu_to_le16(len + dlen -
+ sizeof(struct reparse_data_buffer));
+ *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
+ MINOR(dev));
+ iov->iov_base = buf;
+ iov->iov_len = len + dlen;
+ return 0;
+}
+
+static int nfs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_open_info_data data;
+ struct reparse_posix_data *p;
+ struct inode *new;
+ struct kvec iov;
+ __u8 buf[sizeof(*p) + sizeof(__le64)];
+ int rc;
+
+ p = (struct reparse_posix_data *)buf;
+ rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ if (rc)
+ return rc;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+ };
+
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ return rc;
+}
+
+static int smb2_create_reparse_symlink(const unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ const char *symname)
+{
+ struct reparse_symlink_data_buffer *buf = NULL;
+ struct cifs_open_info_data data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *new;
+ struct kvec iov;
+ __le16 *path;
+ char *sym;
+ u16 len, plen;
+ int rc = 0;
+
+ sym = kstrdup(symname, GFP_KERNEL);
+ if (!sym)
+ return -ENOMEM;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
+ .symlink_target = sym,
+ };
+
+ path = cifs_convert_path_to_utf16(symname, cifs_sb);
+ if (!path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
+ len = sizeof(*buf) + plen * 2;
+ buf = kzalloc(len, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
+ buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+ buf->SubstituteNameOffset = cpu_to_le16(plen);
+ buf->SubstituteNameLength = cpu_to_le16(plen);
+ memcpy(&buf->PathBuffer[plen], path, plen);
+ buf->PrintNameOffset = 0;
+ buf->PrintNameLength = cpu_to_le16(plen);
+ memcpy(buf->PathBuffer, path, plen);
+ buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+
+ iov.iov_base = buf;
+ iov.iov_len = len;
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+out:
+ kfree(path);
+ cifs_free_open_info(&data);
+ kfree(buf);
+ return rc;
+}
+
static int smb2_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc;
/*
* Check if mounted with mount parm 'sfu' mount parm.
@@ -5183,15 +5279,14 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
* supports block and char device (no socket & fifo),
* and was used by default in earlier versions of Windows
*/
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return -EPERM;
- /*
- * TODO: Add ability to create instead via reparse point. Windows (e.g.
- * their current NFS server) uses this approach to expose special files
- * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
- */
- return cifs_sfu_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else {
+ rc = nfs_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ }
+ return rc;
}
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
@@ -5247,6 +5342,7 @@ struct smb_version_operations smb20_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5349,6 +5445,7 @@ struct smb_version_operations smb21_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5454,6 +5551,7 @@ struct smb_version_operations smb30_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5568,6 +5666,7 @@ struct smb_version_operations smb311_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4f971c1061f0..c58fa44dd6b0 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -156,6 +156,56 @@ out:
return;
}
+/* helper function for code reuse */
+static int
+cifs_chan_skip_or_disable(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ bool from_reconnect)
+{
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
+
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS,
+ "server %s does not support multichannel anymore. Skip secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ server->terminate = true;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+skip_terminate:
+ return -EHOSTDOWN;
+ }
+
+ cifs_server_dbg(VFS,
+ "server does not support multichannel anymore. Disable all other channels\n");
+ cifs_disable_secondary_channels(ses);
+
+
+ return 0;
+}
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server, bool from_reconnect)
@@ -164,8 +214,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
int xid;
- struct TCP_Server_Info *pserver;
- unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -310,44 +358,11 @@ again:
*/
if (ses->chan_count > 1 &&
!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- if (SERVER_IS_CHAN(server)) {
- cifs_dbg(VFS, "server %s does not support " \
- "multichannel anymore. skipping secondary channel\n",
- ses->server->hostname);
-
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- goto skip_terminate;
- }
-
- ses->chans[chan_index].server = NULL;
- spin_unlock(&ses->chan_lock);
-
- /*
- * the above reference of server by channel
- * needs to be dropped without holding chan_lock
- * as cifs_put_tcp_session takes a higher lock
- * i.e. cifs_tcp_ses_lock
- */
- cifs_put_tcp_session(server, from_reconnect);
-
- server->terminate = true;
- cifs_signal_cifsd_for_reconnect(server, false);
-
- /* mark primary server as needing reconnect */
- pserver = server->primary_server;
- cifs_signal_cifsd_for_reconnect(pserver, false);
-
-skip_terminate:
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
goto out;
- } else {
- cifs_server_dbg(VFS, "does not support " \
- "multichannel anymore. disabling all other channels\n");
- cifs_disable_secondary_channels(ses);
}
}
@@ -384,6 +399,15 @@ skip_sess_setup:
goto out;
}
+ spin_lock(&ses->ses_lock);
+ if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
+ spin_unlock(&ses->ses_lock);
+ mutex_unlock(&ses->session_mutex);
+ goto skip_add_channels;
+ }
+ ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
mutex_unlock(&ses->session_mutex);
@@ -395,14 +419,29 @@ skip_sess_setup:
rc = SMB3_request_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc)
+ if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
+ /*
+ * some servers like Azure SMB server do not advertise
+ * that multichannel has been disabled with server
+ * capabilities, rather return STATUS_NOT_IMPLEMENTED.
+ * treat this as server not supporting multichannel
+ */
+
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ goto skip_add_channels;
+ } else if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
if (ses->chan_max > ses->chan_count &&
+ ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1)
+ if (ses->chan_count == 1) {
cifs_server_dbg(VFS, "supports multichannel now\n");
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ }
cifs_try_adding_channels(ses);
}
@@ -410,6 +449,11 @@ skip_sess_setup:
mutex_unlock(&ses->session_mutex);
}
+skip_add_channels:
+ spin_lock(&ses->ses_lock);
+ ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -1958,10 +2002,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
__le16 *unc_path = NULL;
int flags = 0;
unsigned int total_len;
- struct TCP_Server_Info *server;
-
- /* always use master channel */
- server = ses->server;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
cifs_dbg(FYI, "TCON\n");
@@ -2094,6 +2135,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
struct smb2_tree_disconnect_req *req; /* response is trivial */
int rc = 0;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
int flags = 0;
unsigned int total_len;
struct kvec iov[1];
@@ -2116,7 +2158,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
invalidate_all_cached_dirs(tcon);
- rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
+ rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server,
(void **) &req,
&total_len);
if (rc)
@@ -2134,7 +2176,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rc = cifs_send_recv(xid, ses, ses->server,
+ rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
@@ -2279,7 +2321,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server,
noff = le16_to_cpu(cc->NameOffset);
nlen = le16_to_cpu(cc->NameLength);
- if (noff + nlen >= doff)
+ if (noff + nlen > doff)
return -EINVAL;
name = (char *)cc + noff;
@@ -2736,7 +2778,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
int flags = 0;
unsigned int total_len;
__le16 *utf16_path = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ n_iov = 2;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "mkdir\n");
@@ -2840,6 +2889,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
/* no need to inc num_remote_opens because we close it just below */
trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -2877,6 +2930,11 @@ err_free_req:
cifs_small_buf_release(req);
err_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3072,12 +3130,18 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct smb2_create_rsp *rsp = NULL;
struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[SMB2_CREATE_IOV_SIZE];
struct kvec rsp_iov = {NULL, 0};
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "create/open\n");
if (!ses || !server)
@@ -3099,6 +3163,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path,
oparms->create_options, oparms->desired_access);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3152,6 +3219,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3276,15 +3348,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
+ int retries = 0, cur_sleep = 1;
if (!tcon)
return -EIO;
@@ -3293,10 +3357,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (!ses)
return -EIO;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3311,6 +3388,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
goto ioctl_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3380,6 +3460,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
ioctl_exit:
SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3451,13 +3536,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_rqst rqst;
struct smb2_close_rsp *rsp = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
bool query_attrs = false;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ query_attrs = false;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "Close\n");
@@ -3483,6 +3575,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto close_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
@@ -3516,6 +3611,11 @@ close_exit:
cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
persistent_fid, tmp_rc);
}
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3646,12 +3746,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct TCP_Server_Info *server;
int flags = 0;
bool allocated = false;
+ int retries = 0, cur_sleep = 1;
cifs_dbg(FYI, "Query Info\n");
if (!ses)
return -EIO;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ allocated = false;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
@@ -3673,6 +3780,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
ses->Suid, info_class, (__u32)info_type);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -3715,6 +3825,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
qinf_exit:
SMB2_query_info_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3815,7 +3930,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u32 *plen /* returned data len */)
{
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst rqst;
struct smb2_change_notify_rsp *smb_rsp;
struct kvec iov[1];
@@ -3823,6 +3938,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "change notify\n");
if (!ses || !server)
@@ -3847,6 +3968,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid,
(u8)watch_tree, completion_filter);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -3881,6 +4006,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
if (rqst.rq_iov)
cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3918,7 +4048,7 @@ void smb2_reconnect_server(struct work_struct *work)
struct cifs_ses *ses, *ses2;
struct cifs_tcon *tcon, *tcon2;
struct list_head tmp_list, tmp_ses_list;
- bool tcon_exist = false, ses_exist = false;
+ bool ses_exist = false;
bool tcon_selected = false;
int rc;
bool resched = false;
@@ -3964,7 +4094,7 @@ void smb2_reconnect_server(struct work_struct *work)
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
list_add_tail(&tcon->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
}
}
/*
@@ -3973,7 +4103,7 @@ void smb2_reconnect_server(struct work_struct *work)
*/
if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
cifs_smb_ses_inc_refcount(ses);
}
/*
@@ -4123,10 +4253,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
struct smb_rqst rqst;
struct kvec iov[1];
struct kvec rsp_iov = {NULL, 0};
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "flush\n");
if (!ses || !(ses->server))
@@ -4146,6 +4282,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto flush_exit;
trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -4160,6 +4300,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
flush_exit:
SMB2_flush_free(&rqst);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -4639,7 +4784,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server)
+ if (!wdata->server || wdata->replay)
server = wdata->server = cifs_pick_channel(tcon->ses);
/*
@@ -4724,6 +4869,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_nvec = 1;
rqst.rq_iter = wdata->iter;
rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
+ if (wdata->replay)
+ smb2_set_replay(server, &rqst);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr)
iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
@@ -4797,18 +4944,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
int flags = 0;
unsigned int total_len;
struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
*nbytes = 0;
-
- if (n_vec < 1)
- return rc;
-
if (!io_parms->server)
io_parms->server = cifs_pick_channel(io_parms->tcon->ses);
server = io_parms->server;
if (server == NULL)
return -ECONNABORTED;
+ if (n_vec < 1)
+ return rc;
+
rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -4842,6 +4992,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rqst.rq_iov = iov;
rqst.rq_nvec = n_vec + 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, io_parms->tcon->ses, server,
&rqst,
&resp_buftype, flags, &rsp_iov);
@@ -4866,6 +5019,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_small_buf_release(req);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(io_parms->tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5177,8 +5335,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int rc = 0;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !(ses->server))
return -EIO;
@@ -5198,6 +5362,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qdir_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -5232,6 +5399,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
qdir_exit:
SMB2_query_directory_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5298,8 +5470,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !server)
return -EIO;
@@ -5327,6 +5505,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+ if (retries)
+ smb2_set_replay(server, &rqst);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
@@ -5342,23 +5522,28 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
int
SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 pid, __le64 *eof)
+ u64 volatile_fid, u32 pid, loff_t new_eof)
{
struct smb2_file_eof_info info;
void *data;
unsigned int size;
- info.EndOfFile = *eof;
+ info.EndOfFile = cpu_to_le64(new_eof);
data = &info;
size = sizeof(struct smb2_file_eof_info);
- trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+ trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, new_eof);
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
@@ -5394,12 +5579,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_oplock_break *req = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_OBREAK_OP;
unsigned int total_len;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buf_type;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_OBREAK_OP;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "SMB2_oplock_break\n");
rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server,
@@ -5424,15 +5615,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
-
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5518,9 +5715,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
FILE_SYSTEM_POSIX_INFO *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_POSIX_INFORMATION,
@@ -5536,6 +5739,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5555,6 +5761,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
posix_qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5569,9 +5780,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_FULL_SIZE_INFORMATION,
@@ -5587,6 +5804,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5606,6 +5826,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5620,9 +5845,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
unsigned int rsp_len, offset;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (level == FS_DEVICE_INFORMATION) {
max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
@@ -5654,6 +5885,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5691,6 +5925,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
qfsattr_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5708,7 +5947,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int count;
int flags = CIFS_NO_RSP_BUF;
unsigned int total_len;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_NO_RSP_BUF;
+ server = cifs_pick_channel(tcon->ses);
cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
@@ -5739,6 +5984,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, tcon->ses, server,
&rqst, &resp_buf_type, flags,
&rsp_iov);
@@ -5750,6 +5998,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
tcon->ses->Suid, rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 0e371f7e2854..b3069911e9dd 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -56,6 +56,18 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
+struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ struct kvec *iov);
+int smb2_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype);
int smb2_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
@@ -80,12 +92,16 @@ extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
-extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
-extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int smb2_rename_path(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+int smb2_create_hardlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const unsigned char *path,
char *pbuf, unsigned int *pbytes_written);
@@ -106,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
extern void smb2_set_next_command(struct cifs_tcon *tcon,
struct smb_rqst *rqst);
extern void smb2_set_related(struct smb_rqst *rqst);
+extern void smb2_set_replay(struct TCP_Server_Info *server,
+ struct smb_rqst *rqst);
+extern bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep);
/*
* SMB2 Worker functions - most of protocol specific implementation details
@@ -205,7 +226,7 @@ extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon,
extern void SMB2_query_directory_free(struct smb_rqst *rqst);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
- __le64 *eof);
+ loff_t new_eof);
extern int SMB2_set_info_init(struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
struct smb_rqst *rqst,
@@ -283,10 +304,9 @@ int smb311_posix_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group);
+ struct cifs_open_info_data *data);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
+
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h
index a9e958166fc5..9c6d79b0bd49 100644
--- a/fs/smb/client/smb2status.h
+++ b/fs/smb/client/smb2status.h
@@ -982,6 +982,8 @@ struct ntstatus {
#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466)
+#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467)
#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 94df9eec3d8d..d74e829de51c 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -2136,7 +2136,7 @@ static int allocate_mr_list(struct smbd_connection *info)
for (i = 0; i < info->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
- goto out;
+ goto cleanup_entries;
smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
@@ -2162,7 +2162,7 @@ static int allocate_mr_list(struct smbd_connection *info)
out:
kfree(smbdirect_mr);
-
+cleanup_entries:
list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c
index f0ce26414f17..1d1ee9f18f37 100644
--- a/fs/smb/client/smbencrypt.c
+++ b/fs/smb/client/smbencrypt.c
@@ -26,13 +26,6 @@
#include "cifsproto.h"
#include "../common/md4.h"
-#ifndef false
-#define false 0
-#endif
-#ifndef true
-#define true 1
-#endif
-
/* following came from the other byteorder.h to avoid include conflicts */
#define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index de199ec9f726..522fa387fcfd 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -370,11 +370,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
-
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
TP_PROTO(unsigned int xid,
__u32 tid,
@@ -408,6 +409,8 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
@@ -451,6 +454,8 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 4f717ad7c21b..994d70193432 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -400,10 +400,17 @@ unmask:
server->conn_id, server->hostname);
}
smbd_done:
- if (rc < 0 && rc != -EINTR)
+ /*
+ * there's hardly any use for the layers above to know the
+ * actual error code here. All they should do at this point is
+ * to retry the connection and hope it goes away.
+ */
+ if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else if (rc > 0)
+ rc = -ECONNABORTED;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ } else if (rc > 0)
rc = 0;
out:
cifs_in_send_dec(server);
@@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (!(flags & CIFS_TRANSFORM_REQ))
return __smb_send_rqst(server, num_rqst, rqst);
- if (num_rqst > MAX_COMPOUND - 1)
- return -ENOMEM;
+ if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
+ return -EIO;
if (!server->ops->init_transform_rq) {
cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");
@@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!server || server->terminate)
continue;
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+ continue;
+
/*
* strictly speaking, we should pick up req_lock to read
* server->in_flight. But it shouldn't matter much here if we
diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c
index 4a4b2b03ff33..b931a99ab9c8 100644
--- a/fs/smb/server/asn1.c
+++ b/fs/smb/server/asn1.c
@@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
{
struct ksmbd_conn *conn = context;
+ if (!vlen)
+ return -EINVAL;
+
conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL);
if (!conn->mechToken)
return -ENOMEM;
+ conn->mechTokenLen = (unsigned int)vlen;
+
return 0;
}
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 229a6527870d..09b20039636e 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -208,10 +208,12 @@ out:
/**
* ksmbd_auth_ntlmv2() - NTLMv2 authentication handler
- * @sess: session of connection
+ * @conn: connection
+ * @sess: session of connection
* @ntlmv2: NTLMv2 challenge response
* @blen: NTLMv2 blob length
* @domain_name: domain name
+ * @cryptkey: session crypto key
*
* Return: 0 on success, error number on error
*/
@@ -294,7 +296,8 @@ out:
* ksmbd_decode_ntlmssp_auth_blob() - helper function to construct
* authenticate blob
* @authblob: authenticate blob source pointer
- * @usr: user details
+ * @blob_len: length of the @authblob message
+ * @conn: connection
* @sess: session of connection
*
* Return: 0 on success, error number on error
@@ -376,8 +379,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
* ksmbd_decode_ntlmssp_neg_blob() - helper function to construct
* negotiate blob
* @negblob: negotiate blob source pointer
- * @rsp: response header pointer to be updated
- * @sess: session of connection
+ * @blob_len: length of the @authblob message
+ * @conn: connection
*
*/
int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
@@ -403,8 +406,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
* ksmbd_build_ntlmssp_challenge_blob() - helper function to construct
* challenge blob
* @chgblob: challenge blob source pointer to initialize
- * @rsp: response header pointer to be updated
- * @sess: session of connection
+ * @conn: connection
*
*/
unsigned int
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index b6fa1e285c40..09e1e7771592 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -284,6 +284,7 @@ int ksmbd_conn_handler_loop(void *p)
goto out;
conn->last_active = jiffies;
+ set_freezable();
while (ksmbd_conn_alive(conn)) {
if (try_to_freeze())
continue;
@@ -415,13 +416,7 @@ static void stop_sessions(void)
again:
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list) {
- struct task_struct *task;
-
t = conn->transport;
- task = t->handler;
- if (task)
- ksmbd_debug(CONN, "Stop session handler %s/%d\n",
- task->comm, task_pid_nr(task));
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
up_read(&conn_list_lock);
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 3c005246a32e..0e04cf8b1d89 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -88,6 +88,7 @@ struct ksmbd_conn {
__u16 dialect;
char *mechToken;
+ unsigned int mechTokenLen;
struct ksmbd_conn_ops *conn_ops;
@@ -134,7 +135,6 @@ struct ksmbd_transport_ops {
struct ksmbd_transport {
struct ksmbd_conn *conn;
struct ksmbd_transport_ops *ops;
- struct task_struct *handler;
};
#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index b7521e41402e..0ebf91ffa236 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -304,7 +304,8 @@ enum ksmbd_event {
KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15,
- KSMBD_EVENT_MAX
+ __KSMBD_EVENT_MAX,
+ KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1
};
/*
diff --git a/fs/smb/server/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c
index 54194d959a5e..a18e27e9e0cd 100644
--- a/fs/smb/server/mgmt/ksmbd_ida.c
+++ b/fs/smb/server/mgmt/ksmbd_ida.c
@@ -5,42 +5,33 @@
#include "ksmbd_ida.h"
-static inline int __acquire_id(struct ida *ida, int from, int to)
-{
- return ida_simple_get(ida, from, to, GFP_KERNEL);
-}
-
int ksmbd_acquire_smb2_tid(struct ida *ida)
{
- int id;
-
- id = __acquire_id(ida, 1, 0xFFFFFFFF);
-
- return id;
+ return ida_alloc_range(ida, 1, 0xFFFFFFFE, GFP_KERNEL);
}
int ksmbd_acquire_smb2_uid(struct ida *ida)
{
int id;
- id = __acquire_id(ida, 1, 0);
+ id = ida_alloc_min(ida, 1, GFP_KERNEL);
if (id == 0xFFFE)
- id = __acquire_id(ida, 1, 0);
+ id = ida_alloc_min(ida, 1, GFP_KERNEL);
return id;
}
int ksmbd_acquire_async_msg_id(struct ida *ida)
{
- return __acquire_id(ida, 1, 0);
+ return ida_alloc_min(ida, 1, GFP_KERNEL);
}
int ksmbd_acquire_id(struct ida *ida)
{
- return __acquire_id(ida, 0, 0);
+ return ida_alloc(ida, GFP_KERNEL);
}
void ksmbd_release_id(struct ida *ida, int id)
{
- ida_simple_remove(ida, id);
+ ida_free(ida, id);
}
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 562b180459a1..53dfaac425c6 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -105,7 +105,7 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx)
lease->is_dir = lctx->is_dir;
memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE);
lease->version = lctx->version;
- lease->epoch = le16_to_cpu(lctx->epoch);
+ lease->epoch = le16_to_cpu(lctx->epoch) + 1;
INIT_LIST_HEAD(&opinfo->lease_entry);
opinfo->o_lease = lease;
@@ -546,6 +546,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
atomic_read(&ci->sop_count)) == 1) {
if (lease->state != SMB2_LEASE_NONE_LE &&
lease->state == (lctx->req_state & lease->state)) {
+ lease->epoch++;
lease->state |= lctx->req_state;
if (lctx->req_state &
SMB2_LEASE_WRITE_CACHING_LE)
@@ -556,13 +557,17 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
atomic_read(&ci->sop_count)) > 1) {
if (lctx->req_state ==
(SMB2_LEASE_READ_CACHING_LE |
- SMB2_LEASE_HANDLE_CACHING_LE))
+ SMB2_LEASE_HANDLE_CACHING_LE)) {
+ lease->epoch++;
lease->state = lctx->req_state;
+ }
}
if (lctx->req_state && lease->state ==
- SMB2_LEASE_NONE_LE)
+ SMB2_LEASE_NONE_LE) {
+ lease->epoch++;
lease_none_upgrade(opinfo, lctx->req_state);
+ }
}
read_lock(&ci->m_lock);
}
@@ -1035,7 +1040,8 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2)
SMB2_LEASE_KEY_SIZE);
lease2->duration = lease1->duration;
lease2->flags = lease1->flags;
- lease2->epoch = lease1->epoch++;
+ lease2->epoch = lease1->epoch;
+ lease2->version = lease1->version;
}
static int add_lease_global_list(struct oplock_info *opinfo)
@@ -1191,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
+ /* Only v2 leases handle the directory */
+ if (S_ISDIR(file_inode(fp->filp)->i_mode)) {
+ if (!lctx || lctx->version != 2)
+ return 0;
+ }
+
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
@@ -1447,7 +1459,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
memcpy(buf->lcontext.LeaseKey, lease->lease_key,
SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
- buf->lcontext.Epoch = cpu_to_le16(++lease->epoch);
+ buf->lcontext.Epoch = cpu_to_le16(lease->epoch);
buf->lcontext.LeaseState = lease->state;
memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
SMB2_LEASE_KEY_SIZE);
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 652ab429bf2e..ba7a72a6a4f4 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
char *name;
unsigned int name_off, name_len, secbuf_len;
- secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ secbuf_len = conn->mechTokenLen;
+ else
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
if (secbuf_len < sizeof(struct authenticate_message)) {
ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
return NULL;
@@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work,
struct authenticate_message *authblob;
authblob = user_authblob(conn, req);
- sz = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ sz = conn->mechTokenLen;
+ else
+ sz = le16_to_cpu(req->SecurityBufferLength);
rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess);
if (rc) {
set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
@@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off = le16_to_cpu(req->SecurityBufferOffset);
negblob_len = le16_to_cpu(req->SecurityBufferLength);
- if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) ||
- negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) {
rc = -EINVAL;
goto out_err;
}
@@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off);
if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
- if (conn->mechToken)
+ if (conn->mechToken) {
negblob = (struct negotiate_message *)conn->mechToken;
+ negblob_len = conn->mechTokenLen;
+ }
+ }
+
+ if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ rc = -EINVAL;
+ goto out_err;
}
if (server_conf.auth_mechs & conn->auth_mechs) {
@@ -2311,11 +2323,12 @@ out:
* @eabuf: set info command buffer
* @buf_len: set info command buffer length
* @path: dentry path for get ea
+ * @get_write: get write access to a mount
*
* Return: 0 on success, otherwise error
*/
static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
- const struct path *path)
+ const struct path *path, bool get_write)
{
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
char *attr_name = NULL, *value;
@@ -2971,7 +2984,7 @@ int smb2_open(struct ksmbd_work *work)
&may_flags);
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- if (open_flags & O_CREAT) {
+ if (open_flags & (O_CREAT | O_TRUNC)) {
ksmbd_debug(SMB,
"User does not have write permission\n");
rc = -EACCES;
@@ -3003,7 +3016,7 @@ int smb2_open(struct ksmbd_work *work)
rc = smb2_set_ea(&ea_buf->ea,
le32_to_cpu(ea_buf->ccontext.DataLength),
- &path);
+ &path, false);
if (rc == -EOPNOTSUPP)
rc = 0;
else if (rc)
@@ -5568,6 +5581,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (!file_info->ReplaceIfExists)
flags = RENAME_NOREPLACE;
+ smb_break_all_levII_oplock(work, fp, 0);
rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
out:
kfree(new_name);
@@ -5943,12 +5957,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
}
case FILE_RENAME_INFORMATION:
{
- if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- ksmbd_debug(SMB,
- "User does not have write permission\n");
- return -EACCES;
- }
-
if (buf_len < sizeof(struct smb2_file_rename_info))
return -EINVAL;
@@ -5968,12 +5976,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
}
case FILE_DISPOSITION_INFORMATION:
{
- if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- ksmbd_debug(SMB,
- "User does not have write permission\n");
- return -EACCES;
- }
-
if (buf_len < sizeof(struct smb2_file_disposition_info))
return -EINVAL;
@@ -5992,7 +5994,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
- buf_len, &fp->filp->f_path);
+ buf_len, &fp->filp->f_path, true);
}
case FILE_POSITION_INFORMATION:
{
@@ -6035,7 +6037,7 @@ int smb2_set_info(struct ksmbd_work *work)
{
struct smb2_set_info_req *req;
struct smb2_set_info_rsp *rsp;
- struct ksmbd_file *fp;
+ struct ksmbd_file *fp = NULL;
int rc = 0;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
@@ -6055,6 +6057,13 @@ int smb2_set_info(struct ksmbd_work *work)
rsp = smb2_get_msg(work->response_buf);
}
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB, "User does not have write permission\n");
+ pr_err("User does not have write permission\n");
+ rc = -EACCES;
+ goto err_out;
+ }
+
if (!has_file_id(id)) {
id = req->VolatileFileId;
pid = req->PersistentFileId;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 6691ae68af0c..7c98bf699772 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -158,8 +158,12 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
*/
bool ksmbd_smb_request(struct ksmbd_conn *conn)
{
- __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf);
+ __le32 *proto;
+ if (conn->request_buf[0] != 0)
+ return false;
+
+ proto = (__le32 *)smb2_get_msg(conn->request_buf);
if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) {
pr_err_ratelimited("smb2 compression not support yet");
return false;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 1164365533f0..1c9775f1efa5 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -401,10 +401,6 @@ static void parse_dacl(struct mnt_idmap *idmap,
if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
return;
- ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
- if (!ppace)
- return;
-
ret = init_acl_state(&acl_state, num_aces);
if (ret)
return;
@@ -414,6 +410,13 @@ static void parse_dacl(struct mnt_idmap *idmap,
return;
}
+ ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
+ if (!ppace) {
+ free_acl_state(&default_acl_state);
+ free_acl_state(&acl_state);
+ return;
+ }
+
/*
* reset rwx permissions for user/group/other.
* Also, if num_aces is 0 i.e. DACL has no ACEs,
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index b49d47bdafc9..f29bb03f0dc4 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info)
static int handle_generic_event(struct sk_buff *skb, struct genl_info *info);
static int ksmbd_ipc_heartbeat_request(void);
-static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = {
+static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = {
[KSMBD_EVENT_UNSPEC] = {
.len = 0,
},
@@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info)
return -EPERM;
#endif
- if (type >= KSMBD_EVENT_MAX) {
+ if (type > KSMBD_EVENT_MAX) {
WARN_ON(1);
return -EINVAL;
}
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c5629a68c8b7..8faa25c6e129 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
{
struct smb_direct_transport *t;
+ struct task_struct *handler;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
if (ret)
goto out_err;
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- smb_direct_port);
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
- ret = PTR_ERR(KSMBD_TRANS(t)->handler);
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ smb_direct_port);
+ if (IS_ERR(handler)) {
+ ret = PTR_ERR(handler);
pr_err("Can't start thread\n");
goto out_err;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index eff7a1d793f0..002a3f0dc7c5 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
+ struct task_struct *handler;
t = alloc_transport(client_sk);
if (!t) {
@@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
goto out_error;
}
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn,
+ "ksmbd:%u",
+ ksmbd_tcp_get_port(csin));
+ if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
- rc = PTR_ERR(KSMBD_TRANS(t)->handler);
+ rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
@@ -364,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
* @t: TCP transport instance
* @buf: buffer to store read data from socket
* @to_read: number of bytes to read from socket
+ * @max_retries: number of retries if reading from socket fails
*
* Return: on success return number of bytes read from socket,
* otherwise return error number
@@ -415,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket)
/**
* create_socket - create socket for ksmbd/0
+ * @iface: interface to bind the created socket to
*
* Return: 0 on success, error number otherwise
*/
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 4277750a6da1..a6961bfe3e13 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -49,6 +49,10 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
/**
* ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
+ * @parent: parent dentry
+ * @child: child dentry
+ *
+ * Returns: %0 on success, %-ENOENT if the parent dentry is not stable
*/
int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
{
@@ -360,7 +364,7 @@ out:
/**
* ksmbd_vfs_read() - vfs helper for smb file read
* @work: smb work
- * @fid: file id of open file
+ * @fp: ksmbd file pointer
* @count: read byte count
* @pos: file pos
* @rbuf: read data buffer
@@ -474,7 +478,7 @@ out:
/**
* ksmbd_vfs_write() - vfs helper for smb file write
* @work: work
- * @fid: file id of open file
+ * @fp: ksmbd file pointer
* @buf: buf containing data for writing
* @count: read byte count
* @pos: file pos
@@ -545,10 +549,8 @@ out:
/**
* ksmbd_vfs_getattr() - vfs helper for smb getattr
- * @work: work
- * @fid: file id of open file
- * @attrs: inode attributes
- *
+ * @path: path of dentry
+ * @stat: pointer to returned kernel stat structure
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
@@ -565,6 +567,7 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
* ksmbd_vfs_fsync() - vfs helper for smb fsync
* @work: work
* @fid: file id of open file
+ * @p_id: persistent file id
*
* Return: 0 on success, otherwise error
*/
@@ -587,7 +590,8 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
/**
* ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink
- * @name: directory or file name that is relative to share
+ * @work: work
+ * @path: path of dentry
*
* Return: 0 on success, otherwise error
*/
@@ -623,6 +627,7 @@ out_err:
/**
* ksmbd_vfs_link() - vfs helper for creating smb hardlink
+ * @work: work
* @oldname: source file name
* @newname: hardlink name that is relative to share
*
@@ -715,6 +720,10 @@ retry:
goto out2;
trap = lock_rename_child(old_child, new_path.dentry);
+ if (IS_ERR(trap)) {
+ err = PTR_ERR(trap);
+ goto out_drop_write;
+ }
old_parent = dget(old_child->d_parent);
if (d_unhashed(old_child)) {
@@ -777,6 +786,7 @@ out4:
out3:
dput(old_parent);
unlock_rename(old_parent, new_path.dentry);
+out_drop_write:
mnt_drop_write(old_path->mnt);
out2:
path_put(&new_path);
@@ -795,7 +805,7 @@ revert_fsids:
/**
* ksmbd_vfs_truncate() - vfs helper for smb file truncate
* @work: work
- * @fid: file id of old file
+ * @fp: ksmbd file pointer
* @size: truncate to given size
*
* Return: 0 on success, otherwise error
@@ -838,7 +848,6 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
* ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes
* @dentry: dentry of file for listing xattrs
* @list: destination buffer
- * @size: destination buffer length
*
* Return: xattr list length on success, otherwise error
*/
@@ -947,7 +956,7 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
/**
* ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options
* @filp: file pointer for IO
- * @options: smb IO options
+ * @option: smb IO options
*/
void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option)
{
@@ -1159,6 +1168,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name,
* @dir: path info
* @name: filename to lookup
* @namelen: filename length
+ * @um: &struct unicode_map to use
*
* Return: 0 on success, otherwise error
*/
@@ -1189,6 +1199,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
/**
* ksmbd_vfs_kern_path_locked() - lookup a file and get path info
+ * @work: work
* @name: file path that is relative to share
* @flags: lookup flags
* @parent_path: if lookup succeed, return parent_path info
@@ -1636,6 +1647,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
* ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format
* @p: destination buffer
* @ksmbd_kstat: ksmbd kstat wrapper
+ *
+ * Returns: pointer to the converted &struct file_directory_info
*/
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat)
{
diff --git a/fs/sysctls.c b/fs/sysctls.c
index 76a0aee8c229..8dbde9a802fa 100644
--- a/fs/sysctls.c
+++ b/fs/sysctls.c
@@ -26,7 +26,6 @@ static struct ctl_table fs_shared_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_MAXOLDUID,
},
- { }
};
static int __init init_fs_sysctls(void)
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b6b6796e1616..4df2afa551dc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -81,7 +81,7 @@ void sysfs_remove_dir(struct kobject *kobj)
struct kernfs_node *kn = kobj->sd;
/*
- * In general, kboject owner is responsible for ensuring removal
+ * In general, kobject owner is responsible for ensuring removal
* doesn't race with other operations and sysfs doesn't provide any
* protection; however, when @kobj is used as a symlink target, the
* symlinking entity usually doesn't own @kobj and thus has no
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index f0677ea0ec24..110e8a272189 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -32,6 +32,18 @@
*/
static DEFINE_MUTEX(eventfs_mutex);
+/* Choose something "unique" ;-) */
+#define EVENTFS_FILE_INODE_INO 0x12c4e37
+
+/* Just try to make something consistent and unique */
+static int eventfs_dir_ino(struct eventfs_inode *ei)
+{
+ if (!ei->ino)
+ ei->ino = get_next_ino();
+
+ return ei->ino;
+}
+
/*
* The eventfs_inode (ei) itself is protected by SRCU. It is released from
* its parent's list and will have is_freed set (under eventfs_mutex).
@@ -45,16 +57,55 @@ enum {
EVENTFS_SAVE_MODE = BIT(16),
EVENTFS_SAVE_UID = BIT(17),
EVENTFS_SAVE_GID = BIT(18),
+ EVENTFS_TOPLEVEL = BIT(19),
};
#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
+/*
+ * eventfs_inode reference count management.
+ *
+ * NOTE! We count only references from dentries, in the
+ * form 'dentry->d_fsdata'. There are also references from
+ * directory inodes ('ti->private'), but the dentry reference
+ * count is always a superset of the inode reference count.
+ */
+static void release_ei(struct kref *ref)
+{
+ struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+
+ WARN_ON_ONCE(!ei->is_freed);
+
+ kfree(ei->entry_attrs);
+ kfree_const(ei->name);
+ kfree_rcu(ei, rcu);
+}
+
+static inline void put_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_put(&ei->kref, release_ei);
+}
+
+static inline void free_ei(struct eventfs_inode *ei)
+{
+ if (ei) {
+ ei->is_freed = 1;
+ put_ei(ei);
+ }
+}
+
+static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_get(&ei->kref);
+ return ei;
+}
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags);
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
-static int eventfs_release(struct inode *inode, struct file *file);
+static int eventfs_iterate(struct file *file, struct dir_context *ctx);
static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
{
@@ -94,7 +145,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
/* Preallocate the children mode array if necessary */
if (!(dentry->d_inode->i_mode & S_IFDIR)) {
if (!ei->entry_attrs) {
- ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs),
GFP_NOFS);
if (!ei->entry_attrs) {
ret = -ENOMEM;
@@ -117,10 +168,17 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* The events directory dentry is never freed, unless its
* part of an instance that is deleted. It's attr is the
* default for its child files and directories.
- * Do not update it. It's not used for its own mode or ownership
+ * Do not update it. It's not used for its own mode or ownership.
*/
- if (!ei->is_events)
+ if (ei->is_events) {
+ /* But it still needs to know if it was modified */
+ if (iattr->ia_valid & ATTR_UID)
+ ei->attr.mode |= EVENTFS_SAVE_UID;
+ if (iattr->ia_valid & ATTR_GID)
+ ei->attr.mode |= EVENTFS_SAVE_GID;
+ } else {
update_attr(&ei->attr, iattr);
+ }
} else {
name = dentry->d_name.name;
@@ -138,9 +196,63 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
+static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb)
+{
+ struct inode *root;
+
+ /* Only update if the "events" was on the top level */
+ if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ /* Get the tracefs root inode. */
+ root = d_inode(sb->s_root);
+ ei->attr.uid = root->i_uid;
+ ei->attr.gid = root->i_gid;
+}
+
+static void set_top_events_ownership(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct eventfs_inode *ei = ti->private;
+
+ /* The top events directory doesn't get automatically updated */
+ if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ update_top_events_attr(ei, inode->i_sb);
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_UID))
+ inode->i_uid = ei->attr.uid;
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_GID))
+ inode->i_gid = ei->attr.gid;
+}
+
+static int eventfs_get_attr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_backing_inode(dentry);
+
+ set_top_events_ownership(inode);
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int eventfs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_top_events_ownership(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
.setattr = eventfs_set_attr,
+ .getattr = eventfs_get_attr,
+ .permission = eventfs_permission,
};
static const struct inode_operations eventfs_file_inode_operations = {
@@ -148,11 +260,9 @@ static const struct inode_operations eventfs_file_inode_operations = {
};
static const struct file_operations eventfs_file_operations = {
- .open = dcache_dir_open_wrapper,
.read = generic_read_dir,
- .iterate_shared = dcache_readdir_wrapper,
+ .iterate_shared = eventfs_iterate,
.llseek = generic_file_llseek,
- .release = eventfs_release,
};
/* Return the evenfs_inode of the "events" directory */
@@ -160,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
{
struct eventfs_inode *ei;
- mutex_lock(&eventfs_mutex);
do {
- /* The parent always has an ei, except for events itself */
- ei = dentry->d_parent->d_fsdata;
+ // The parent is stable because we do not do renames
+ dentry = dentry->d_parent;
+ // ... and directories always have d_fsdata
+ ei = dentry->d_fsdata;
/*
* If the ei is being freed, the ownership of the children
@@ -173,10 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
ei = NULL;
break;
}
-
- dentry = ei->dentry;
+ // Walk upwards until you find the events inode
} while (!ei->is_events);
- mutex_unlock(&eventfs_mutex);
+
+ update_top_events_attr(ei, dentry->d_sb);
return ei;
}
@@ -206,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
inode->i_gid = attr->gid;
}
-static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
-{
- struct eventfs_inode *ei_child;
-
- /* at most we have events/system/event */
- if (WARN_ON_ONCE(level > 3))
- return;
-
- ei->attr.gid = gid;
-
- if (ei->entry_attrs) {
- for (int i = 0; i < ei->nr_entries; i++) {
- ei->entry_attrs[i].gid = gid;
- }
- }
-
- /*
- * Only eventfs_inode with dentries are updated, make sure
- * all eventfs_inodes are updated. If one of the children
- * do not have a dentry, this function must traverse it.
- */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- if (!ei_child->dentry)
- update_gid(ei_child, gid, level + 1);
- }
-}
-
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
-{
- struct eventfs_inode *ei = dentry->d_fsdata;
- int idx;
-
- idx = srcu_read_lock(&eventfs_srcu);
- update_gid(ei, gid, 0);
- srcu_read_unlock(&eventfs_srcu, idx);
-}
-
/**
- * create_file - create a file in the tracefs filesystem
- * @name: the name of the file to create.
+ * lookup_file - look up a file in the tracefs filesystem
+ * @dentry: the dentry to look up
* @mode: the permission that the file should have.
* @attr: saved attributes changed by user
- * @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
@@ -257,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
* directory. The inode.i_private pointer will point to @data in the open()
* call.
*/
-static struct dentry *create_file(const char *name, umode_t mode,
+static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
+ struct dentry *dentry,
+ umode_t mode,
struct eventfs_attr *attr,
- struct dentry *parent, void *data,
+ void *data,
const struct file_operations *fop)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
if (!(mode & S_IFMT))
mode |= S_IFREG;
if (WARN_ON_ONCE(!S_ISREG(mode)))
- return NULL;
-
- WARN_ON_ONCE(!parent);
- dentry = eventfs_start_creating(name, parent);
-
- if (IS_ERR(dentry))
- return dentry;
+ return ERR_PTR(-EIO);
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, attr, mode);
@@ -289,34 +356,36 @@ static struct dentry *create_file(const char *name, umode_t mode,
inode->i_fop = fop;
inode->i_private = data;
+ /* All files will have the same inode number */
+ inode->i_ino = EVENTFS_FILE_INODE_INO;
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
- d_instantiate(dentry, inode);
- fsnotify_create(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+
+ // Files have their parent's ei as their fsdata
+ dentry->d_fsdata = get_ei(parent_ei);
+
+ d_add(dentry, inode);
+ return NULL;
};
/**
- * create_dir - create a dir in the tracefs filesystem
+ * lookup_dir_entry - look up a dir in the tracefs filesystem
+ * @dentry: the directory to look up
* @ei: the eventfs_inode that represents the directory to create
- * @parent: parent dentry for this file.
*
- * This function will create a dentry for a directory represented by
+ * This function will look up a dentry for a directory represented by
* a eventfs_inode.
*/
-static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
+static struct dentry *lookup_dir_entry(struct dentry *dentry,
+ struct eventfs_inode *pei, struct eventfs_inode *ei)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(ei->name, parent);
- if (IS_ERR(dentry))
- return dentry;
-
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, &ei->attr,
@@ -325,247 +394,72 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ /* All directories will have the same inode number */
+ inode->i_ino = eventfs_dir_ino(ei);
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
+ /* Only directories have ti->private set to an ei, not files */
+ ti->private = ei;
- inc_nlink(inode);
- d_instantiate(dentry, inode);
- inc_nlink(dentry->d_parent->d_inode);
- fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+ dentry->d_fsdata = get_ei(ei);
+
+ d_add(dentry, inode);
+ return NULL;
}
-static void free_ei(struct eventfs_inode *ei)
+static inline struct eventfs_inode *alloc_ei(const char *name)
{
- kfree_const(ei->name);
- kfree(ei->d_children);
- kfree(ei->entry_attrs);
- kfree(ei);
+ struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+
+ if (!ei)
+ return NULL;
+
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
+ return NULL;
+ }
+ kref_init(&ei->kref);
+ return ei;
}
/**
- * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
- * @ti: the tracefs_inode of the dentry
+ * eventfs_d_release - dentry is going away
* @dentry: dentry which has the reference to remove.
*
* Remove the association between a dentry from an eventfs_inode.
*/
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_d_release(struct dentry *dentry)
{
- struct eventfs_inode *ei;
- int i;
-
- mutex_lock(&eventfs_mutex);
-
- ei = dentry->d_fsdata;
- if (!ei)
- goto out;
-
- /* This could belong to one of the files of the ei */
- if (ei->dentry != dentry) {
- for (i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i] == dentry)
- break;
- }
- if (WARN_ON_ONCE(i == ei->nr_entries))
- goto out;
- ei->d_children[i] = NULL;
- } else if (ei->is_freed) {
- free_ei(ei);
- } else {
- ei->dentry = NULL;
- }
-
- dentry->d_fsdata = NULL;
- out:
- mutex_unlock(&eventfs_mutex);
+ put_ei(dentry->d_fsdata);
}
/**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - create a dentry for a file of an eventfs_inode
* @ei: the eventfs_inode that the file will be created under
- * @idx: the index into the d_children[] of the @ei
+ * @idx: the index into the entry_attrs[] of the @ei
* @parent: The parent dentry of the created file.
* @name: The name of the file to create
* @mode: The mode of the file.
* @data: The data to use to set the inode of the file with on open()
* @fops: The fops of the file to be created.
- * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
*
* Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry. If the @e_dentry already has a dentry, then
- * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ * address located at @e_dentry.
*/
static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
- struct dentry *parent, const char *name, umode_t mode, void *data,
- const struct file_operations *fops, bool lookup)
+lookup_file_dentry(struct dentry *dentry,
+ struct eventfs_inode *ei, int idx,
+ umode_t mode, void *data,
+ const struct file_operations *fops)
{
struct eventfs_attr *attr = NULL;
- struct dentry **e_dentry = &ei->d_children[idx];
- struct dentry *dentry;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
- mutex_lock(&eventfs_mutex);
- if (ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- /* If the e_dentry already has a dentry, use it */
- if (*e_dentry) {
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(*e_dentry);
- mutex_unlock(&eventfs_mutex);
- return *e_dentry;
- }
-
- /* ei->entry_attrs are protected by SRCU */
if (ei->entry_attrs)
attr = &ei->entry_attrs[idx];
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_file(name, mode, attr, parent, data, fops);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry)) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed, don't return it. If e_dentry is NULL
- * it means it was already freed.
- */
- if (ei->is_freed)
- dentry = NULL;
- else
- dentry = *e_dentry;
- /* The lookup does not need to up the dentry refcount */
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!*e_dentry && !ei->is_freed) {
- *e_dentry = dentry;
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
- struct eventfs_inode *ei_child;
- struct tracefs_inode *ti;
-
- lockdep_assert_held(&eventfs_mutex);
-
- /* srcu lock already held */
- /* fill parent-child relation */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- ei_child->d_parent = ei->dentry;
- }
-
- ti = get_tracefs(ei->dentry->d_inode);
- ti->private = ei;
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- * @lookup: True if this is called by the lookup code
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
- struct dentry *parent, bool lookup)
-{
- struct dentry *dentry = NULL;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
- mutex_lock(&eventfs_mutex);
- if (pei->is_freed || ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- if (ei->dentry) {
- /* If the dentry already has a dentry, use it */
- dentry = ei->dentry;
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_dir(ei, parent);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed.
- */
- dentry = ei->dentry;
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!ei->dentry && !ei->is_freed) {
- ei->dentry = dentry;
- eventfs_post_create_dir(ei);
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
+ return lookup_file(ei, dentry, mode, attr, data, fops);
}
/**
@@ -582,250 +476,153 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- const struct file_operations *fops;
- const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry *ei_dentry = NULL;
- struct dentry *ret = NULL;
const char *name = dentry->d_name.name;
- bool created = false;
- umode_t mode;
- void *data;
- int idx;
- int i;
- int r;
+ struct dentry *result = NULL;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
- return NULL;
-
- /* Grab srcu to prevent the ei from going away */
- idx = srcu_read_lock(&eventfs_srcu);
+ return ERR_PTR(-EIO);
- /*
- * Grab the eventfs_mutex to consistent value from ti->private.
- * This s
- */
mutex_lock(&eventfs_mutex);
- ei = READ_ONCE(ti->private);
- if (ei && !ei->is_freed)
- ei_dentry = READ_ONCE(ei->dentry);
- mutex_unlock(&eventfs_mutex);
- if (!ei || !ei_dentry)
+ ei = ti->private;
+ if (!ei || ei->is_freed)
goto out;
- data = ei->data;
-
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
+ list_for_each_entry(ei_child, &ei->children, list) {
if (strcmp(ei_child->name, name) != 0)
continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
+ if (ei_child->is_freed)
goto out;
- create_dir_dentry(ei, ei_child, ei_dentry, true);
- created = true;
- break;
- }
-
- if (created)
+ result = lookup_dir_entry(dentry, ei, ei_child);
goto out;
-
- for (i = 0; i < ei->nr_entries; i++) {
- entry = &ei->entries[i];
- if (strcmp(name, entry->name) == 0) {
- void *cdata = data;
- mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
- mutex_unlock(&eventfs_mutex);
- if (r <= 0)
- continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
- goto out;
- create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
- fops, true);
- break;
- }
}
- out:
- srcu_read_unlock(&eventfs_srcu, idx);
- return ret;
-}
-
-struct dentry_list {
- void *cursor;
- struct dentry **dentries;
-};
-/**
- * eventfs_release - called to release eventfs file/dir
- * @inode: inode to be released
- * @file: file to be released (not used)
- */
-static int eventfs_release(struct inode *inode, struct file *file)
-{
- struct tracefs_inode *ti;
- struct dentry_list *dlist = file->private_data;
- void *cursor;
- int i;
+ for (int i = 0; i < ei->nr_entries; i++) {
+ void *data;
+ umode_t mode;
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry = &ei->entries[i];
- ti = get_tracefs(inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
+ if (strcmp(name, entry->name) != 0)
+ continue;
- if (WARN_ON_ONCE(!dlist))
- return -EINVAL;
+ data = ei->data;
+ if (entry->callback(name, &mode, &data, &fops) <= 0)
+ goto out;
- for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
- dput(dlist->dentries[i]);
+ result = lookup_file_dentry(dentry, ei, i, mode, data, fops);
+ goto out;
}
-
- cursor = dlist->cursor;
- kfree(dlist->dentries);
- kfree(dlist);
- file->private_data = cursor;
- return dcache_dir_close(inode, file);
-}
-
-static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
-{
- struct dentry **tmp;
-
- tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS);
- if (!tmp)
- return -1;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
- *dentries = tmp;
- return 0;
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return result;
}
-/**
- * dcache_dir_open_wrapper - eventfs open wrapper
- * @inode: not used
- * @file: dir to be opened (to create it's children)
- *
- * Used to dynamic create file/dir with-in @file, all the
- * file/dir will be created. If already created then references
- * will be increased
+/*
+ * Walk the children of a eventfs_inode to fill in getdents().
*/
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
+static int eventfs_iterate(struct file *file, struct dir_context *ctx)
{
const struct file_operations *fops;
+ struct inode *f_inode = file_inode(file);
const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry_list *dlist;
- struct dentry **dentries = NULL;
- struct dentry *parent = file_dentry(file);
- struct dentry *d;
- struct inode *f_inode = file_inode(file);
- const char *name = parent->d_name.name;
+ const char *name;
umode_t mode;
- void *data;
- int cnt = 0;
int idx;
- int ret;
- int i;
- int r;
+ int ret = -EINVAL;
+ int ino;
+ int i, r, c;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
- if (WARN_ON_ONCE(file->private_data))
- return -EINVAL;
+ c = ctx->pos - 2;
idx = srcu_read_lock(&eventfs_srcu);
mutex_lock(&eventfs_mutex);
ei = READ_ONCE(ti->private);
+ if (ei && ei->is_freed)
+ ei = NULL;
mutex_unlock(&eventfs_mutex);
- if (!ei) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -EINVAL;
- }
-
+ if (!ei)
+ goto out;
- data = ei->data;
+ /*
+ * Need to create the dentries and inodes to have a consistent
+ * inode number.
+ */
+ ret = 0;
- dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -ENOMEM;
- }
+ /* Start at 'c' to jump over already read entries */
+ for (i = c; i < ei->nr_entries; i++, ctx->pos++) {
+ void *cdata = ei->data;
- inode_lock(parent->d_inode);
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dir_dentry(ei, ei_child, parent, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
- }
- }
-
- for (i = 0; i < ei->nr_entries; i++) {
- void *cdata = data;
entry = &ei->entries[i];
name = entry->name;
+
mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
+ /* If ei->is_freed then just bail here, nothing more to do */
+ if (ei->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ goto out;
+ }
+ r = entry->callback(name, &mode, &cdata, &fops);
mutex_unlock(&eventfs_mutex);
if (r <= 0)
continue;
- d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
+
+ ino = EVENTFS_FILE_INODE_INO;
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_REG))
+ goto out;
+ }
+
+ /* Subtract the skipped entries above */
+ c -= min((unsigned int)c, (unsigned int)ei->nr_entries);
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+
+ if (c > 0) {
+ c--;
+ continue;
}
+
+ ctx->pos++;
+
+ if (ei_child->is_freed)
+ continue;
+
+ name = ei_child->name;
+
+ ino = eventfs_dir_ino(ei_child);
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
+ goto out_dec;
}
- inode_unlock(parent->d_inode);
+ ret = 1;
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
- ret = dcache_dir_open(inode, file);
- /*
- * dcache_dir_open() sets file->private_data to a dentry cursor.
- * Need to save that but also save all the dentries that were
- * opened by this function.
- */
- dlist->cursor = file->private_data;
- dlist->dentries = dentries;
- file->private_data = dlist;
return ret;
-}
-
-/*
- * This just sets the file->private_data back to the cursor and back.
- */
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
-{
- struct dentry_list *dlist = file->private_data;
- int ret;
- file->private_data = dlist->cursor;
- ret = dcache_readdir(file, ctx);
- dlist->cursor = file->private_data;
- file->private_data = dlist;
- return ret;
+ out_dec:
+ /* Incremented ctx->pos without adding something, reset it */
+ ctx->pos--;
+ goto out;
}
/**
@@ -872,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
if (!parent)
return ERR_PTR(-EINVAL);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
return ERR_PTR(-ENOMEM);
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name) {
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
-
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children) {
- kfree_const(ei->name);
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
- }
-
ei->entries = entries;
ei->nr_entries = size;
ei->data = data;
@@ -898,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
INIT_LIST_HEAD(&ei->list);
mutex_lock(&eventfs_mutex);
- if (!parent->is_freed) {
+ if (!parent->is_freed)
list_add_tail(&ei->list, &parent->children);
- ei->d_parent = parent->dentry;
- }
mutex_unlock(&eventfs_mutex);
/* Was the parent freed? */
@@ -941,33 +721,33 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
if (IS_ERR(dentry))
return ERR_CAST(dentry);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
- goto fail_ei;
+ goto fail;
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
goto fail;
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children)
- goto fail;
- }
-
- ei->dentry = dentry;
+ // Note: we have a ref to the dentry from tracefs_start_creating()
+ ei->events_dir = dentry;
ei->entries = entries;
ei->nr_entries = size;
ei->is_events = 1;
ei->data = data;
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name)
- goto fail;
/* Save the ownership of this directory */
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
+ /*
+ * If the events directory is of the top instance, then parent
+ * is NULL. Set the attr.mode to reflect this and its permissions will
+ * default to the tracefs root dentry.
+ */
+ if (!parent)
+ ei->attr.mode = EVENTFS_TOPLEVEL;
+
/* This is used as the default ownership of the files and directories */
ei->attr.uid = uid;
ei->attr.gid = gid;
@@ -985,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- dentry->d_fsdata = ei;
+ dentry->d_fsdata = get_ei(ei);
- /* directory inodes start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+ /*
+ * Keep all eventfs directories with i_nlink == 1.
+ * Due to the dynamic nature of the dentry creations and not
+ * wanting to add a pointer to the parent eventfs_inode in the
+ * eventfs_inode structure, keeping the i_nlink in sync with the
+ * number of directories would cause too much complexity for
+ * something not worth much. Keeping directory links at 1
+ * tells userspace not to trust the link number.
+ */
d_instantiate(dentry, inode);
+ /* The dentry of the "events" parent does keep track though */
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
tracefs_end_creating(dentry);
@@ -997,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
return ei;
fail:
- kfree(ei->d_children);
- kfree(ei);
- fail_ei:
+ free_ei(ei);
tracefs_failed_creating(dentry);
return ERR_PTR(-ENOMEM);
}
-static LLIST_HEAD(free_list);
-
-static void eventfs_workfn(struct work_struct *work)
-{
- struct eventfs_inode *ei, *tmp;
- struct llist_node *llnode;
-
- llnode = llist_del_all(&free_list);
- llist_for_each_entry_safe(ei, tmp, llnode, llist) {
- /* This dput() matches the dget() from unhook_dentry() */
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i])
- dput(ei->d_children[i]);
- }
- /* This should only get here if it had a dentry */
- if (!WARN_ON_ONCE(!ei->dentry))
- dput(ei->dentry);
- }
-}
-
-static DECLARE_WORK(eventfs_work, eventfs_workfn);
-
-static void free_rcu_ei(struct rcu_head *head)
-{
- struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
-
- if (ei->dentry) {
- /* Do not free the ei until all references of dentry are gone */
- if (llist_add(&ei->llist, &free_list))
- queue_work(system_unbound_wq, &eventfs_work);
- return;
- }
-
- /* If the ei doesn't have a dentry, neither should its children */
- for (int i = 0; i < ei->nr_entries; i++) {
- WARN_ON_ONCE(ei->d_children[i]);
- }
-
- free_ei(ei);
-}
-
-static void unhook_dentry(struct dentry *dentry)
-{
- if (!dentry)
- return;
- /*
- * Need to add a reference to the dentry that is expected by
- * simple_recursive_removal(), which will include a dput().
- */
- dget(dentry);
-
- /*
- * Also add a reference for the dput() in eventfs_workfn().
- * That is required as that dput() will free the ei after
- * the SRCU grace period is over.
- */
- dget(dentry);
-}
-
/**
* eventfs_remove_rec - remove eventfs dir or file from list
* @ei: eventfs_inode to be removed.
@@ -1075,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
struct eventfs_inode *ei_child;
- if (!ei)
- return;
/*
* Check recursion depth. It should never be greater than 3:
* 0 - events/
@@ -1088,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
return;
/* search for nested folders or files */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- lockdep_is_held(&eventfs_mutex)) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ list_for_each_entry(ei_child, &ei->children, list)
eventfs_remove_rec(ei_child, level + 1);
- }
-
-
- ei->is_freed = 1;
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i]) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(!ei->dentry);
- unhook_dentry(ei->d_children[i]);
- }
- }
-
- unhook_dentry(ei->dentry);
-
- list_del_rcu(&ei->list);
- call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
+ list_del(&ei->list);
+ free_ei(ei);
}
/**
@@ -1120,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
*/
void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct dentry *dentry;
-
if (!ei)
return;
mutex_lock(&eventfs_mutex);
- dentry = ei->dentry;
eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
-
- /*
- * If any of the ei children has a dentry, then the ei itself
- * must have a dentry.
- */
- if (dentry)
- simple_recursive_removal(dentry, NULL);
}
/**
@@ -1148,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
struct dentry *dentry;
- dentry = ei->dentry;
+ dentry = ei->events_dir;
+ if (!dentry)
+ return;
+
+ ei->events_dir = NULL;
eventfs_remove_dir(ei);
/*
@@ -1158,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
* sticks around while the other ei->dentry are created
* and destroyed dynamically.
*/
+ d_invalidate(dentry);
dput(dentry);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bc86ffdb103b..d65ffad4c327 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
if (!ti)
return NULL;
- ti->flags = 0;
-
return &ti->vfs_inode;
}
@@ -91,6 +89,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
struct inode *inode, struct dentry *dentry,
umode_t mode)
{
+ struct tracefs_inode *ti;
char *name;
int ret;
@@ -99,6 +98,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
return -ENOMEM;
/*
+ * This is a new directory that does not take the default of
+ * the rootfs. It becomes the default permissions for all the
+ * files and directories underneath it.
+ */
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_INSTANCE_INODE;
+ ti->private = inode;
+
+ /*
* The mkdir call can call the generic functions that create
* the files within the tracefs system. It is up to the individual
* mkdir routine to handle races.
@@ -141,10 +149,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
return ret;
}
-static const struct inode_operations tracefs_dir_inode_operations = {
+static void set_tracefs_inode_owner(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct inode *root_inode = ti->private;
+
+ /*
+ * If this inode has never been referenced, then update
+ * the permissions to the superblock.
+ */
+ if (!(ti->flags & TRACEFS_UID_PERM_SET))
+ inode->i_uid = root_inode->i_uid;
+
+ if (!(ti->flags & TRACEFS_GID_PERM_SET))
+ inode->i_gid = root_inode->i_gid;
+}
+
+static int tracefs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_tracefs_inode_owner(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
+static int tracefs_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *inode = d_backing_inode(path->dentry);
+
+ set_tracefs_inode_owner(inode);
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct inode *inode = d_inode(dentry);
+ struct tracefs_inode *ti = get_tracefs(inode);
+
+ if (ia_valid & ATTR_UID)
+ ti->flags |= TRACEFS_UID_PERM_SET;
+
+ if (ia_valid & ATTR_GID)
+ ti->flags |= TRACEFS_GID_PERM_SET;
+
+ return simple_setattr(idmap, dentry, attr);
+}
+
+static const struct inode_operations tracefs_instance_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = tracefs_syscall_mkdir,
.rmdir = tracefs_syscall_rmdir,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_file_inode_operations = {
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
};
struct inode *tracefs_get_inode(struct super_block *sb)
@@ -183,87 +257,6 @@ struct tracefs_fs_info {
struct tracefs_mount_opts mount_opts;
};
-static void change_gid(struct dentry *dentry, kgid_t gid)
-{
- if (!dentry->d_inode)
- return;
- dentry->d_inode->i_gid = gid;
-}
-
-/*
- * Taken from d_walk, but without he need for handling renames.
- * Nothing can be renamed while walking the list, as tracefs
- * does not support renames. This is only called when mounting
- * or remounting the file system, to set all the files to
- * the given gid.
- */
-static void set_gid(struct dentry *parent, kgid_t gid)
-{
- struct dentry *this_parent;
- struct list_head *next;
-
- this_parent = parent;
- spin_lock(&this_parent->d_lock);
-
- change_gid(this_parent, gid);
-repeat:
- next = this_parent->d_subdirs.next;
-resume:
- while (next != &this_parent->d_subdirs) {
- struct tracefs_inode *ti;
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
- next = tmp->next;
-
- /* Note, getdents() can add a cursor dentry with no inode */
- if (!dentry->d_inode)
- continue;
-
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-
- change_gid(dentry, gid);
-
- /* If this is the events directory, update that too */
- ti = get_tracefs(dentry->d_inode);
- if (ti && (ti->flags & TRACEFS_EVENT_INODE))
- eventfs_update_gid(dentry, gid);
-
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, _RET_IP_);
- this_parent = dentry;
- spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
- goto repeat;
- }
- spin_unlock(&dentry->d_lock);
- }
- /*
- * All done at this level ... ascend and resume the search.
- */
- rcu_read_lock();
-ascend:
- if (this_parent != parent) {
- struct dentry *child = this_parent;
- this_parent = child->d_parent;
-
- spin_unlock(&child->d_lock);
- spin_lock(&this_parent->d_lock);
-
- /* go into the first sibling still alive */
- do {
- next = child->d_child.next;
- if (next == &this_parent->d_subdirs)
- goto ascend;
- child = list_entry(next, struct dentry, d_child);
- } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
- rcu_read_unlock();
- goto resume;
- }
- rcu_read_unlock();
- spin_unlock(&this_parent->d_lock);
- return;
-}
-
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
@@ -336,10 +329,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
if (!remount || opts->opts & BIT(Opt_uid))
inode->i_uid = opts->uid;
- if (!remount || opts->opts & BIT(Opt_gid)) {
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
- }
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
return 0;
}
@@ -386,21 +377,30 @@ static const struct super_operations tracefs_super_operations = {
.show_options = tracefs_show_options,
};
-static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+/*
+ * It would be cleaner if eventfs had its own dentry ops.
+ *
+ * Note that d_revalidate is called potentially under RCU,
+ * so it can't take the eventfs mutex etc. It's fine - if
+ * we open a file just as it's marked dead, things will
+ * still work just fine, and just see the old stale case.
+ */
+static void tracefs_d_release(struct dentry *dentry)
{
- struct tracefs_inode *ti;
+ if (dentry->d_fsdata)
+ eventfs_d_release(dentry);
+}
- if (!dentry || !inode)
- return;
+static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct eventfs_inode *ei = dentry->d_fsdata;
- ti = get_tracefs(inode);
- if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ei_status_free(ti, dentry);
- iput(inode);
+ return !(ei && ei->is_freed);
}
static const struct dentry_operations tracefs_dentry_operations = {
- .d_iput = tracefs_dentry_iput,
+ .d_revalidate = tracefs_d_revalidate,
+ .d_release = tracefs_d_release,
};
static int trace_fill_super(struct super_block *sb, void *data, int silent)
@@ -504,73 +504,24 @@ struct dentry *tracefs_end_creating(struct dentry *dentry)
return dentry;
}
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
+/* Find the inode that this will use for default */
+static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
{
- struct dentry *dentry;
- int error;
-
- /* Must always have a parent. */
- if (WARN_ON_ONCE(!parent))
- return ERR_PTR(-EINVAL);
-
- error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
- &tracefs_mount_count);
- if (error)
- return ERR_PTR(error);
+ struct tracefs_inode *ti;
- if (unlikely(IS_DEADDIR(parent->d_inode)))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
+ /* If parent is NULL then use root inode */
+ if (!parent)
+ return d_inode(inode->i_sb->s_root);
- if (!IS_ERR(dentry) && dentry->d_inode) {
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
+ /* Find the inode that is flagged as an instance or the root inode */
+ while (!IS_ROOT(parent)) {
+ ti = get_tracefs(d_inode(parent));
+ if (ti->flags & TRACEFS_INSTANCE_INODE)
+ break;
+ parent = parent->d_parent;
}
- if (IS_ERR(dentry))
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
- return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
- dput(dentry);
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
- return dentry;
+ return d_inode(parent);
}
/**
@@ -603,6 +554,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
@@ -621,7 +573,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return tracefs_failed_creating(dentry);
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
inode->i_mode = mode;
+ inode->i_op = &tracefs_file_inode_operations;
inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
@@ -634,6 +590,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
static struct dentry *__create_dir(const char *name, struct dentry *parent,
const struct inode_operations *ops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry = tracefs_start_creating(name, parent);
struct inode *inode;
@@ -651,6 +608,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -681,7 +641,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
- return __create_dir(name, parent, &simple_dir_inode_operations);
+ return __create_dir(name, parent, &tracefs_dir_inode_operations);
}
/**
@@ -712,7 +672,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
return NULL;
- dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+ dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations);
if (!dentry)
return NULL;
@@ -757,7 +717,11 @@ static void init_once(void *foo)
{
struct tracefs_inode *ti = (struct tracefs_inode *) foo;
+ /* inode_init_once() calls memset() on the vfs_inode portion */
inode_init_once(&ti->vfs_inode);
+
+ /* Zero out the rest */
+ memset_after(ti, 0, vfs_inode);
}
static int __init tracefs_init(void)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 42bdeb471a07..beb3dcd0e434 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -5,12 +5,16 @@
enum {
TRACEFS_EVENT_INODE = BIT(1),
TRACEFS_EVENT_TOP_INODE = BIT(2),
+ TRACEFS_GID_PERM_SET = BIT(3),
+ TRACEFS_UID_PERM_SET = BIT(4),
+ TRACEFS_INSTANCE_INODE = BIT(5),
};
struct tracefs_inode {
+ struct inode vfs_inode;
+ /* The below gets initialized with memset_after(ti, 0, vfs_inode) */
unsigned long flags;
void *private;
- struct inode vfs_inode;
};
/*
@@ -28,42 +32,37 @@ struct eventfs_attr {
/*
* struct eventfs_inode - hold the properties of the eventfs directories.
* @list: link list into the parent directory
+ * @rcu: Union with @list for freeing
+ * @children: link list into the child eventfs_inode
* @entries: the array of entries representing the files in the directory
* @name: the name of the directory to create
- * @children: link list into the child eventfs_inode
- * @dentry: the dentry of the directory
- * @d_parent: pointer to the parent's dentry
- * @d_children: The array of dentries to represent the files when created
+ * @events_dir: the dentry of the events directory
* @entry_attrs: Saved mode and ownership of the @d_children
- * @attr: Saved mode and ownership of eventfs_inode itself
* @data: The private data to pass to the callbacks
+ * @attr: Saved mode and ownership of eventfs_inode itself
* @is_freed: Flag set if the eventfs is on its way to be freed
* Note if is_freed is set, then dentry is corrupted.
+ * @is_events: Flag set for only the top level "events" directory
* @nr_entries: The number of items in @entries
+ * @ino: The saved inode number
*/
struct eventfs_inode {
- struct list_head list;
+ union {
+ struct list_head list;
+ struct rcu_head rcu;
+ };
+ struct list_head children;
const struct eventfs_entry *entries;
const char *name;
- struct list_head children;
- struct dentry *dentry; /* Check is_freed to access */
- struct dentry *d_parent;
- struct dentry **d_children;
+ struct dentry *events_dir;
struct eventfs_attr *entry_attrs;
- struct eventfs_attr attr;
void *data;
- /*
- * Union - used for deletion
- * @llist: for calling dput() if needed after RCU
- * @rcu: eventfs_inode to delete in RCU
- */
- union {
- struct llist_node llist;
- struct rcu_head rcu;
- };
+ struct eventfs_attr attr;
+ struct kref kref;
unsigned int is_freed:1;
unsigned int is_events:1;
unsigned int nr_entries:30;
+ unsigned int ino;
};
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
@@ -75,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
struct dentry *tracefs_end_creating(struct dentry *dentry);
struct dentry *tracefs_failed_creating(struct dentry *dentry);
struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+void eventfs_d_release(struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 0d561ecb6869..a4a0158f712d 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -18,7 +18,7 @@
#include "ubifs.h"
/**
- * ubifs_node_calc_hash - calculate the hash of a UBIFS node
+ * __ubifs_node_calc_hash - calculate the hash of a UBIFS node
* @c: UBIFS file-system description object
* @node: the node to calculate a hash for
* @hash: the returned hash
@@ -507,28 +507,13 @@ out:
*/
int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
{
- SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
- int err;
const char well_known_message[] = "UBIFS";
if (!ubifs_authenticated(c))
return 0;
- shash->tfm = c->hmac_tfm;
-
- err = crypto_shash_init(shash);
- if (err)
- return err;
-
- err = crypto_shash_update(shash, well_known_message,
- sizeof(well_known_message) - 1);
- if (err < 0)
- return err;
-
- err = crypto_shash_final(shash, hmac);
- if (err)
- return err;
- return 0;
+ return crypto_shash_tfm_digest(c->hmac_tfm, well_known_message,
+ sizeof(well_known_message) - 1, hmac);
}
/*
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index c4fc1047fc07..5b3a840098b0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -70,18 +70,29 @@ static int nothing_to_commit(struct ubifs_info *c)
return 0;
/*
+ * Increasing @c->dirty_pn_cnt/@c->dirty_nn_cnt and marking
+ * nnodes/pnodes as dirty in run_gc() could race with following
+ * checking, which leads inconsistent states between @c->nroot
+ * and @c->dirty_pn_cnt/@c->dirty_nn_cnt, holding @c->lp_mutex
+ * to avoid that.
+ */
+ mutex_lock(&c->lp_mutex);
+ /*
* Even though the TNC is clean, the LPT tree may have dirty nodes. For
* example, this may happen if the budgeting subsystem invoked GC to
* make some free space, and the GC found an LEB with only dirty and
* free space. In this case GC would just change the lprops of this
* LEB (by turning all space into free space) and unmap it.
*/
- if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+ if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) {
+ mutex_unlock(&c->lp_mutex);
return 0;
+ }
ubifs_assert(c, atomic_long_read(&c->dirty_zn_cnt) == 0);
ubifs_assert(c, c->dirty_pn_cnt == 0);
ubifs_assert(c, c->dirty_nn_cnt == 0);
+ mutex_unlock(&c->lp_mutex);
return 1;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 3b13c648d490..e413a9cf8ee3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1234,6 +1234,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2d2b39f843ce..5029eb3390a5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -318,8 +318,9 @@ static int write_begin_slow(struct address_space *mapping,
* This is a helper function for 'ubifs_write_begin()' which allocates budget
* for the operation. The budget is allocated differently depending on whether
* this is appending, whether the page is dirty or not, and so on. This
- * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
- * in case of success and %-ENOSPC in case of failure.
+ * function leaves the @ui->ui_mutex locked in case of appending.
+ *
+ * Returns: %0 in case of success and %-ENOSPC in case of failure.
*/
static int allocate_budget(struct ubifs_info *c, struct page *page,
struct ubifs_inode *ui, int appending)
@@ -600,7 +601,7 @@ out:
* @bu: bulk-read information
* @n: next zbranch slot
*
- * This function returns %0 on success and a negative error code on failure.
+ * Returns: %0 on success and a negative error code on failure.
*/
static int populate_page(struct ubifs_info *c, struct page *page,
struct bu_info *bu, int *n)
@@ -711,7 +712,7 @@ out_err:
* @bu: bulk-read information
* @page1: first page to read
*
- * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ * Returns: %1 if the bulk-read is done, otherwise %0 is returned.
*/
static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
struct page *page1)
@@ -821,7 +822,9 @@ out_bu_off:
* Some flash media are capable of reading sequentially at faster rates. UBIFS
* bulk-read facility is designed to take advantage of that, by reading in one
* go consecutive data nodes that are also located consecutively in the same
- * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ * LEB.
+ *
+ * Returns: %1 if a bulk-read is done and %0 otherwise.
*/
static int ubifs_bulk_read(struct page *page)
{
@@ -1109,7 +1112,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call when the inode is truncated
- * to a smaller size. Returns zero in case of success and a negative error code
+ * to a smaller size.
+ *
+ * Returns: %0 in case of success and a negative error code
* in case of failure.
*/
static int do_truncation(struct ubifs_info *c, struct inode *inode,
@@ -1215,7 +1220,9 @@ out_budg:
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call for all cases except
- * truncations to smaller size. Returns zero in case of success and a negative
+ * truncations to smaller size.
+ *
+ * Returns: %0 in case of success and a negative
* error code in case of failure.
*/
static int do_setattr(struct ubifs_info *c, struct inode *inode,
@@ -1360,6 +1367,8 @@ out:
* This helper function checks if the inode mtime/ctime should be updated or
* not. If current values of the time-stamps are within the UBIFS inode time
* granularity, they are not updated. This is an optimization.
+ *
+ * Returns: %1 if time update is needed, %0 if not
*/
static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
@@ -1375,11 +1384,12 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
- * @time: timespec structure to hold the current time value
* @flags: time updating control flag determines updating
* which time fields of @inode
*
* This function updates time of the inode.
+ *
+ * Returns: %0 for success or a negative error code otherwise.
*/
int ubifs_update_time(struct inode *inode, int flags)
{
@@ -1413,7 +1423,9 @@ int ubifs_update_time(struct inode *inode, int flags)
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
- * current time. Returns zero in case of success and a negative error code in
+ * current time.
+ *
+ * Returns: %0 in case of success and a negative error code in
* case of failure.
*/
static int update_mctime(struct inode *inode)
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index c59d47fe7939..17da28d6247a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -365,6 +365,7 @@ static void destroy_replay_list(struct ubifs_info *c)
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @sqnum: sequence number
* @deletion: non-zero if this is a deletion
@@ -417,6 +418,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @name: directory entry name
* @nlen: directory entry name length
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 3508ac484da3..1bb6ed948927 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -125,8 +125,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
udf_fiiter_release(&iter);
inode = udf_iget(dir->i_sb, &loc);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
return d_splice_alias(inode, dentry);
@@ -230,8 +228,6 @@ static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry,
char name[UDF_NAME_LEN_CS0];
if (dentry) {
- if (!dentry->d_name.len)
- return -EINVAL;
namelen = udf_put_filename(dir->i_sb, dentry->d_name.name,
dentry->d_name.len,
name, UDF_NAME_LEN_CS0);
@@ -766,7 +762,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
struct udf_fileident_iter oiter, niter, diriter;
- bool has_diriter = false;
+ bool has_diriter = false, is_dir = false;
int retval;
struct kernel_lb_addr tloc;
@@ -789,6 +785,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!empty_dir(new_inode))
goto out_oiter;
}
+ is_dir = true;
+ }
+ if (is_dir && old_dir != new_dir) {
retval = udf_fiiter_find_entry(old_inode, &dotdot_name,
&diriter);
if (retval == -ENOENT) {
@@ -878,7 +877,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_dir_entry_len(&diriter.fi));
udf_fiiter_write_fi(&diriter, NULL);
udf_fiiter_release(&diriter);
+ }
+ if (is_dir) {
inode_dec_link_count(old_dir);
if (new_inode)
inode_dec_link_count(new_inode);
@@ -899,7 +900,6 @@ out_oiter:
static struct dentry *udf_get_parent(struct dentry *child)
{
struct kernel_lb_addr tloc;
- struct inode *inode = NULL;
struct udf_fileident_iter iter;
int err;
@@ -909,11 +909,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
tloc = lelb_to_cpu(iter.fi.icb.extLocation);
udf_fiiter_release(&iter);
- inode = udf_iget(child->d_sb, &tloc);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- return d_obtain_alias(inode);
+ return d_obtain_alias(udf_iget(child->d_sb, &tloc));
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 6e2a4d6a0d8f..959551ff9a95 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -45,7 +45,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
#endif
@@ -1033,7 +1032,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
{
int fd;
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -2261,7 +2260,8 @@ static int new_userfaultfd(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
diff --git a/fs/vboxsf/vboxsf_wrappers.c b/fs/vboxsf/vboxsf_wrappers.c
index bfc78a097dae..5e8d4359e171 100644
--- a/fs/vboxsf/vboxsf_wrappers.c
+++ b/fs/vboxsf/vboxsf_wrappers.c
@@ -114,7 +114,7 @@ int vboxsf_unmap_folder(u32 root)
* vboxsf_create - Create a new file or folder
* @root: Root of the shared folder in which to create the file
* @parsed_path: The path of the file or folder relative to the shared folder
- * @param: create_parms Parameters for file/folder creation.
+ * @create_parms: Parameters for file/folder creation.
*
* Create a new file or folder or open an existing one in a shared folder.
* Note this function always returns 0 / success unless an exceptional condition
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index d071a6e32581..a6a6b2749241 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -100,6 +100,16 @@ fsverity_msg(const struct inode *inode, const char *level,
#define fsverity_err(inode, fmt, ...) \
fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+/* measure.c */
+
+#ifdef CONFIG_BPF_SYSCALL
+void __init fsverity_init_bpf(void);
+#else
+static inline void fsverity_init_bpf(void)
+{
+}
+#endif
+
/* open.c */
int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
diff --git a/fs/verity/init.c b/fs/verity/init.c
index a29f062f6047..cb2c9aac61ed 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -24,7 +24,6 @@ static struct ctl_table fsverity_sysctl_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
- { }
};
static void __init fsverity_init_sysctl(void)
@@ -69,6 +68,7 @@ static int __init fsverity_init(void)
fsverity_init_workqueue();
fsverity_init_sysctl();
fsverity_init_signature();
+ fsverity_init_bpf();
return 0;
}
late_initcall(fsverity_init)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index eec5956141da..bf7a5f4cccaf 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -7,6 +7,8 @@
#include "fsverity_private.h"
+#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/uaccess.h>
/**
@@ -100,3 +102,85 @@ int fsverity_get_digest(struct inode *inode,
return hash_alg->digest_size;
}
EXPORT_SYMBOL_GPL(fsverity_get_digest);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+/* bpf kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_fsverity_digest: read fsverity digest of file
+ * @file: file to get digest from
+ * @digest_ptr: (out) dynptr for struct fsverity_digest
+ *
+ * Read fsverity_digest of *file* into *digest_ptr*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_kern *digest_ptr)
+{
+ const struct inode *inode = file_inode(file);
+ u32 dynptr_sz = __bpf_dynptr_size(digest_ptr);
+ struct fsverity_digest *arg;
+ const struct fsverity_info *vi;
+ const struct fsverity_hash_alg *hash_alg;
+ int out_digest_sz;
+
+ if (dynptr_sz < sizeof(struct fsverity_digest))
+ return -EINVAL;
+
+ arg = __bpf_dynptr_data_rw(digest_ptr, dynptr_sz);
+ if (!arg)
+ return -EINVAL;
+
+ if (!IS_ALIGNED((uintptr_t)arg, __alignof__(*arg)))
+ return -EINVAL;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+
+ hash_alg = vi->tree_params.hash_alg;
+
+ arg->digest_algorithm = hash_alg - fsverity_hash_algs;
+ arg->digest_size = hash_alg->digest_size;
+
+ out_digest_sz = dynptr_sz - sizeof(struct fsverity_digest);
+
+ /* copy digest */
+ memcpy(arg->digest, vi->file_digest, min_t(int, hash_alg->digest_size, out_digest_sz));
+
+ /* fill the extra buffer with zeros */
+ if (out_digest_sz > hash_alg->digest_size)
+ memset(arg->digest + arg->digest_size, 0, out_digest_sz - hash_alg->digest_size);
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fsverity_set_ids)
+BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
+BTF_SET8_END(fsverity_set_ids)
+
+static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fsverity_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fsverity_set = {
+ .owner = THIS_MODULE,
+ .set = &fsverity_set_ids,
+ .filter = bpf_get_fsverity_digest_filter,
+};
+
+void __init fsverity_init_bpf(void)
+{
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fsverity_set);
+}
+
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 9976a00a73f9..e965a48e7db9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
args->op_flags &= ~XFS_DA_OP_REPLACE;
- if (do_replace) {
- args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (do_replace)
return replace_state;
- }
+
return XFS_DAS_DONE;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 98aaca933bdd..f362345467fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account(
struct xfs_bmalloca *ap)
{
bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->flags & XFS_BMAPI_ATTRFORK);
+ !(ap->flags & XFS_BMAPI_ATTRFORK);
uint fld;
if (ap->flags & XFS_BMAPI_COWFORK) {
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 31100120b2c5..e31663cb7b43 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount(
}
/*
- * Compute the maximum level number of the realtime summary file, as defined by
- * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
- * use of rt volumes with more than 2^32 extents.
- */
-uint8_t
-xfs_compute_rextslog(
- xfs_rtbxlen_t rtextents)
-{
- if (!rtextents)
- return 0;
- return xfs_highbit64(rtextents);
-}
-
-/*
* Compute the number of rtbitmap words needed to populate every block of a
* bitmap that is large enough to track the given number of rt extents.
*/
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 274dc7dae1fa..152a66750af5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -351,20 +351,6 @@ xfs_rtfree_extent(
int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
xfs_filblks_t rtlen);
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
-
-/* Do we support an rt volume having this number of rtextents? */
-static inline bool
-xfs_validate_rtextents(
- xfs_rtbxlen_t rtextents)
-{
- /* No runt rt volumes */
- if (rtextents == 0)
- return false;
-
- return true;
-}
-
xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
rtextents);
unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
@@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
# define xfs_rtbuf_cache_relse(a) (0)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
-# define xfs_compute_rextslog(rtx) (0)
-# define xfs_validate_rtextents(rtx) (false)
static inline xfs_filblks_t
xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
{
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4a9e8588f4c9..5bb6e2bd6dee 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry(
}
return true;
}
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
__s64 sunit, __s64 swidth, int sectorsize, bool silent);
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 20b5375f2d9c..62e02d5380ad 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca9977652..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index fabd0ed9dfa6..b1ff4f33324a 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,6 +16,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aff20ddd4a9f..5a2512d20bd0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1496,6 +1496,18 @@ xfs_fs_fill_super(
mp->m_super = sb;
+ /*
+ * Copy VFS mount flags from the context now that all parameter parsing
+ * is guaranteed to have been completed by either the old mount API or
+ * the newer fsopen/fsconfig API.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_features |= XFS_FEAT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_features |= XFS_FEAT_WSYNC;
+
error = xfs_fs_validate_params(mp);
if (error)
return error;
@@ -1965,6 +1977,11 @@ static const struct fs_context_operations xfs_context_ops = {
.free = xfs_fs_free,
};
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
static int xfs_init_fs_context(
struct fs_context *fc)
{
@@ -1996,16 +2013,6 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (fc->sb_flags & SB_RDONLY)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
- if (fc->sb_flags & SB_DIRSYNC)
- mp->m_features |= XFS_FEAT_DIRSYNC;
- if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_features |= XFS_FEAT_WSYNC;
-
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fade33735393..a191f6560f98 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = {
.extra2 = &xfs_params.stats_clear.max
},
#endif /* CONFIG_PROC_FS */
-
- {}
};
int
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index e6a75401677d..93971742613a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -747,8 +747,6 @@ static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
inode = zonefs_get_dir_inode(dir, dentry);
else
inode = zonefs_get_file_inode(dir, dentry);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
return d_splice_alias(inode, dentry);
}