summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/Kconfig7
-rw-r--r--fs/bcachefs/Makefile16
-rw-r--r--fs/bcachefs/acl.c4
-rw-r--r--fs/bcachefs/acl.h4
-rw-r--r--fs/bcachefs/bcachefs.h1
-rw-r--r--fs/bcachefs/btree_gc.c6
-rw-r--r--fs/bcachefs/btree_io.c1
-rw-r--r--fs/bcachefs/btree_journal_iter.c4
-rw-r--r--fs/bcachefs/btree_node_scan.c1
-rw-r--r--fs/bcachefs/clock_types.h2
-rw-r--r--fs/bcachefs/data_update.c79
-rw-r--r--fs/bcachefs/disk_accounting.c2
-rw-r--r--fs/bcachefs/errcode.h6
-rw-r--r--fs/bcachefs/fs.c12
-rw-r--r--fs/bcachefs/inode.c4
-rw-r--r--fs/bcachefs/io_write.c115
-rw-r--r--fs/bcachefs/journal_reclaim.c7
-rw-r--r--fs/bcachefs/nocow_locking.c134
-rw-r--r--fs/bcachefs/nocow_locking.h26
-rw-r--r--fs/bcachefs/nocow_locking_types.h6
-rw-r--r--fs/bcachefs/opts.c4
-rw-r--r--fs/bcachefs/replicas_format.h3
-rw-r--r--fs/bcachefs/sb-errors_format.h3
-rw-r--r--fs/bcachefs/super.c13
-rw-r--r--fs/bcachefs/util.h1
-rw-r--r--fs/bcachefs/vendor/min_heap.c59
-rw-r--r--fs/bcachefs/vendor/min_heap.h477
-rw-r--r--fs/btrfs/block-group.c9
-rw-r--r--fs/btrfs/btrfs_inode.h2
-rw-r--r--fs/btrfs/compression.c22
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/extent_io.c40
-rw-r--r--fs/btrfs/inode.c67
-rw-r--r--fs/btrfs/qgroup.c6
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/super.c32
-rw-r--r--fs/btrfs/tree-checker.c4
-rw-r--r--fs/btrfs/tree-log.c80
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/zoned.c2
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c88
-rw-r--r--fs/ceph/mds_client.c172
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/efivarfs/super.c4
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/internal.h1
-rw-r--r--fs/erofs/super.c12
-rw-r--r--fs/erofs/xattr.c13
-rw-r--r--fs/erofs/zmap.c67
-rw-r--r--fs/exec.c2
-rw-r--r--fs/fhandle.c8
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h14
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/fuse/passthrough.c5
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/kernfs/file.c58
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c21
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c21
-rw-r--r--fs/nfs/nfs42proc.c35
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nilfs2/sysfs.c4
-rw-r--r--fs/nilfs2/sysfs.h8
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/proc/generic.c39
-rw-r--r--fs/resctrl/ctrlmondata.c2
-rw-r--r--fs/resctrl/internal.h4
-rw-r--r--fs/resctrl/monitor.c6
-rw-r--r--fs/smb/client/cifs_debug.c31
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifsfs.c14
-rw-r--r--fs/smb/client/cifsglob.h13
-rw-r--r--fs/smb/client/cifsproto.h4
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/inode.c105
-rw-r--r--fs/smb/client/misc.c36
-rw-r--r--fs/smb/client/reparse.c2
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c211
-rw-r--r--fs/smb/client/smb2misc.c19
-rw-r--r--fs/smb/client/smb2ops.c32
-rw-r--r--fs/smb/client/smb2pdu.c4
-rw-r--r--fs/smb/client/smb2proto.h3
-rw-r--r--fs/smb/client/smbdirect.c33
-rw-r--r--fs/smb/client/trace.h61
-rw-r--r--fs/smb/server/smb2pdu.c25
-rw-r--r--fs/smb/server/transport_rdma.c183
-rw-r--r--fs/smb/server/vfs_cache.h2
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c7
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_zone_alloc.c45
-rw-r--r--fs/xfs/xfs_zone_space_resv.c6
113 files changed, 2167 insertions, 851 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5455412b2b75..df383b97aacc 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -21,7 +21,6 @@ config BCACHEFS_FS
select XOR_BLOCKS
select XXHASH
select SYMBOLIC_ERRNAME
- select MIN_HEAP
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -34,7 +33,6 @@ config BCACHEFS_QUOTA
config BCACHEFS_ERASURE_CODING
bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
depends on BCACHEFS_FS
- select QUOTACTL
help
This enables the "erasure_code" filesysystem and inode option, which
organizes data into reed-solomon stripes instead of ordinary
@@ -43,11 +41,6 @@ config BCACHEFS_ERASURE_CODING
WARNING: this feature is still undergoing on disk format changes, and
should only be enabled for testing purposes.
-config BCACHEFS_POSIX_ACL
- bool "bcachefs POSIX ACL support"
- depends on BCACHEFS_FS
- select FS_POSIX_ACL
-
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1e87eee962ec..98acb3dc66a3 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -1,4 +1,9 @@
+ifdef BCACHEFS_DKMS
+ CONFIG_BCACHEFS_FS := m
+ # Enable other features here?
+endif
+
obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
@@ -99,9 +104,16 @@ bcachefs-y := \
util.o \
varint.o \
xattr.o \
- vendor/closure.o
+ vendor/closure.o \
+ vendor/min_heap.o
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+ifndef BCACHEFS_DKMS
+ obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+endif
# Silence "note: xyz changed in GCC X.X" messages
subdir-ccflags-y += $(call cc-disable-warning, psabi)
+
+ifdef BCACHEFS_DKMS
+ subdir-ccflags-y += -I$(src)
+endif
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3befa1f36e72..a1df0ec2d511 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -57,7 +57,7 @@ void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
}
}
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
#include "fs.h"
@@ -437,4 +437,4 @@ err:
return ret;
}
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index fe730a6bf0c1..01cf21c5a0b6 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -26,7 +26,7 @@ typedef struct {
void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
struct posix_acl *bch2_get_acl(struct inode *, int, bool);
@@ -55,6 +55,6 @@ static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
return 0;
}
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 3ccca855f05e..3673290586e2 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -686,6 +686,7 @@ struct btree_debug {
unsigned id;
};
+#define BCH_LINK_MAX U32_MAX
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 63dc0836bf08..638c2a9268a9 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -204,7 +204,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_eq(expected_start, cur->data->min_key))
return 0;
- prt_printf(&buf, " at ");
+ prt_printf(&buf, " at ");
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
prt_printf(&buf, ":\nparent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
@@ -229,8 +229,8 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
*pulled_from_scan = cur->data->min_key;
ret = bch_err_throw(c, topology_repair_did_fill_from_scan);
} else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
+ if (mustfix_fsck_err(trans, btree_node_topology_gap_between_nodes,
+ "gap between btree nodes%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 52d21259ed6f..3808c41dda84 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1318,6 +1318,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_error(b);
+ ret = 0;
continue;
}
if (ret)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index a6f344faf751..73fa8513ee61 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -838,8 +838,10 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
bpos_ge(k->k.p, start) &&
bpos_le(k->k.p, end)))
keys->data[dst++] = *i;
- else if (i->allocated)
+ else if (i->allocated) {
kfree(i->allocated_k);
+ i->allocated_k = NULL;
+ }
}
keys->nr = keys->gap = dst;
}
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index c0dff992ad60..433d498540fd 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -12,7 +12,6 @@
#include "recovery_passes.h"
#include <linux/kthread.h>
-#include <linux/min_heap.h>
#include <linux/sched/sysctl.h>
#include <linux/sort.h>
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 37554e4514fe..d12c11dc6601 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -2,7 +2,7 @@
#ifndef _BCACHEFS_CLOCK_TYPES_H
#define _BCACHEFS_CLOCK_TYPES_H
-#include "util.h"
+#include "vendor/min_heap.h"
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 62d5d17d681e..6615c868d066 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -55,63 +55,6 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
return true;
}
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
- const struct bch_extent_ptr *start)
-{
- if (!ctxt) {
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr == start)
- break;
-
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
-
- __bkey_for_each_ptr(start, ptrs.end, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bool locked;
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
- if (!locked) {
- bch2_trans_unlock(ctxt->trans);
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- }
- }
- return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
- return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
- }
-
- return true;
-}
-
noinline_for_stack
static void trace_io_move_finish2(struct data_update *u,
struct bkey_i *new,
@@ -538,7 +481,7 @@ void bch2_data_update_exit(struct data_update *update)
update->bvecs = NULL;
if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
+ bch2_bkey_nocow_unlock(c, k, 0);
bkey_put_dev_refs(c, k);
bch2_disk_reservation_put(c, &update->op.res);
bch2_bkey_buf_exit(&update->k, c);
@@ -1018,10 +961,19 @@ int bch2_data_update_init(struct btree_trans *trans,
goto out;
}
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, ptrs)) {
- ret = bch_err_throw(c, nocow_lock_blocked);
- goto out_put_dev_refs;
+ if (c->opts.nocow_enabled) {
+ if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) {
+ bool locked = false;
+ if (ctxt)
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bkey_nocow_trylock(c, ptrs, 0)) ||
+ list_empty(&ctxt->ios));
+ if (!locked) {
+ if (ctxt)
+ bch2_trans_unlock(ctxt->trans);
+ bch2_bkey_nocow_lock(c, ptrs, 0);
+ }
+ }
}
if (unwritten) {
@@ -1039,8 +991,7 @@ int bch2_data_update_init(struct btree_trans *trans,
return 0;
out_nocow_unlock:
if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
-out_put_dev_refs:
+ bch2_bkey_nocow_unlock(c, k, 0);
bkey_put_dev_refs(c, k);
out:
bch2_disk_reservation_put(c, &m->op.res);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index a99f821c6a1c..9fd32ca15d19 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -814,6 +814,8 @@ int bch2_accounting_read(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
struct journal_key *jk = keys->data;
+ move_gap(keys, keys->nr);
+
while (jk < &darray_top(*keys) &&
__journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
jk++;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index cbf1eedddad7..2712a9754100 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -228,6 +228,7 @@
x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \
x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \
x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \
+ x(EMLINK, too_many_links) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EOPNOTSUPP, no_casefolding_without_utf8) \
x(EOPNOTSUPP, casefolding_disabled) \
@@ -367,7 +368,10 @@
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs) \
- x(0, remove_disk_accounting_entry)
+ x(0, remove_disk_accounting_entry) \
+ x(0, nocow_trylock_fail) \
+ x(BCH_ERR_nocow_trylock_fail, nocow_trylock_contended) \
+ x(BCH_ERR_nocow_trylock_fail, nocow_trylock_bucket_full) \
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f1849eb8327d..414adf86efdc 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -541,11 +541,9 @@ __bch2_create(struct mnt_idmap *idmap,
* preallocate acls + vfs inode before btree transaction, so that
* nothing can fail after the transaction succeeds:
*/
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
if (ret)
return ERR_PTR(ret);
-#endif
inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
@@ -1751,10 +1749,8 @@ static const struct inode_operations bch_file_inode_operations = {
.setattr = bch2_setattr,
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1773,10 +1769,8 @@ static const struct inode_operations bch_dir_inode_operations = {
.setattr = bch2_setattr,
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1797,10 +1791,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
.getattr = bch2_getattr,
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1809,10 +1801,8 @@ static const struct inode_operations bch_special_inode_operations = {
.getattr = bch2_getattr,
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -2546,10 +2536,8 @@ got_sb:
c->dev = sb->s_dev;
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
if (c->opts.acl)
sb->s_flags |= SB_POSIXACL;
-#endif
sb->s_shrink->seeks = 0;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 543627fb58be..fda4ca783848 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1184,8 +1184,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
if (bi->bi_flags & BCH_INODE_unlinked)
bi->bi_flags &= ~BCH_INODE_unlinked;
else {
- if (bi->bi_nlink == U32_MAX)
- return -EINVAL;
+ if (bi->bi_nlink == BCH_LINK_MAX - nlink_bias(bi->bi_mode))
+ return -BCH_ERR_too_many_links;
bi->bi_nlink++;
}
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 6a5da02ce266..ed4a9440143a 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -33,7 +33,6 @@
#include <linux/blkdev.h>
#include <linux/moduleparam.h>
-#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
@@ -1323,11 +1322,25 @@ static CLOSURE_CALLBACK(bch2_nocow_write_done)
bch2_write_done(cl);
}
-struct bucket_to_lock {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
-};
+static bool bkey_get_dev_iorefs(struct bch_fs *c, struct bkey_ptrs_c ptrs)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+ BCH_DEV_WRITE_REF_io_write);
+ if (unlikely(!ca)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ enumerated_ref_put(&bch2_dev_have_ref(c, ptr2->dev)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
+ }
+
+ return false;
+ }
+ }
+
+ return true;
+}
static void bch2_nocow_write(struct bch_write_op *op)
{
@@ -1335,15 +1348,14 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ struct bkey_ptrs_c ptrs;
u32 snapshot;
- struct bucket_to_lock *stale_at;
+ const struct bch_extent_ptr *stale_at;
int stale, ret;
if (op->flags & BCH_WRITE_move)
return;
- darray_init(&buckets);
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
@@ -1358,8 +1370,6 @@ retry:
while (1) {
struct bio *bio = &op->wbio.bio;
- buckets.nr = 0;
-
ret = bch2_trans_relock(trans);
if (ret)
break;
@@ -1381,50 +1391,42 @@ retry:
break;
/* Get iorefs before dropping btree locks: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
- BCH_DEV_WRITE_REF_io_write);
- if (unlikely(!ca))
- goto err_get_ioref;
-
- struct bpos b = PTR_BUCKET_POS(ca, ptr);
- struct nocow_lock_bucket *l =
- bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
- prefetch(l);
-
- /* XXX allocating memory with btree locks held - rare */
- darray_push_gfp(&buckets, ((struct bucket_to_lock) {
- .b = b, .gen = ptr->gen, .l = l,
- }), GFP_KERNEL|__GFP_NOFAIL);
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_convert_unwritten;
- }
+ ptrs = bch2_bkey_ptrs_c(k);
+ if (!bkey_get_dev_iorefs(c, ptrs))
+ goto out;
/* Unlock before taking nocow locks, doing IO: */
bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(trans);
+ k = bkey_i_to_s_c(op->insert_keys.top);
+ ptrs = bch2_bkey_ptrs_c(k);
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_convert_unwritten)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+ bch2_trans_unlock(trans);
- darray_for_each(buckets, i) {
- struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
+ bch2_bkey_nocow_lock(c, ptrs, BUCKET_NOCOW_LOCK_UPDATE);
- __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
- bucket_to_u64(i->b),
- BUCKET_NOCOW_LOCK_UPDATE);
+ /*
+ * This could be handled better: If we're able to trylock the
+ * nocow locks with btree locks held we know dirty pointers
+ * can't be stale
+ */
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- int gen = bucket_gen_get(ca, i->b.offset);
- stale = gen < 0 ? gen : gen_after(gen, i->gen);
+ int gen = bucket_gen_get(ca, PTR_BUCKET_NR(ca, ptr));
+ stale = gen < 0 ? gen : gen_after(gen, ptr->gen);
if (unlikely(stale)) {
- stale_at = i;
+ stale_at = ptr;
goto err_bucket_stale;
}
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_convert_unwritten;
}
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_convert_unwritten)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
bio = &op->wbio.bio;
if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
bio = bio_split(bio, k.k->p.offset - op->pos.offset,
@@ -1458,8 +1460,6 @@ err:
goto retry;
bch2_trans_put(trans);
- darray_exit(&buckets);
-
if (ret) {
bch2_write_op_error(op, op->pos.offset,
"%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
@@ -1484,24 +1484,11 @@ err:
continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
}
return;
-err_get_ioref:
- darray_for_each(buckets, i)
- enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- /* Fall back to COW path: */
- goto out;
err_bucket_stale:
- darray_for_each(buckets, i) {
- bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
- if (i == stale_at)
- break;
- }
-
CLASS(printbuf, buf)();
if (bch2_fs_inconsistent_on(stale < 0, c,
- "pointer to invalid bucket in nocow path on device %llu\n %s",
- stale_at->b.inode,
+ "pointer to invalid bucket in nocow path on device %u\n %s",
+ stale_at->dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch_err_throw(c, data_write_invalid_ptr);
} else {
@@ -1509,7 +1496,13 @@ err_bucket_stale:
ret = bch_err_throw(c, transaction_restart);
}
- goto err_get_ioref;
+ bch2_bkey_nocow_unlock(c, k, BUCKET_NOCOW_LOCK_UPDATE);
+ bkey_for_each_ptr(ptrs, ptr)
+ enumerated_ref_put(&bch2_dev_have_ref(c, ptr->dev)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
+
+ /* Fall back to COW path: */
+ goto out;
}
static void __bch2_write(struct bch_write_op *op)
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ae747c87fcf9..f7b0fdd99c75 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -766,6 +766,9 @@ static int bch2_journal_reclaim_thread(void *arg)
set_freezable();
+ kthread_wait_freezable(test_bit(BCH_FS_rw, &c->flags) ||
+ kthread_should_stop());
+
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
@@ -826,8 +829,10 @@ int bch2_journal_reclaim_start(struct journal *j)
struct task_struct *p;
int ret;
- if (j->reclaim_thread)
+ if (j->reclaim_thread) {
+ wake_up_process(j->reclaim_thread);
return 0;
+ }
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 71b17f18e90c..c8907070796f 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -6,6 +6,16 @@
#include "nocow_locking.h"
#include "util.h"
+#include <linux/prefetch.h>
+
+static bool nocow_bucket_empty(struct nocow_lock_bucket *l)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (atomic_read(&l->l[i]))
+ return false;
+ return true;
+}
+
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
{
u64 dev_bucket = bucket_to_u64(bucket);
@@ -20,14 +30,12 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos
#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, u64 dev_bucket, int flags)
{
- u64 dev_bucket = bucket_to_u64(bucket);
struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
int lock_val = flags ? 1 : -1;
- unsigned i;
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
if (l->b[i] == dev_bucket) {
int v = atomic_sub_return(lock_val, &l->l[i]);
@@ -40,8 +48,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
BUG();
}
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
+static int __bch2_bucket_nocow_trylock(struct bch_fs *c, struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
{
int v, lock_val = flags ? 1 : -1;
unsigned i;
@@ -58,32 +66,128 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
goto take_lock;
}
- return false;
+ return bch_err_throw(c, nocow_trylock_bucket_full);
got_entry:
v = atomic_read(&l->l[i]);
if (lock_val > 0 ? v < 0 : v > 0)
- return false;
+ return bch_err_throw(c, nocow_trylock_contended);
take_lock:
v = atomic_read(&l->l[i]);
/* Overflow? */
if (v && sign(v + lock_val) != sign(v))
- return false;
+ return bch_err_throw(c, nocow_trylock_contended);
atomic_add(lock_val, &l->l[i]);
+ return 0;
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bch_fs *c, struct bpos bucket, int flags)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return !__bch2_bucket_nocow_trylock(c, l, dev_bucket, flags);
+}
+
+void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+ }
+}
+
+bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (unlikely(!bch2_bucket_nocow_trylock(c, bucket, flags))) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr2->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr2);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+ }
+ return false;
+ }
+ }
+
return true;
}
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
+struct bucket_to_lock {
+ u64 b;
+ struct nocow_lock_bucket *l;
+};
+
+static inline int bucket_to_lock_cmp(struct bucket_to_lock l,
+ struct bucket_to_lock r)
{
- if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
- struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ return cmp_int(l.l, r.l);
+}
+
+void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+ if (bch2_bkey_nocow_trylock(c, ptrs, flags))
+ return;
+
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ darray_init(&buckets);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr));
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, b);
+ prefetch(l);
+
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .l = l, }),
+ GFP_KERNEL|__GFP_NOFAIL);
+ }
+
+ WARN_ON_ONCE(buckets.nr > NOCOW_LOCK_BUCKET_SIZE);
+
+ bubble_sort(buckets.data, buckets.nr, bucket_to_lock_cmp);
+retake_all:
+ darray_for_each(buckets, i) {
+ int ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags);
+ if (!ret)
+ continue;
+
u64 start_time = local_clock();
- __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ if (ret == -BCH_ERR_nocow_trylock_contended)
+ __closure_wait_event(&i->l->wait,
+ (ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags)) != -BCH_ERR_nocow_trylock_contended);
+ if (!ret) {
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ continue;
+ }
+
+ BUG_ON(ret != -BCH_ERR_nocow_trylock_bucket_full);
+
+ darray_for_each(buckets, i2) {
+ if (i2 == i)
+ break;
+ __bch2_bucket_nocow_unlock(&c->nocow_locks, i2->b, flags);
+ }
+
+ __closure_wait_event(&i->l->wait, nocow_bucket_empty(i->l));
bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ goto retake_all;
}
+
+ darray_exit(&buckets);
}
void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
index 48b8a003c0d2..972c914780f9 100644
--- a/fs/bcachefs/nocow_locking.h
+++ b/fs/bcachefs/nocow_locking.h
@@ -19,29 +19,19 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo
#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
- struct nocow_lock_bucket *, u64, int);
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, u64, int);
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
+static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket,
+ int flags)
{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+ __bch2_bucket_nocow_unlock(t, bucket_to_u64(bucket), flags);
}
+void bch2_bkey_nocow_unlock(struct bch_fs *, struct bkey_s_c, int);
+bool bch2_bkey_nocow_trylock(struct bch_fs *, struct bkey_ptrs_c, int);
+void bch2_bkey_nocow_lock(struct bch_fs *, struct bkey_ptrs_c, int);
+
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
void bch2_fs_nocow_locking_exit(struct bch_fs *);
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
index bd12bf677924..3fed8e957e20 100644
--- a/fs/bcachefs/nocow_locking_types.h
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -5,11 +5,13 @@
#define BUCKET_NOCOW_LOCKS_BITS 10
#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+#define NOCOW_LOCK_BUCKET_SIZE 6
+
struct nocow_lock_bucket {
struct closure_waitlist wait;
spinlock_t lock;
- u64 b[4];
- atomic_t l[4];
+ u64 b[NOCOW_LOCK_BUCKET_SIZE];
+ atomic_t l[NOCOW_LOCK_BUCKET_SIZE];
} __aligned(SMP_CACHE_BYTES);
struct bucket_nocow_lock_table {
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index bd5faafc9aa7..b9bac9fbbafb 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -649,10 +649,6 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
if (!(bch2_opt_table[id].flags & OPT_MOUNT))
return -BCH_ERR_option_name;
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- return -BCH_ERR_option_name;
-
if ((id == Opt_usrquota ||
id == Opt_grpquota) &&
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
index b7eff904acdb..898caf943b34 100644
--- a/fs/bcachefs/replicas_format.h
+++ b/fs/bcachefs/replicas_format.h
@@ -17,7 +17,8 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[] __counted_by(nr_devs);
+ /* No counted_by: bch_replicas_cpu entries are all the size of the biggest entry */
+ __u8 devs[];
} __packed;
struct bch_sb_field_replicas {
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 77e3fc92e39b..fbdb7b7df23e 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -74,6 +74,7 @@ enum bch_fsck_flags {
x(btree_root_bad_min_key, 60, 0) \
x(btree_root_bad_max_key, 61, 0) \
x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_gap_between_nodes, 328, FSCK_AUTOFIX) \
x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \
@@ -339,7 +340,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 328, 0)
+ x(MAX, 329, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 5cd308a68035..3038b5e6c1f0 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -564,15 +564,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* successfully marked the filesystem dirty
*/
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
enumerated_ref_start(&c->writes);
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting journal reclaim thread");
+ goto err;
+ }
+
ret = bch2_copygc_start(c);
if (ret) {
bch_err_msg(c, ret, "error starting copygc thread");
@@ -852,7 +854,8 @@ int bch2_fs_init_rw(struct bch_fs *c)
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_io_write_init(c) ?:
- bch2_fs_journal_init(&c->journal);
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_journal_reclaim_start(&c->journal);
if (ret)
return ret;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 555e0d8f3cf0..eb4222f0f34b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -7,7 +7,6 @@
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
-#include <linux/min_heap.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
diff --git a/fs/bcachefs/vendor/min_heap.c b/fs/bcachefs/vendor/min_heap.c
new file mode 100644
index 000000000000..21f744549d66
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "min_heap.h"
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size)
+{
+ __min_heap_init_inline(heap, data, size);
+}
+
+void *__bch2_min_heap_peek(struct min_heap_char *heap)
+{
+ return __min_heap_peek_inline(heap);
+}
+
+bool __bch2_min_heap_full(min_heap_char *heap)
+{
+ return __min_heap_full_inline(heap);
+}
+
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_sift_down_inline(heap, pos, elem_size, func, args);
+}
+
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+}
+
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heapify_all_inline(heap, elem_size, func, args);
+}
+
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_pop_inline(heap, elem_size, func, args);
+}
+
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_pop_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_del_inline(heap, elem_size, idx, func, args);
+}
diff --git a/fs/bcachefs/vendor/min_heap.h b/fs/bcachefs/vendor/min_heap.h
new file mode 100644
index 000000000000..865c8b33b11f
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIN_HEAP_H
+#define _LINUX_MIN_HEAP_H
+
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/*
+ * The Min Heap API provides utilities for managing min-heaps, a binary tree
+ * structure where each node's value is less than or equal to its children's
+ * values, ensuring the smallest element is at the root.
+ *
+ * Users should avoid directly calling functions prefixed with __min_heap_*().
+ * Instead, use the provided macro wrappers.
+ *
+ * For further details and examples, refer to Documentation/core-api/min_heap.rst.
+ */
+
+/**
+ * Data structure to hold a min-heap.
+ * @nr: Number of elements currently in the heap.
+ * @size: Maximum number of elements that can be held in current storage.
+ * @data: Pointer to the start of array holding the heap elements.
+ * @preallocated: Start of the static preallocated array holding the heap elements.
+ */
+#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \
+struct _name { \
+ size_t nr; \
+ size_t size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)
+
+typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;
+
+#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *)
+#define __minheap_obj_size(_heap) sizeof((_heap)->data[0])
+
+/**
+ * struct min_heap_callbacks - Data/functions to customise the min_heap.
+ * @less: Partial order function for this heap.
+ * @swp: Swap elements function.
+ */
+struct min_heap_callbacks {
+ bool (*less)(const void *lhs, const void *rhs, void *args);
+ void (*swp)(void *lhs, void *rhs, void *args);
+};
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+ if (is_aligned(base, size, 8))
+ return SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ return SWAP_WORDS_32;
+ else
+ return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+ void *priv)
+{
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought. Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2. But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit. That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+ i -= size;
+ i -= size & -(i & lsbit);
+ return i / 2;
+}
+
+/* Initialize a min-heap. */
+static __always_inline
+void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size)
+{
+ heap->nr = 0;
+ heap->size = size;
+ if (data)
+ heap->data = data;
+ else
+ heap->data = heap->preallocated;
+}
+
+#define min_heap_init_inline(_heap, _data, _size) \
+ __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+
+/* Get the minimum element from the heap. */
+static __always_inline
+void *__min_heap_peek_inline(struct min_heap_char *heap)
+{
+ return heap->nr ? heap->data : NULL;
+}
+
+#define min_heap_peek_inline(_heap) \
+ (__minheap_cast(_heap) \
+ __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr)))
+
+/* Check if the heap is full. */
+static __always_inline
+bool __min_heap_full_inline(min_heap_char *heap)
+{
+ return heap->nr == heap->size;
+}
+
+#define min_heap_full_inline(_heap) \
+ __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr))
+
+/* Sift the element at pos down the heap. */
+static __always_inline
+void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ const unsigned long lsbit = elem_size & -elem_size;
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+ /* pre-scale counters for performance */
+ size_t a = pos * elem_size;
+ size_t b, c, d;
+ size_t n = heap->nr * elem_size;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
+ b = func->less(data + c, data + d, args) ? c : d;
+
+ /* Special case for the last leaf with no sibling. */
+ if (d == n)
+ b = c;
+
+ /* Backtrack to the correct location. */
+ while (b != a && func->less(data + a, data + b, args))
+ b = parent(b, lsbit, elem_size);
+
+ /* Shift the element into its correct place. */
+ c = b;
+ while (b != a) {
+ b = parent(b, lsbit, elem_size);
+ do_swap(data + b, data + c, elem_size, swp, args);
+ }
+}
+
+#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \
+ __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Sift up ith element from the heap, O(log2(nr)). */
+static __always_inline
+void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ const unsigned long lsbit = elem_size & -elem_size;
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+ /* pre-scale counters for performance */
+ size_t a = idx * elem_size, b;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ while (a) {
+ b = parent(a, lsbit, elem_size);
+ if (func->less(data + b, data + a, args))
+ break;
+ do_swap(data + a, data + b, elem_size, swp, args);
+ a = b;
+ }
+}
+
+#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \
+ __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+/* Floyd's approach to heapification that is O(nr). */
+static __always_inline
+void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ ssize_t i;
+
+ for (i = heap->nr / 2 - 1; i >= 0; i--)
+ __min_heap_sift_down_inline(heap, i, elem_size, func, args);
+}
+
+#define min_heapify_all_inline(_heap, _func, _args) \
+ __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Remove minimum element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ memcpy(data, data + (heap->nr * elem_size), elem_size);
+ __min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_pop_inline(_heap, _func, _args) \
+ __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+
+/*
+ * Remove the minimum element and then push the given element. The
+ * implementation performs 1 sift (O(log2(nr))) and is therefore more
+ * efficient than a pop followed by a push that does 2.
+ */
+static __always_inline
+void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ memcpy(heap->data, element, elem_size);
+ __min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+}
+
+#define min_heap_pop_push_inline(_heap, _element, _func, _args) \
+ __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Push an element on to the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ size_t pos;
+
+ if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
+ return false;
+
+ /* Place at the end of data. */
+ pos = heap->nr;
+ memcpy(data + (pos * elem_size), element, elem_size);
+ heap->nr++;
+
+ /* Sift child at pos up. */
+ __min_heap_sift_up_inline(heap, elem_size, pos, func, args);
+
+ return true;
+}
+
+#define min_heap_push_inline(_heap, _element, _func, _args) \
+ __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Remove ith element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ if (idx == heap->nr)
+ return true;
+ do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
+ __min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+ __min_heap_sift_down_inline(heap, idx, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_del_inline(_heap, _idx, _func, _args) \
+ __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size);
+void *__bch2_min_heap_peek(struct min_heap_char *heap);
+bool __bch2_min_heap_full(min_heap_char *heap);
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args);
+
+#define min_heap_init(_heap, _data, _size) \
+ __bch2_min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+#define min_heap_peek(_heap) \
+ (__minheap_cast(_heap) __bch2_min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr)))
+#define min_heap_full(_heap) \
+ __bch2_min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr))
+#define min_heap_sift_down(_heap, _pos, _func, _args) \
+ __bch2_min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_sift_up(_heap, _idx, _func, _args) \
+ __bch2_min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+#define min_heapify_all(_heap, _func, _args) \
+ __bch2_min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop(_heap, _func, _args) \
+ __bch2_min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_push(_heap, _element, _func, _args) \
+ __bch2_min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_push(_heap, _element, _func, _args) \
+ __bch2_min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_del(_heap, _idx, _func, _args) \
+ __bch2_min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+#endif /* _LINUX_MIN_HEAP_H */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9bf282d2453c..499a9edf0ca3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b99fb0273292..0387b9f43a52 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -248,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d09d622016ef..35e3071cec06 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1616,25 +1616,29 @@ out:
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level. Negative level
- * numbers are allowed.
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1b38e707bbd9..7b41b2b5ff44 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -102,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
struct folio *btrfs_alloc_compr_folio(void);
void btrfs_free_compr_folio(struct folio *folio);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..c0c1ddd46b67 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
@@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c953297aa89a..b21cb72835cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -111,6 +111,24 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
@@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -2583,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2591,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e4aec7330cb..18db1053cdf0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -5696,7 +5695,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
@@ -6805,7 +6814,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct fscrypt_name fname;
u64 index;
int ret;
- int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6837,44 +6845,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
+ /* Link added now we update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+ /* Grab reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
return ret;
}
@@ -7830,6 +7838,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ccaa9a3cf1ce..da102da169fd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 3871c3a6c743..9f1858b42c0e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
+ if (IS_ERR(extent_root)) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a262b494a89f..b06b8f325537 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -276,6 +276,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
const struct fs_parameter *param, int opt)
{
const char *string = param->string;
+ int ret;
/*
* Provide the same semantics as older kernels that don't use fs
@@ -294,21 +295,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zlib", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zstd", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
@@ -319,10 +329,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
} else {
- btrfs_err(NULL, "unrecognized compression value %s", string);
- return -EINVAL;
+ ret = -EINVAL;
+ goto error;
}
return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
+
}
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -1079,7 +1093,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 0f556f4de3f9..a997c7cc35a2 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 69e11557fd13..7a63afedd01e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
@@ -3340,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up not logging new names and removing old
+ * names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3357,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+ * teared into two 32 bits updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3374,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3431,8 +3471,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* Set logged_trans to a value greater than 0 and less then the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
@@ -3440,20 +3479,9 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* the current transacion's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
@@ -4045,7 +4073,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4061,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa7a929a0461..c6e3efd6f602 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index ea662036f441..efc2a81f50e5 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2582,9 +2582,9 @@ again:
spin_lock(&space_info->lock);
space_info->total_bytes -= bg->length;
space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
/* There is no allocation ever happened. */
ASSERT(bg->used == 0);
- ASSERT(bg->zone_unusable == 0);
/* No super block in a block group on the zoned setup. */
ASSERT(bg->bytes_super == 0);
spin_unlock(&space_info->lock);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b202d789e93..322ed268f14a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1687,6 +1689,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1695,8 +1698,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8478e7e75df6..32973c62c1a2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..978acd3d4b32 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..f67025465de0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1583,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1602,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1618,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1630,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1649,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1664,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1794,6 +1848,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -2487,22 +2544,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f497c39ff82..3bc72b47fe4d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2810,16 +2809,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4744,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4768,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4779,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
diff --git a/fs/coredump.c b/fs/coredump.c
index 5dce257c67fc..60bc9685e149 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
+ if (write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
error = proc_dostring(table, write, buffer, lenp, ppos);
if (error)
return error;
+
if (!check_coredump_socket()) {
strscpy(core_pattern, old_core_pattern, retval + 1);
return -EINVAL;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index c4a139911356..4bb4002e3cdf 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -152,6 +152,10 @@ static int efivarfs_d_compare(const struct dentry *dentry,
{
int guid = len - EFI_VARIABLE_GUID_LEN;
+ /* Parallel lookups may produce a temporary invalid filename */
+ if (guid <= 0)
+ return 1;
+
if (name->len != len)
return 1;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 377ee12b8b96..3d5738f80072 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,10 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ccc5f0ee8df..9319c66e86c3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1b529ace4db0..db13b40a78e0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -1018,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index eaa9efd766ee..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
(void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a93efd95c555..798223e6da9c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
.map = map,
.in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
@@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
@@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -726,6 +711,30 @@ out_unlock:
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..e861a4b7ffda 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && !write)
validate_coredump_safety();
return error;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 68a7d2861c58..a907ddfac4d5 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -208,6 +208,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
return 1;
/*
+ * Verify that the decoded dentry itself has a valid id mapping.
+ * In case the decoded dentry is the mountfd root itself, this
+ * verifies that the mountfd inode itself has a valid id mapping.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+ return 0;
+
+ /*
* It's racy as we're not taking rename_lock but we're able to ignore
* permissions and we just need an approximation whether we were able
* to follow a path to the file.
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..5150aa25e64b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..5c569c3cb53f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = inode->i_sb->s_blocksize_bits;
+ blkbits = fc->blkbits;
stat->blksize = 1 << blkbits;
}
@@ -1377,6 +1377,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5525a4520b0f..4adcf09d4b01 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = len,
+ .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
.flags = flags
};
struct fuse_write_out outarg;
@@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
+ if (!err && outarg.size > len)
+ err = -EIO;
+
if (err)
goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ec248d13c8bf..cc428d04be3e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -210,6 +210,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -969,6 +975,14 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+ /*
+ * This is a workaround until fuse uses iomap for reads.
+ * For fuseblk servers, this represents the blocksize passed in at
+ * mount time and for regular fuse servers, this is equivalent to
+ * inode->i_blkbits.
+ */
+ u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 67c2318bfc42..7ddfd2b3cc9c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
+ else
+ fi->cached_i_blkbits = fc->blkbits;
+
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
* to check permissions. This prevents failures due to the
@@ -1805,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
+ /*
+ * This is a workaround until fuse hooks into iomap for reads.
+ * Use PAGE_SIZE for the blocksize else if the writeback cache
+ * is enabled, buffered writes go through iomap and a read may
+ * overwrite partially written data if blocksize < PAGE_SIZE
+ */
+ fc->blkbits = sb->s_blocksize_bits;
+ if (ctx->blksize != PAGE_SIZE &&
+ !sb_set_blocksize(sb, PAGE_SIZE))
+ goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..eb97ac009e75 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (!file)
goto out;
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index c826e7ca49f5..76c8fd0bfc75 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = fs->window_phys_addr + offset;
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a6c692cac616..9adf36e6364b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
!list_empty(&of->list));
}
+/* Get active reference to kernfs node for an open file */
+static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
+{
+ /* Skip if file was already released */
+ if (unlikely(of->released))
+ return NULL;
+
+ if (!kernfs_get_active(of->kn))
+ return NULL;
+
+ return of;
+}
+
+static void kernfs_put_active_of(struct kernfs_open_file *of)
+{
+ return kernfs_put_active(of->kn);
+}
+
/**
* kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
if (ops->seq_stop)
ops->seq_stop(sf, v);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len < 0)
@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len > 0)
@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
if (!of->vm_ops)
return;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = 0;
@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
else
file_update_time(file);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!of->vm_ops)
return -EINVAL;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&of->mutex);
rc = -ENODEV;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
goto out_unlock;
ops = kernfs_ops(of->kn);
@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
}
vma->vm_ops = &kernfs_vm_ops;
out_put:
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
out_unlock:
mutex_unlock(&of->mutex);
@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
- if (!kernfs_get_active(kn))
+ if (!kernfs_get_active_of(of))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
else
ret = kernfs_generic_poll(of, wait);
- kernfs_put_active(kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
return -ENODEV;
}
@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
else
ret = generic_file_llseek(file, offset, whence);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
return ret;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index ae6d1312b184..51f77c65c0c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2455,7 +2455,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
return ERR_PTR(-EINVAL);
}
- if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (__has_locked_children(old_mnt, path->dentry))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8fb4a950dd55..4e3dcc157a83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->xattr_support)
server->caps |= NFS_CAP_XATTR;
+ else
+ server->caps &= ~NFS_CAP_XATTR;
#endif
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86e36c630f09..8059ece82468 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/gfp.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/compaction.h>
@@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL_GPL(nfs_file_fsync);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to)
+{
+ struct folio *folio;
+
+ if (from >= to)
+ return;
+
+ folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+
+ if (folio_test_uptodate(folio)) {
+ loff_t fpos = folio_pos(folio);
+ size_t offset = from - fpos;
+ size_t end = folio_size(folio);
+
+ if (to - fpos < end)
+ end = to - fpos;
+ folio_zero_segment(folio, offset, end);
+ trace_nfs_size_truncate_folio(mapping->host, to);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(nfs_truncate_last_folio);
+
/*
* Decide whether a read/modify/write cycle may be more efficient
* then a modify/write/read cycle when writing to a page in the
@@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
fgp |= fgf_set_order(len);
start:
@@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
- if (offset != 0 || length < folio_size(folio))
- return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(inode, folio);
+ if (offset != 0 || length < folio_size(folio))
+ nfs_wb_folio(inode, folio);
+ else
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8dc921d83538..9edb5f9b0c4e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
struct pnfs_layout_segment *l2)
{
const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
- const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
u32 i;
if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
@@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ // reinitialize the error state in case if this is the last iteration
+ ds = ERR_PTR(-EINVAL);
continue;
+ }
*best_idx = idx;
break;
@@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
- if (ds)
+ if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
@@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
- if (ds || !pgio->pg_mirror_idx)
+ if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}
@@ -868,7 +871,7 @@ retry:
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ struct nfs4_pnfs_ds *ds;
- if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
- ff_layout_send_layouterror(hdr->lseg);
- else
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ else
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr, new_idx);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 338ef77ae423..49df9debb1a6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
+ loff_t oldsize = i_size_read(inode);
int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- if (attr->ia_size == i_size_read(inode))
+ if (attr->ia_size == oldsize)
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ nfs_file_block_o_direct(NFS_I(inode));
nfs_sync_inode(inode);
+ }
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
@@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
- if (error == 0)
+ if (error == 0) {
+ if (attr->ia_valid & ATTR_SIZE)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ attr->ia_size);
error = nfs_refresh_inode(inode, fattr);
+ }
nfs_free_fattr(fattr);
out:
trace_nfs_setattr_exit(inode, error);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74d712b58423..c0a44f389f8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
}
+/* Must be called with exclusively locked inode->i_rwsem */
+static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(&nfsi->vfs_inode);
+ }
+}
+
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 3388faf2acb9..d275b0a250bf 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -14,15 +14,6 @@
#include "internal.h"
-/* Call with exclusively locked inode->i_rwsem */
-static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
-{
- if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
- clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
- inode_dio_wait(inode);
- }
-}
-
/**
* nfs_start_io_read - declare the file is being used for buffered reads
* @inode: file inode
@@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (err)
return err;
- nfs_block_o_direct(nfsi, inode);
+ nfs_file_block_o_direct(nfsi);
downgrade_write(&inode->i_rwsem);
return 0;
@@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (!err)
- nfs_block_o_direct(NFS_I(inode), inode);
+ nfs_file_block_o_direct(NFS_I(inode));
return err;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index bd5fca285899..97abf62f109d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp)
return;
}
- if (nfs_client_is_local(clp)) {
- /* If already enabled, disable and re-enable */
- nfs_localio_disable_client(clp);
- }
+ if (nfs_client_is_local(clp))
+ return;
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
@@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
case -ENOMEM:
case -ENXIO:
case -ENOENT:
- /* Revalidate localio, will disable if unsupported */
+ /* Revalidate localio */
+ nfs_localio_disable_client(clp);
nfs_local_probe(clp);
}
}
@@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+
+ revert_creds(save_cred);
+
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
}
static int
@@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
nfs_local_vfs_getattr(iocb);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
- current->flags = old_flags;
}
static int
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 01c01f45358b..6a0b5871ba3b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
exception.inode = inode;
exception.state = lock->open_context->state;
+ nfs_file_block_o_direct(NFS_I(inode));
err = nfs_sync_inode(inode);
if (err)
goto out;
@@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
@@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == -EOPNOTSUPP)
+
+ if (err == 0)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
+ else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
NFS_CAP_ZERO_RANGE);
@@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
@@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == 0)
+ if (err == 0) {
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
truncate_pagecache_range(inode, offset, (offset + len) -1);
- if (err == -EOPNOTSUPP)
+ } else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
@@ -354,22 +363,27 @@ out:
/**
* nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
- * @inode: pointer to destination inode
+ * @file: pointer to destination file
* @pos: destination offset
* @len: copy length
+ * @oldsize: length of the file prior to clone/copy
*
* Punch a hole in the inode page cache, so that the NFS client will
* know to retrieve new data.
* Update the file size if necessary, and then mark the inode as having
* invalid cached values for change attribute, ctime, mtime and space used.
*/
-static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len,
+ loff_t oldsize)
{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+ nfs_truncate_last_folio(mapping, oldsize, pos);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT));
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
@@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
struct nfs_server *src_server = NFS_SERVER(src_inode);
loff_t pos_src = args->src_pos;
loff_t pos_dst = args->dst_pos;
+ loff_t oldsize_dst = i_size_read(dst_inode);
size_t count = args->count;
ssize_t status;
@@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
+ nfs_file_block_o_direct(NFS_I(dst_inode));
status = nfs_sync_inode(dst_inode);
if (status)
return status;
@@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst);
nfs_invalidate_atime(src_inode);
status = res->write_res.count;
out:
@@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct nfs42_clone_res res = {
.server = server,
};
+ loff_t oldsize_dst = i_size_read(dst_inode);
int status;
msg->rpc_argp = &args;
@@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
/* a zero-length count means clone to EOF in src */
if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
- nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1d6b5f4230c9..c9a0d1e420c6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
+ nfs_file_block_o_direct(NFS_I(src_inode));
ret = nfs_sync_inode(src_inode);
if (ret)
goto out_unlock;
+ nfs_file_block_o_direct(NFS_I(dst_inode));
ret = nfs_sync_inode(dst_inode);
if (ret)
goto out_unlock;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..ce61253efd45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
- server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
- NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->caps &=
+ ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
+ NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
+ NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
server->fattr_valid = NFS_ATTR_FATTR_V4;
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 96b1323318c2..627115179795 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class,
TP_ARGS(inode, new_size))
DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8b7c04737967..647c53d1418a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
}
/*
- * nfs_page_group_search_locked
- * @head - head request of page group
- * @page_offset - offset into page
+ * nfs_page_covers_folio
+ * @req: struct nfs_page
*
- * Search page group with head @head to find a request that contains the
- * page offset @page_offset.
- *
- * Returns a pointer to the first matching nfs request, or NULL if no
- * match is found.
- *
- * Must be called with the page group lock held
- */
-static struct nfs_page *
-nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
-{
- struct nfs_page *req;
-
- req = head;
- do {
- if (page_offset >= req->wb_pgbase &&
- page_offset < (req->wb_pgbase + req->wb_bytes))
- return req;
-
- req = req->wb_this_page;
- } while (req != head);
-
- return NULL;
-}
-
-/*
- * nfs_page_group_covers_page
- * @head - head request of page group
- *
- * Return true if the page group with head @head covers the whole page,
- * returns false otherwise
+ * Return true if the request covers the whole folio.
+ * Note that the caller should ensure all subrequests have been joined
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
- struct nfs_page *tmp;
- unsigned int pos = 0;
-
- nfs_page_group_lock(req);
- for (;;) {
- tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (!tmp)
- break;
- pos = tmp->wb_pgbase + tmp->wb_bytes;
- }
-
- nfs_page_group_unlock(req);
- return pos >= len;
+ return req->wb_pgbase == 0 && req->wb_bytes == len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -2045,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
* release it */
nfs_inode_remove_request(req);
nfs_unlock_and_release_request(req);
+ folio_cancel_dirty(folio);
}
return ret;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 14868a3dd592..bc52afbfc5c7 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
************************************************************************/
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
@@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
"(1) revision\n\tshow current revision of NILFS file system driver.\n";
static ssize_t nilfs_feature_README_show(struct kobject *kobj,
- struct attribute *attr,
+ struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, features_readme_str);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 78a87a016928..d370cd5cce3f 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
struct completion sg_segments_kobj_unregister;
};
-#define NILFS_COMMON_ATTR_STRUCT(name) \
+#define NILFS_KOBJ_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
struct attribute attr; \
- ssize_t (*show)(struct kobject *, struct attribute *, \
+ ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
char *); \
- ssize_t (*store)(struct kobject *, struct attribute *, \
+ ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
const char *, size_t); \
}
-NILFS_COMMON_ATTR_STRUCT(feature);
+NILFS_KOBJ_ATTR_STRUCT(feature);
#define NILFS_DEV_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 930150ed5db1..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -706,6 +706,8 @@ out:
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
@@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
@@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
@@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
-
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 14bf440ea4df..6c4f78f473fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
+ if (!osb->journal)
+ return;
+
jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76e800e38c8f..176281112273 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
- if (pde->proc_ops->proc_read_iter)
- pde->flags |= PROC_ENTRY_proc_read_iter;
-#ifdef CONFIG_COMPAT
- if (pde->proc_ops->proc_compat_ioctl)
- pde->flags |= PROC_ENTRY_proc_compat_ioctl;
-#endif
- if (pde->proc_ops->proc_lseek)
- pde->flags |= PROC_ENTRY_proc_lseek;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09f..3c39cfacb251 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
if (d->ci_id == domid) {
- rr.ci_id = d->ci_id;
cpu = cpumask_any(&d->hdr.cpu_mask);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci)
continue;
+ rr.ci = ci;
mon_event_read(&rr, r, NULL, rdtgrp,
&ci->shared_cpu_map, evtid, false);
goto checkresult;
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03..9a8cf6f11151 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- unsigned int ci_id;
+ struct cacheinfo *ci;
int err;
u64 val;
void *arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3ac..7326c28a7908 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
int cpu = smp_processor_id();
struct rdt_mon_domain *d;
- struct cacheinfo *ci;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
@@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!ci || ci->id != rr->ci_id)
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
return -EINVAL;
/*
@@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci_id != rr->ci_id)
+ if (d->ci_id != rr->ci->id)
continue;
err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
rr->evtid, &tval, rr->arch_mon_ctx);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index beb4f18f05ef..2337cf795db3 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -304,6 +304,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cfids = tcon->cfids;
+ if (!cfids)
+ continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
seq_printf(m, "Num entries: %d\n", cfids->num_entries);
list_for_each_entry(cfid, &cfids->entries, entry) {
@@ -319,8 +321,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
-
-
}
}
}
@@ -347,6 +347,22 @@ static __always_inline const char *compression_alg_str(__le16 alg)
}
}
+static __always_inline const char *cipher_alg_str(__le16 cipher)
+{
+ switch (cipher) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ return "AES128-CCM";
+ case SMB2_ENCRYPTION_AES128_GCM:
+ return "AES128-GCM";
+ case SMB2_ENCRYPTION_AES256_CCM:
+ return "AES256-CCM";
+ case SMB2_ENCRYPTION_AES256_GCM:
+ return "AES256-GCM";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -539,6 +555,11 @@ skip_rdma:
else
seq_puts(m, "disabled (not supported by this server)");
+ /* Show negotiated encryption cipher, even if not required */
+ seq_puts(m, "\nEncryption: ");
+ if (server->cipher_type)
+ seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type));
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -576,12 +597,8 @@ skip_rdma:
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
- if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) {
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
seq_puts(m, " encrypted");
- /* can help in debugging to show encryption type */
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
- seq_puts(m, "(gcm256)");
- }
if (ses->sign)
seq_puts(m, " signed");
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index 4cc6e0896fad..f8659d36793f 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
int len;
__le16 *dst;
+ if (!src)
+ return NULL;
+
len = cifs_local_to_utf16_bytes(src, maxlen, cp);
len += 2; /* NULL */
dst = kmalloc(len, GFP_KERNEL);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 3bd85ab2deb1..e1848276bab4 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1358,6 +1358,20 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
truncate_setsize(target_inode, new_size);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
+ } else if (rc == -EOPNOTSUPP) {
+ /*
+ * copy_file_range syscall man page indicates EINVAL
+ * is returned e.g when "fd_in and fd_out refer to the
+ * same file and the source and target ranges overlap."
+ * Test generic/157 was what showed these cases where
+ * we need to remap EOPNOTSUPP to EINVAL
+ */
+ if (off >= src_inode->i_size) {
+ rc = -EINVAL;
+ } else if (src_inode == target_inode) {
+ if (off + len > destoff)
+ rc = -EINVAL;
+ }
}
if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
target_cifsi->netfs.zero_point = new_size;
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 1e64a4fb6af0..0fae95cf81c4 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 7
+#define MAX_COMPOUND 10
/*
* Default number of credits to keep available for SMB3.
@@ -1882,9 +1882,12 @@ static inline bool is_replayable_error(int error)
/* cifs_get_writable_file() flags */
-#define FIND_WR_ANY 0
-#define FIND_WR_FSUID_ONLY 1
-#define FIND_WR_WITH_DELETE 2
+enum cifs_writable_file_flags {
+ FIND_WR_ANY = 0U,
+ FIND_WR_FSUID_ONLY = (1U << 0),
+ FIND_WR_WITH_DELETE = (1U << 1),
+ FIND_WR_NO_PENDING_DELETE = (1U << 2),
+};
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
@@ -2343,6 +2346,8 @@ struct smb2_compound_vars {
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
struct smb2_file_rename_info_hdr rename_info;
struct smb2_file_link_info_hdr link_info;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c34c533b2efa..e8fba98690ce 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
-extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
- const char *path);
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
+ struct dentry *dentry);
extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
const char *path);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 186e061068be..cb907e18cc35 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file)
/* Get the cached handle as SMB2 close is deferred */
if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) {
- rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile);
+ rc = cifs_get_writable_path(tcon, full_path,
+ FIND_WR_FSUID_ONLY |
+ FIND_WR_NO_PENDING_DELETE,
+ &cfile);
} else {
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
@@ -2530,6 +2533,9 @@ refind_writable:
continue;
if (with_delete && !(open_file->fid.access & DELETE))
continue;
+ if ((flags & FIND_WR_NO_PENDING_DELETE) &&
+ open_file->status_file_deleted)
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
spin_unlock(&tcon->open_file_lock);
free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
+ if (*ret_file) {
+ spin_lock(&cinode->open_file_lock);
+ if ((*ret_file)->status_file_deleted) {
+ spin_unlock(&cinode->open_file_lock);
+ cifsFileInfo_put(*ret_file);
+ *ret_file = NULL;
+ } else {
+ spin_unlock(&cinode->open_file_lock);
+ }
+ }
return *ret_file ? 0 : -ENOENT;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index fe453a4b3dc8..0f0d2dae6283 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode)
* but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
-int cifs_unlink(struct inode *dir, struct dentry *dentry)
+static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename)
{
int rc = 0;
unsigned int xid;
@@ -1984,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
netfs_wait_for_outstanding_io(inode);
- cifs_close_deferred_file_under_dentry(tcon, full_path);
+ cifs_close_deferred_file_under_dentry(tcon, dentry);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -2003,7 +2003,24 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
+ /* For SMB2+, if the file is open, we always perform a silly rename.
+ *
+ * We check for d_count() right after calling
+ * cifs_close_deferred_file_under_dentry() to make sure that the
+ * dentry's refcount gets dropped in case the file had any deferred
+ * close.
+ */
+ if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) {
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1)
+ sillyrename = true;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ if (sillyrename)
+ rc = -EBUSY;
+ else
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
@@ -2071,6 +2088,11 @@ unlink_out:
return rc;
}
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return __cifs_unlink(dir, dentry, false);
+}
+
static int
cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -2358,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
+ cifsInode = CIFS_I(d_inode(direntry));
+
if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
spin_lock(&d_inode(direntry)->i_lock);
i_size_write(d_inode(direntry), 0);
clear_nlink(d_inode(direntry));
spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -2458,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0)
+ if (rc == 0) {
d_move(from_dentry, to_dentry);
+ /* Force a new lookup */
+ d_drop(from_dentry);
+ }
cifs_put_tlink(tlink);
return rc;
}
@@ -2470,6 +2497,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct dentry *target_dentry, unsigned int flags)
{
const char *from_name, *to_name;
+ struct TCP_Server_Info *server;
void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
@@ -2505,6 +2533,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
page1 = alloc_dentry_path();
page2 = alloc_dentry_path();
@@ -2522,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_deferred_file_under_dentry(tcon, from_name);
+ cifs_close_deferred_file_under_dentry(tcon, source_dentry);
if (d_inode(target_dentry) != NULL) {
netfs_wait_for_outstanding_io(d_inode(target_dentry));
- cifs_close_deferred_file_under_dentry(tcon, to_name);
+ cifs_close_deferred_file_under_dentry(tcon, target_dentry);
}
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
@@ -2591,19 +2620,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
unlink_target:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-
- /* Try unlinking the target dentry if it's not negative */
- if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
- if (d_is_dir(target_dentry))
- tmprc = cifs_rmdir(target_dir, target_dentry);
- else
- tmprc = cifs_unlink(target_dir, target_dentry);
- if (tmprc)
- goto cifs_rename_exit;
- rc = cifs_do_rename(xid, source_dentry, from_name,
- target_dentry, to_name);
- if (!rc)
- rehash = false;
+ if (d_really_is_positive(target_dentry)) {
+ if (!rc) {
+ struct inode *inode = d_inode(target_dentry);
+ /*
+ * Samba and ksmbd servers allow renaming a target
+ * directory that is open, so make sure to update
+ * ->i_nlink and then mark it as delete pending.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, 0);
+ clear_nlink(inode);
+ spin_unlock(&inode->i_lock);
+ set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+ CIFS_I(inode)->time = 0; /* force reval */
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
+ } else if (rc == -EACCES || rc == -EEXIST) {
+ /*
+ * Rename failed, possibly due to a busy target.
+ * Retry it by unliking the target first.
+ */
+ if (d_is_dir(target_dentry)) {
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ } else {
+ tmprc = __cifs_unlink(target_dir, target_dentry,
+ server->vals->protocol_id > SMB10_PROT_ID);
+ }
+ if (tmprc) {
+ /*
+ * Some servers will return STATUS_ACCESS_DENIED
+ * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+ * rename a non-empty directory. Make sure to
+ * propagate the appropriate error back to
+ * userspace.
+ */
+ if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+ rc = tmprc;
+ goto cifs_rename_exit;
+ }
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
+ if (!rc)
+ rehash = false;
+ }
}
/* force revalidate to go get info when needed */
@@ -2629,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct cached_fid *cfid = NULL;
+ if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+ return false;
if (cifs_i->time == 0)
return true;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index da23cc12a52c..dda6dece802a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
kfree(tmp_list);
}
}
-void
-cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry)
{
- struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- void *page;
- const char *full_path;
+ struct cifsFileInfo *cfile;
LIST_HEAD(file_head);
- page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
- full_path = build_path_from_dentry(cfile->dentry, page);
- if (strstr(full_path, path)) {
- if (delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- cifs_del_deferred_close(cfile);
- spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+ if ((cfile->dentry == dentry) &&
+ delayed_work_pending(&cfile->deferred) &&
+ cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+ cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
- tmp_list->cfile = cfile;
- list_add_tail(&tmp_list->list, &file_head);
- }
- }
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ break;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
}
}
spin_unlock(&tcon->open_file_lock);
@@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
list_del(&tmp_list->list);
kfree(tmp_list);
}
- free_dentry_path(page);
}
/*
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 7869cec58f52..10c84c095fe7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
}
/*
- * For absolute symlinks it is not possible to determinate
+ * For absolute symlinks it is not possible to determine
* if it should point to directory or file.
*/
if (symname[0] == '/') {
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 893a1ea8c000..a02d41d1ce4a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
rc = -EOPNOTSUPP;
}
- /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ /* Fallback to SMB_COM_SETATTR command when absolutely needed. */
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
rc = SMBSetInformation(xid, tcon, full_path,
@@ -1039,7 +1039,7 @@ set_via_filehandle:
cifsFileInfo_put(open_file);
/*
- * Setting the read-only bit is not honered on non-NT servers when done
+ * Setting the read-only bit is not honored on non-NT servers when done
* via open-semantics. So for setting it, use SMB_COM_SETATTR command.
* This command works only after the file is closed, so use it only when
* operation was called without the filehandle.
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 224495322a05..e56e4d402f13 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -30,10 +30,9 @@ enum smb2_compound_ops {
SMB2_OP_QUERY_DIR,
SMB2_OP_MKDIR,
SMB2_OP_RENAME,
- SMB2_OP_DELETE,
SMB2_OP_HARDLINK,
SMB2_OP_SET_EOF,
- SMB2_OP_RMDIR,
+ SMB2_OP_UNLINK,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 2a0316c514e4..7cadc8ca4f55 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -207,8 +207,10 @@ replay_again:
server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
- if (vars == NULL)
- return -ENOMEM;
+ if (vars == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
@@ -344,9 +346,6 @@ replay_again:
trace_smb3_posix_query_info_compound_enter(xid, tcon->tid,
ses->Suid, full_path);
break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path);
- break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
@@ -354,23 +353,40 @@ replay_again:
*/
trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path);
break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ case SMB2_OP_UNLINK:
+ rqst[num_rqst].rq_iov = vars->unlink_iov;
rqst[num_rqst].rq_nvec = 1;
size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
data[0] = &delete_pending[0];
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path);
+ }
+ num_rqst++;
+ trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path);
break;
case SMB2_OP_SET_EOF:
rqst[num_rqst].rq_iov = &vars->si_iov[0];
@@ -440,7 +456,7 @@ replay_again:
ses->Suid, full_path);
break;
case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_iov = vars->rename_iov;
rqst[num_rqst].rq_nvec = 2;
len = in_iov[i].iov_len;
@@ -730,19 +746,6 @@ finished:
trace_smb3_posix_query_info_compound_done(xid, tcon->tid,
ses->Suid);
break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc);
- else {
- /*
- * If dentry (hence, inode) is NULL, lease break is going to
- * take care of degrading leases on handles for deleted files.
- */
- if (inode)
- cifs_mark_open_handles_for_deleted_file(inode, full_path);
- trace_smb3_delete_done(xid, tcon->tid, ses->Suid);
- }
- break;
case SMB2_OP_MKDIR:
if (rc)
trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc);
@@ -763,11 +766,11 @@ finished:
trace_smb3_rename_done(xid, tcon->tid, ses->Suid);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc);
+ case SMB2_OP_UNLINK:
+ if (!rc)
+ trace_smb3_unlink_done(xid, tcon->tid, ses->Suid);
else
- trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid);
+ trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
@@ -864,6 +867,7 @@ finished:
smb2_should_replay(tcon, &retries, &cur_sleep))
goto replay_again;
+out:
if (cfile)
cifsFileInfo_put(cfile);
@@ -1163,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
return smb2_compound_op(xid, tcon, cifs_sb,
name, &oparms, NULL,
- &(int){SMB2_OP_RMDIR}, 1,
+ &(int){SMB2_OP_UNLINK}, 1,
NULL, NULL, NULL, NULL);
}
@@ -1172,20 +1176,29 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
struct cifs_open_parms oparms;
+ struct inode *inode = NULL;
+ int rc;
- oparms = CIFS_OPARMS(cifs_sb, tcon, name,
- DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE);
- int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, dentry);
+ if (dentry)
+ inode = d_inode(dentry);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE,
+ FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+ NULL, &(int){SMB2_OP_UNLINK},
+ 1, NULL, NULL, NULL, dentry);
if (rc == -EINVAL) {
cifs_dbg(FYI, "invalid lease key, resending request without lease");
rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL);
+ NULL, &(int){SMB2_OP_UNLINK},
+ 1, NULL, NULL, NULL, NULL);
}
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (!rc && inode)
+ cifs_mark_open_handles_for_deleted_file(inode, name);
return rc;
}
@@ -1438,3 +1451,113 @@ out:
cifs_free_open_info(&data);
return rc;
}
+
+static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb,
+ const char *name, size_t namelen)
+{
+ int len;
+
+ if (*name == '\\' ||
+ (cifs_sb_master_tlink(cifs_sb) &&
+ cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/'))
+ name++;
+ return cifs_strndup_to_utf16(name, namelen, &len,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+}
+
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry));
+ __le16 *utf16_path __free(kfree) = NULL;
+ __u32 co = file_create_options(dentry);
+ int cmds[] = {
+ SMB2_OP_SET_INFO,
+ SMB2_OP_RENAME,
+ SMB2_OP_UNLINK,
+ };
+ const int num_cmds = ARRAY_SIZE(cmds);
+ char *to_name __free(kfree) = NULL;
+ __u32 attrs = cinode->cifsAttrs;
+ struct cifs_open_parms oparms;
+ static atomic_t sillycounter;
+ struct cifsFileInfo *cfile;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct kvec iov[2];
+ const char *ppath;
+ void *page;
+ size_t len;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ page = alloc_dentry_path();
+
+ ppath = build_path_from_dentry(dentry->d_parent, page);
+ if (IS_ERR(ppath)) {
+ rc = PTR_ERR(ppath);
+ goto out;
+ }
+
+ len = strlen(ppath) + strlen("/.__smb1234") + 1;
+ to_name = kmalloc(len, GFP_KERNEL);
+ if (!to_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb),
+ atomic_inc_return(&sillycounter) & 0xffff);
+
+ utf16_path = utf16_smb2_path(cifs_sb, to_name, len);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ DELETE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, co, ACL_NO_MODE);
+
+ attrs &= ~ATTR_READONLY;
+ if (!attrs)
+ attrs = ATTR_NORMAL;
+ if (d_inode(dentry)->i_nlink <= 1)
+ attrs |= ATTR_HIDDEN;
+ iov[0].iov_base = &(FILE_BASIC_INFO) {
+ .Attributes = cpu_to_le32(attrs),
+ };
+ iov[0].iov_len = sizeof(FILE_BASIC_INFO);
+ iov[1].iov_base = utf16_path;
+ iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path);
+
+ cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease\n");
+ cifs_get_writable_path(tcon, full_path,
+ FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, NULL);
+ }
+ if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags);
+ } else {
+ cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n",
+ __func__, full_path, to_name, rc);
+ rc = -EIO;
+ }
+out:
+ cifs_put_tlink(tlink);
+ free_dentry_path(page);
+ return rc;
+}
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cddf273c14ae..89d933b4a8bc 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
+ /* Trace receipt of lease break request from server */
+ trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
@@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
- le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
- le64_to_cpu(rsp->hdr.SessionId),
- *((u64 *)rsp->LeaseKey),
- *((u64 *)&rsp->LeaseKey[8]));
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
return false;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 94b1d7a395d5..e586f3f4b5c9 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/* SMB headers in a compound are 8 byte aligned. */
- if (!IS_ALIGNED(len, 8)) {
- num_padding = 8 - (len & 7);
+ if (IS_ALIGNED(len, 8))
+ goto out;
+
+ num_padding = 8 - (len & 7);
+ if (smb3_encryption_required(tcon)) {
+ int i;
+
+ /*
+ * Flatten request into a single buffer with required padding as
+ * the encryption layer can't handle the padding iovs.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ rqst->rq_nvec = 1;
+ } else {
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len += num_padding;
}
+ len += num_padding;
+out:
shdr->NextCommand = cpu_to_le32(len);
}
@@ -5376,6 +5398,7 @@ struct smb_version_operations smb20_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -5481,6 +5504,7 @@ struct smb_version_operations smb21_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb30_operations = {
@@ -5597,6 +5621,7 @@ struct smb_version_operations smb30_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb311_operations = {
@@ -5713,6 +5738,7 @@ struct smb_version_operations smb311_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2df93a75e3b8..c3b9d3f6210f 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
} else
- trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6e805ece6a7b..b3f1398c9f79 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 02d6db431fd4..e0fce5033004 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -453,9 +453,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_recv_io *response =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = response->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbd_connection *info =
container_of(sc, struct smbd_connection, socket);
- int data_length = 0;
+ u32 data_offset = 0;
+ u32 data_length = 0;
+ u32 remaining_data_length = 0;
log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
response, sc->recv_io.expected, wc->status, wc->opcode,
@@ -487,7 +490,22 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
/* SMBD data transfer packet */
case SMBDIRECT_EXPECT_DATA_TRANSFER:
data_transfer = smbdirect_recv_io_payload(response);
+
+ if (wc->byte_len <
+ offsetof(struct smbdirect_data_transfer, padding))
+ goto error;
+
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
data_length = le32_to_cpu(data_transfer->data_length);
+ if (wc->byte_len < data_offset ||
+ (u64)wc->byte_len < (u64)data_offset + data_length)
+ goto error;
+
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
+ goto error;
if (data_length) {
if (sc->recv_io.reassembly.full_packet_received)
@@ -1090,8 +1108,10 @@ static int smbd_negotiate(struct smbd_connection *info)
log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
- if (rc)
+ if (rc) {
+ put_receive_buffer(info, response);
return rc;
+ }
init_completion(&info->negotiate_completion);
info->negotiate_done = false;
@@ -1329,13 +1349,16 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
+ log_rdma_event(INFO, "cancelling post_send_credits_work\n");
+ disable_work_sync(&info->post_send_credits_work);
+
log_rdma_event(INFO, "destroying qp\n");
ib_drain_qp(sc->ib.qp);
rdma_destroy_qp(sc->rdma.cm_id);
sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
- cancel_delayed_work_sync(&info->idle_timer_work);
+ disable_delayed_work_sync(&info->idle_timer_work);
/* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1708,7 +1731,7 @@ allocate_mr_failed:
return NULL;
negotiation_failed:
- cancel_delayed_work_sync(&info->idle_timer_work);
+ disable_delayed_work_sync(&info->idle_timer_work);
destroy_caches_and_workqueue(info);
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
rdma_disconnect(sc->rdma.cm_id);
@@ -2067,7 +2090,7 @@ static void destroy_mr_list(struct smbd_connection *info)
struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *mr, *tmp;
- cancel_work_sync(&info->mr_recovery_work);
+ disable_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 93e5b2bb9f28..fd650e2afc76 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
@@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
@@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
@@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
__u64 lease_key_high), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done);
+/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */
+DECLARE_EVENT_CLASS(smb3_lease_enter_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 flags,
+ __u16 epoch,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, flags)
+ __field(__u16, epoch)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->flags = flags;
+ __entry->epoch = epoch;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch)
+)
+
+#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 flags, \
+ __u16 epoch, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter);
+/* Lease not found: reuse lease_enter payload (includes epoch and flags) */
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
int rc), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
-DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err);
DECLARE_EVENT_CLASS(smb3_connect_class,
TP_PROTO(char *hostname,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0d92ce49aed7..a565fc36cee6 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -2951,18 +2951,19 @@ int smb2_open(struct ksmbd_work *work)
}
ksmbd_debug(SMB, "converted name = %s\n", name);
- if (strchr(name, ':')) {
- if (!test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_STREAMS)) {
- rc = -EBADF;
- goto err_out2;
- }
- rc = parse_stream_name(name, &stream_name, &s_type);
- if (rc < 0)
- goto err_out2;
- }
if (posix_ctxt == false) {
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out2;
+ }
+
rc = ksmbd_validate_filename(name);
if (rc < 0)
goto err_out2;
@@ -3443,6 +3444,8 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ fp->is_posix_ctxt = posix_ctxt;
+
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
* initialized.
@@ -5988,7 +5991,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (IS_ERR(new_name))
return PTR_ERR(new_name);
- if (strchr(new_name, ':')) {
+ if (fp->is_posix_ctxt == false && strchr(new_name, ':')) {
int s_type;
char *xattr_stream_name, *stream_name = NULL;
size_t xattr_stream_size;
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 5466aa8c39b1..6550bd9f002c 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- unsigned int data_length;
+ u32 remaining_data_length, data_offset, data_length;
int avail_recvmsg_count, receive_credits;
if (wc->byte_len <
@@ -564,15 +564,25 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
- if (data_length) {
- if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
- (u64)data_length) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
- return;
- }
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ if (wc->byte_len < data_offset ||
+ wc->byte_len < (u64)data_offset + data_length) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (remaining_data_length > t->max_fragmented_recv_size ||
+ data_length > t->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length >
+ (u64)t->max_fragmented_recv_size) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (data_length) {
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -1209,78 +1219,130 @@ static int smb_direct_writev(struct ksmbd_transport *t,
bool need_invalidate, unsigned int remote_key)
{
struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- int remaining_data_length;
- int start, i, j;
- int max_iov_size = st->max_send_size -
+ size_t remaining_data_length;
+ size_t iov_idx;
+ size_t iov_ofs;
+ size_t max_iov_size = st->max_send_size -
sizeof(struct smb_direct_data_transfer);
int ret;
- struct kvec vec;
struct smb_direct_send_ctx send_ctx;
+ int error = 0;
if (st->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- } else {
- /* iov[start] is too big, break it */
- int nvec = (buflen + max_iov_size - 1) /
- max_iov_size;
+ while (remaining_data_length) {
+ struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+ * For the last message remaining_data_length should be
+ * have been 0 already!
+ */
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+ * while one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
- for (j = 0; j < nvec; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j * max_iov_size;
- vec.iov_len =
- min_t(int, max_iov_size,
- buflen - max_iov_size * j);
- remaining_data_length -= vec.iov_len;
- ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
- remaining_data_length);
- if (ret)
- goto done;
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+ * is to much (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+ * We need to calculate the number extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+ * first page (fplen), this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */
+ error = -EINVAL;
+ goto done;
}
- i++;
- if (i == niovs)
- break;
}
- start = i;
- buflen = 0;
- } else {
- i++;
- if (i == niovs) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- break;
+ possible_vecs -= page_count;
+ nvecs += 1;
+ possible_bytes -= v->iov_len;
+ bytes += v->iov_len;
+
+ iov_ofs += v->iov_len;
+ if (iov_ofs >= iov[iov_idx].iov_len) {
+ iov_idx += 1;
+ iov_ofs = 0;
}
}
+
+ remaining_data_length -= bytes;
+
+ ret = smb_direct_post_send_data(st, &send_ctx,
+ vecs, nvecs,
+ remaining_data_length);
+ if (unlikely(ret)) {
+ error = ret;
+ goto done;
+ }
}
done:
ret = smb_direct_flush_send_list(st, &send_ctx, true);
+ if (unlikely(!ret && error))
+ ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@@ -1744,6 +1806,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
+ if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@@ -1767,7 +1834,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = max_sge_per_wr;
+ cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
cap->max_rdma_ctxs = t->max_rw_credits;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 0708155b5caf..78b506c5ef03 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -112,6 +112,8 @@ struct ksmbd_file {
bool is_durable;
bool is_persistent;
bool is_resilient;
+
+ bool is_posix_ctxt;
};
static inline void set_ctx_actor(struct dir_context *ctx,
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ae0ca6858496..065953475cf5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -105,6 +105,7 @@ config XFS_POSIX_ACL
config XFS_RT
bool "XFS Realtime subvolume support"
depends on XFS_FS
+ default BLK_DEV_ZONED
help
If you say Y here you will be able to mount and use XFS filesystems
which contain a realtime subvolume. The realtime subvolume is a
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4c44ce1c8a64..bff3dc226f81 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -435,6 +435,13 @@ xfs_attr_rmtval_get(
0, &bp, &xfs_attr3_rmt_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
+ /*
+ * ENODATA from disk implies a disk medium failure;
+ * ENODATA for xattrs means attribute not found, so
+ * disambiguate that here.
+ */
+ if (error == -ENODATA)
+ error = -EIO;
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 17d9e6154f19..723a0643b838 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2833,6 +2833,12 @@ xfs_da_read_buf(
&bp, ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, whichfork);
+ /*
+ * ENODATA from disk implies a disk medium failure; ENODATA for
+ * xattrs means attribute not found, so disambiguate that here.
+ */
+ if (error == -ENODATA && whichfork == XFS_ATTR_FORK)
+ error = -EIO;
if (error)
goto out_free;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1ee4f835ac3c..a26f79815533 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -760,6 +760,9 @@ xfs_vm_swap_activate(
{
struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+ if (xfs_is_zoned_inode(ip))
+ return -EINVAL;
+
/*
* Swap file activation can race against concurrent shared extent
* removal in files that have been cloned. If this happens,
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index f8bd6d741755..f28214c28ab5 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -374,44 +374,6 @@ xfs_zone_free_blocks(
return 0;
}
-/*
- * Check if the zone containing the data just before the offset we are
- * writing to is still open and has space.
- */
-static struct xfs_open_zone *
-xfs_last_used_zone(
- struct iomap_ioend *ioend)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
- struct xfs_rtgroup *rtg = NULL;
- struct xfs_open_zone *oz = NULL;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
- &icur, &got)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
- if (!rtg)
- return NULL;
-
- xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
- oz = READ_ONCE(rtg->rtg_open_zone);
- if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
- oz = NULL;
- xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
-
- xfs_rtgroup_rele(rtg);
- return oz;
-}
-
static struct xfs_group *
xfs_find_free_zone(
struct xfs_mount *mp,
@@ -918,12 +880,9 @@ xfs_zone_alloc_and_submit(
goto out_error;
/*
- * If we don't have a cached zone in this write context, see if the
- * last extent before the one we are writing to points to an active
- * zone. If so, just continue writing to it.
+ * If we don't have a locally cached zone in this write context, see if
+ * the inode is still associated with a zone and use that if so.
*/
- if (!*oz && ioend->io_offset)
- *oz = xfs_last_used_zone(ioend);
if (!*oz)
*oz = xfs_cached_zone(mp, ip);
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 1313c55b8cbe..9cd38716fd25 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -10,6 +10,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
@@ -230,6 +231,11 @@ xfs_zoned_space_reserve(
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) {
+ xfs_inodegc_flush(mp);
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
if (error)