From 7a6c3a035a2e133b41d01c1a479b50aac4aeecad Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 28 Feb 2023 20:40:51 +0800 Subject: ceph: do not print the whole xattr value if it's too long If the xattr's value size is large enough, the kernel will emit a warning and the xfstests test case will then fail. Print only part of the value string if it is too long. At the same time, fix the incorrect hard-coded function names in the debug logs. Link: https://tracker.ceph.com/issues/58404 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f65b07cc33a2..8b6b3075f4d1 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -535,6 +535,8 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, return NULL; } +#define MAX_XATTR_VAL_PRINT_LEN 256 + static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, @@ -597,7 +599,7 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->should_free_name = update_xattr; ci->i_xattrs.count++; - dout("__set_xattr count=%d\n", ci->i_xattrs.count); + dout("%s count=%d\n", __func__, ci->i_xattrs.count); } else { kfree(*newxattr); *newxattr = NULL; @@ -625,11 +627,13 @@ static int __set_xattr(struct ceph_inode_info *ci, if (new) { rb_link_node(&xattr->node, parent, p); rb_insert_color(&xattr->node, &ci->i_xattrs.index); - dout("__set_xattr_val p=%p\n", p); + dout("%s p=%p\n", __func__, p); } - dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); + dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__, + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, + min(val_len, MAX_XATTR_VAL_PRINT_LEN), val, + val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : ""); return 0; } @@ -655,13 +659,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, else if (c > 0) p = &(*p)->rb_right; else { - dout("__get_xattr %s: found %.*s\n", name, - xattr->val_len, xattr->val); + int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN); + + dout("%s %s: found %.*s%s\n", __func__, name, len, + xattr->val, xattr->val_len > len ? "..." : ""); return xattr; } } - dout("__get_xattr %s: not found\n", name); + dout("%s %s: not found\n", __func__, name); return NULL; } -- cgit v1.2.3 From 7d41870d65db028234333c68e60a034ac335557a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 8 Mar 2023 10:21:44 +0800 Subject: ceph: implement writeback livelock avoidance using page tagging While the mapped IOs keep coming in, an attempt to flush a file's buffers shows that fsync() will not complete until those IOs finish. Analogous to Jan Kara's commit f446daaea9d4 ("mm: implement writeback livelock avoidance using page tagging"), avoid writeback livelocks when someone steadily creates dirty pages in a mapping we are writing out.
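In schematic form, the livelock-avoidance pattern adopted here (and introduced for the generic writeback path by f446daaea9d4) looks roughly like the sketch below. It is a condensed illustration only, not the actual ceph_writepages_start() logic; example_writepages and the empty per-folio loop are placeholders, and the usual pagemap/pagevec/writeback headers are assumed:

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct folio_batch fbatch;
	pgoff_t index = wbc->range_cyclic ? mapping->writeback_index : 0;
	pgoff_t end = -1;	/* whole mapping, for simplicity */
	xa_mark_t tag;
	unsigned int i, nr;

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		/*
		 * Data-integrity sync: snapshot the currently dirty pages by
		 * re-tagging them TOWRITE and only write those.  Pages dirtied
		 * after this point carry only the DIRTY tag, so a steady
		 * stream of new dirty pages cannot keep this loop running
		 * forever.
		 */
		tag = PAGECACHE_TAG_TOWRITE;
		tag_pages_for_writeback(mapping, index, end);
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}

	folio_batch_init(&fbatch);
	while ((nr = filemap_get_folios_tag(mapping, &index, end, tag,
					    &fbatch))) {
		for (i = 0; i < nr; i++) {
			/* lock, write out and unlock fbatch.folios[i] here */
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return 0;
}

The key point is that TOWRITE tagging bounds a WB_SYNC_ALL pass to the pages that were already dirty when the pass started.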
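Circling back to the first patch above (bounded xattr value printing), the same "%.*s" plus ellipsis idiom can be illustrated in plain userspace C; print_xattr and MAX_VAL_PRINT_LEN are made-up names for the illustration, not kernel API:

#include <stdio.h>

#define MAX_VAL_PRINT_LEN 256

static void print_xattr(const char *name, const char *val, int val_len)
{
	int len = val_len < MAX_VAL_PRINT_LEN ? val_len : MAX_VAL_PRINT_LEN;

	/* print at most MAX_VAL_PRINT_LEN bytes of the value, append "..." when truncated */
	printf("%s=%.*s%s\n", name, len, val, val_len > len ? "..." : "");
}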
Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d5335f445233..6bb251a4d613 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -808,6 +808,7 @@ static int ceph_writepages_start(struct address_space *mapping, bool should_loop, range_whole = false; bool done = false; bool caching = ceph_is_cache_enabled(inode); + xa_mark_t tag; if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) @@ -834,6 +835,11 @@ static int ceph_writepages_start(struct address_space *mapping, start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { + tag = PAGECACHE_TAG_TOWRITE; + } else { + tag = PAGECACHE_TAG_DIRTY; + } retry: /* find oldest snap context with dirty data */ snapc = get_oldest_context(inode, &ceph_wbc, NULL); @@ -872,6 +878,9 @@ retry: dout(" non-head snapc, range whole\n"); } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(last_snapc); last_snapc = snapc; @@ -888,7 +897,7 @@ retry: get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, - end, PAGECACHE_TAG_DIRTY, &fbatch); + end, tag, &fbatch); dout("pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; -- cgit v1.2.3 From aaf67de78807c59c35bafb5003d4fb457c764800 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 19 Apr 2023 10:39:14 +0800 Subject: ceph: fix potential use-after-free bug when trimming caps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trimming caps, just after 'session->s_cap_lock' is released in ceph_iterate_session_caps(), the cap may be removed by another thread, and using the stale cap memory in the callbacks will then trigger a use-after-free crash. We need to check that the cap still exists just after 'ci->i_ceph_lock' is acquired, and do nothing if it has already been removed. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/43272 Signed-off-by: Xiubo Li Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- fs/ceph/debugfs.c | 18 ++++++++---- fs/ceph/mds_client.c | 72 +++++++++++++++++++++++++++++++++------------------- fs/ceph/mds_client.h | 3 +-- fs/ceph/super.h | 2 ++ 5 files changed, 62 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7cc20772eac9..789be30d6ee2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -431,7 +431,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, * * Called with i_ceph_lock held.
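The resulting callback contract is easiest to see in schematic form: the iterator now hands each callback only the mds id, and the callback revalidates the cap under i_ceph_lock before touching it. example_session_cap_cb below is an illustrative stand-in, not one of the real callbacks (caps_show_cb, remove_session_caps_cb, wake_up_session_cb, trim_caps_cb and reconnect_caps_cb, all changed further down):

static int example_session_cap_cb(struct inode *inode, int mds, void *arg)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;

	spin_lock(&ci->i_ceph_lock);
	/* the cap may have been removed since s_cap_lock was dropped */
	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		spin_unlock(&ci->i_ceph_lock);
		return 0;	/* raced with cap removal: nothing to do */
	}
	/* ... cap is safe to dereference while i_ceph_lock is held ... */
	spin_unlock(&ci->i_ceph_lock);
	return 0;
}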
*/ -static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index bec3c4549c07..3904333fa6c3 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -248,14 +248,20 @@ static int metrics_caps_show(struct seq_file *s, void *p) return 0; } -static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +static int caps_show_cb(struct inode *inode, int mds, void *p) { + struct ceph_inode_info *ci = ceph_inode(inode); struct seq_file *s = p; - - seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), - cap->session->s_mds, - ceph_cap_string(cap->issued), - ceph_cap_string(cap->implemented)); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap) + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + spin_unlock(&ci->i_ceph_lock); return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 27a245d959c0..54e3c2ab21d2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1632,8 +1632,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, * Caller must hold session s_mutex. */ int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) + int (*cb)(struct inode *, int mds, void *), + void *arg) { struct list_head *p; struct ceph_cap *cap; @@ -1645,6 +1645,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { + int mds; + cap = list_entry(p, struct ceph_cap, session_caps); inode = igrab(&cap->ci->netfs.inode); if (!inode) { @@ -1652,6 +1654,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, continue; } session->s_cap_iterator = cap; + mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { @@ -1663,7 +1666,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, old_cap = NULL; } - ret = cb(inode, cap, arg); + ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); @@ -1696,20 +1699,25 @@ out: return ret; } -static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); bool invalidate = false; - int iputs; + struct ceph_cap *cap; + int iputs = 0; - dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); - iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + cap = __get_cap_for_mds(ci, mds); + if (cap) { + dout(" removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->netfs.inode); + + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + } spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); + if (cap) + wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); while (iputs--) @@ -1780,8 +1788,7 @@ enum { * * caller must hold s_mutex. 
*/ -static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned long ev = (unsigned long)arg; @@ -1792,12 +1799,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { - if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) { - /* mds did not re-issue stale cap */ - spin_lock(&ci->i_ceph_lock); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + /* mds did not re-issue stale cap */ + if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) cap->issued = cap->implemented = CEPH_CAP_PIN; - spin_unlock(&ci->i_ceph_lock); - } + spin_unlock(&ci->i_ceph_lock); } else if (ev == FORCE_RO) { } wake_up_all(&ci->i_cap_wq); @@ -1959,16 +1968,22 @@ out: * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. */ -static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +static int trim_caps_cb(struct inode *inode, int mds, void *arg) { int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; + struct ceph_cap *cap; if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + return 0; + } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); wanted = __ceph_caps_file_wanted(ci); @@ -3911,26 +3926,22 @@ out_unlock: /* * Encode information about a cap for a reconnect with the MDS. */ -static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci = cap->ci; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; + struct ceph_cap *cap; char *path; - int pathlen = 0, err; + int pathlen = 0, err = 0; u64 pathbase; u64 snap_follows; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", - inode, ceph_vinop(inode), cap, cap->cap_id, - ceph_cap_string(cap->issued)); - dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ @@ -3947,6 +3958,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + goto out_err; + } + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0598faa50e2e..18b026b1ac63 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -541,8 +541,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, - struct ceph_cap *, void 
*), + int (*cb)(struct inode *, int mds, void *), void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6ecca2c6d137..d24bf0db5234 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1192,6 +1192,8 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci); +extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, -- cgit v1.2.3 From a5ffd7b6e91a12975ae30de863437cc04387576a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 26 Apr 2023 10:38:57 +0800 Subject: ceph: pass ino# instead of old_dentry if it's disconnected When exporting kceph to NFS, a DCACHE_DISCONNECTED dentry may be passed in for the link operation. That dentry will then be parsed as a snapdir, and the MDS will fail the link request with -EROFS. The MDS allows clients to pass an ino# instead of a path. Link: https://tracker.ceph.com/issues/59515 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 13 +++++++++++-- fs/ceph/mds_client.c | 6 +++++- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0ced8b570e42..cb67ac821f0e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1050,6 +1050,9 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + if (dentry->d_flags & DCACHE_DISCONNECTED) + return -EINVAL; + err = ceph_wait_on_conflict_unlink(dentry); if (err) return err; @@ -1057,8 +1060,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("link in dir %p old_dentry %p dentry %p\n", dir, - old_dentry, dentry); + dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", + dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); if (IS_ERR(req)) { d_drop(dentry); @@ -1067,6 +1070,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + /* + * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we + * will just pass the ino# to MDSs.
+ */ + if (old_dentry->d_flags & DCACHE_DISCONNECTED) + req->r_ino2 = ceph_vino(d_inode(old_dentry)); req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 54e3c2ab21d2..29cf00220b09 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2570,6 +2570,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; + struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; @@ -2587,7 +2588,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, } /* If r_old_dentry is set, then assume that its parent is locked */ - ret = set_request_path_attr(NULL, req->r_old_dentry, + if (req->r_old_dentry && + !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) + old_dentry = req->r_old_dentry; + ret = set_request_path_attr(NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); -- cgit v1.2.3 From db2993a423e3fd0e4878f4d3ac66fe717f5f072e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 27 Apr 2023 20:05:42 +0200 Subject: ceph: reorder fields in 'struct ceph_snapid_map' Group some fields based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct ceph_snapid_map' from 72 to 64 bytes. When such a structure is allocated, because of the way the slab allocator rounds up allocation sizes, a 72-byte request actually consumes 96 bytes. So, on x86_64, this change saves 32 bytes per allocation and makes the structure fit in a single cacheline. Signed-off-by: Christophe JAILLET Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 18b026b1ac63..724307ff89cd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -355,8 +355,8 @@ struct ceph_snapid_map { struct rb_node node; struct list_head lru; atomic_t ref; - u64 snap; dev_t dev; + u64 snap; unsigned long last_used; }; -- cgit v1.2.3
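For reference, the layout change in the last patch can be visualized as below. The offsets assume a typical x86_64 build (8-byte pointers, 4-byte atomic_t and dev_t) and are the kind of thing a tool such as pahole reports; the annotations here are illustrative, not captured compiler output.

Before:

	struct ceph_snapid_map {
		struct rb_node    node;		/* offset  0, 24 bytes */
		struct list_head  lru;		/* offset 24, 16 bytes */
		atomic_t          ref;		/* offset 40,  4 bytes */
						/* 4-byte hole         */
		u64               snap;		/* offset 48,  8 bytes */
		dev_t             dev;		/* offset 56,  4 bytes */
						/* 4-byte hole         */
		unsigned long     last_used;	/* offset 64,  8 bytes */
	};					/* size 72 -> kmalloc-96 */

After:

	struct ceph_snapid_map {
		struct rb_node    node;		/* offset  0, 24 bytes */
		struct list_head  lru;		/* offset 24, 16 bytes */
		atomic_t          ref;		/* offset 40,  4 bytes */
		dev_t             dev;		/* offset 44,  4 bytes */
		u64               snap;		/* offset 48,  8 bytes */
		unsigned long     last_used;	/* offset 56,  8 bytes */
	};					/* size 64 -> kmalloc-64 */

Moving the object from the kmalloc-96 to the kmalloc-64 slab cache is where the 32 bytes per allocation are saved, and 64 bytes matches a single x86_64 cacheline.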