summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_buf.c
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2022-07-14 09:22:14 -0700
committerDarrick J. Wong <djwong@kernel.org>2022-07-14 09:22:14 -0700
commit35c5a09f5346e690df7ff2c9075853e340ee10b3 (patch)
tree045be2e041e91ad1e6d25bde968b1c69fbdbd173 /fs/xfs/xfs_buf.c
parent4613b17cc4789d6061041f9bd424180251fb6228 (diff)
parent298f342245066309189d8637ca7339d56840c3e1 (diff)
Merge tag 'xfs-buf-lockless-lookup-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs into xfs-5.20-mergeB
xfs: lockless buffer cache lookups Current work to merge the XFS inode life cycle with the VFS inode life cycle is finding some interesting issues. If we have a path that hits buffer trylocks fairly hard (e.g. a non-blocking background inode freeing function), we end up hitting massive contention on the buffer cache hash locks: - 92.71% 0.05% [kernel] [k] xfs_inodegc_worker - 92.67% xfs_inodegc_worker - 92.13% xfs_inode_unlink - 91.52% xfs_inactive_ifree - 85.63% xfs_read_agi - 85.61% xfs_trans_read_buf_map - 85.59% xfs_buf_read_map - xfs_buf_get_map - 85.55% xfs_buf_find - 72.87% _raw_spin_lock - do_raw_spin_lock 71.86% __pv_queued_spin_lock_slowpath - 8.74% xfs_buf_rele - 7.88% _raw_spin_lock - 7.88% do_raw_spin_lock 7.63% __pv_queued_spin_lock_slowpath - 1.70% xfs_buf_trylock - 1.68% down_trylock - 1.41% _raw_spin_lock_irqsave - 1.39% do_raw_spin_lock __pv_queued_spin_lock_slowpath - 0.76% _raw_spin_unlock 0.75% do_raw_spin_unlock This is basically hammering the pag->pag_buf_lock from lots of CPUs doing trylocks at the same time. Most of the buffer trylock operations ultimately fail after we've done the lookup, so we're really hammering the buf hash lock whilst making no progress. We can also see significant spinlock traffic on the same lock just under normal operation when lots of tasks are accessing metadata from the same AG, so let's avoid all this by creating a lookup fast path which leverages the rhashtable's ability to do RCU protected lookups. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Darrick J. Wong <djwong@kernel.org> * tag 'xfs-buf-lockless-lookup-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: xfs: lockless buffer lookup xfs: remove a superflous hash lookup when inserting new buffers xfs: reduce the number of atomic when locking a buffer after lookup xfs: merge xfs_buf_find() and xfs_buf_get_map() xfs: break up xfs_buf_find() into individual pieces xfs: rework xfs_buf_incore() API
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r--fs/xfs/xfs_buf.c263
1 files changed, 148 insertions, 115 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bf4e60871068..6dac5583977f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -295,6 +295,16 @@ xfs_buf_free_pages(
}
static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
+{
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ xfs_buf_free_maps(bp);
+ kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
xfs_buf_free(
struct xfs_buf *bp)
{
@@ -307,8 +317,7 @@ xfs_buf_free(
else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_cache, bp);
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
@@ -503,100 +512,45 @@ xfs_buf_hash_destroy(
rhashtable_destroy(&pag->pag_buf_hash);
}
-/*
- * Look up a buffer in the buffer cache and return it referenced and locked
- * in @found_bp.
- *
- * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
- * cache.
- *
- * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
- * -EAGAIN if we fail to lock it.
- *
- * Return values are:
- * -EFSCORRUPTED if have been supplied with an invalid address
- * -EAGAIN on trylock failure
- * -ENOENT if we fail to find a match and @new_bp was NULL
- * 0, with @found_bp:
- * - @new_bp if we inserted it into the cache
- * - the buffer we found and locked.
- */
static int
-xfs_buf_find(
+xfs_buf_map_verify(
struct xfs_buftarg *btp,
- struct xfs_buf_map *map,
- int nmaps,
- xfs_buf_flags_t flags,
- struct xfs_buf *new_bp,
- struct xfs_buf **found_bp)
+ struct xfs_buf_map *map)
{
- struct xfs_perag *pag;
- struct xfs_buf *bp;
- struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int i;
-
- *found_bp = NULL;
-
- for (i = 0; i < nmaps; i++)
- cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= eofs) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, cmap.bm_bn, eofs);
+ __func__, map->bm_bn, eofs);
WARN_ON(1);
return -EFSCORRUPTED;
}
-
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
-
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
- xfs_buf_hash_params);
- if (bp) {
- atomic_inc(&bp->b_hold);
- goto found;
- }
-
- /* No match found */
- if (!new_bp) {
- XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- return -ENOENT;
- }
-
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- *found_bp = new_bp;
return 0;
+}
-found:
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
-
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK) {
- xfs_buf_rele(bp);
- XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
+static int
+xfs_buf_find_lock(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ if (flags & XBF_TRYLOCK) {
+ if (!xfs_buf_trylock(bp)) {
+ XFS_STATS_INC(bp->b_mount, xb_busy_locked);
return -EAGAIN;
}
+ } else {
xfs_buf_lock(bp);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
+ XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
}
/*
@@ -609,57 +563,59 @@ found:
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
-
- trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- *found_bp = bp;
return 0;
}
-struct xfs_buf *
-xfs_buf_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+static inline int
+xfs_buf_lookup(
+ struct xfs_perag *pag,
+ struct xfs_buf_map *map,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
int error;
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
- if (error)
- return NULL;
- return bp;
+ rcu_read_lock();
+ bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ error = xfs_buf_find_lock(bp, flags);
+ if (error) {
+ xfs_buf_rele(bp);
+ return error;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ *bpp = bp;
+ return 0;
}
/*
- * Assembles a buffer covering the specified range. The code is optimised for
- * cache hits, as metadata intensive workloads will see 3 orders of magnitude
- * more hits than misses.
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
*/
-int
-xfs_buf_get_map(
- struct xfs_buftarg *target,
+static int
+xfs_buf_find_insert(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag,
+ struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
struct xfs_buf *new_bp;
+ struct xfs_buf *bp;
int error;
- *bpp = NULL;
- error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
- if (!error)
- goto found;
- if (error != -ENOENT)
- return error;
-
- error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
- return error;
+ goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
@@ -674,18 +630,94 @@ xfs_buf_get_map(
goto out_free_buf;
}
- error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
- if (error)
+ spin_lock(&pag->pag_buf_lock);
+ bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head, xfs_buf_hash_params);
+ if (IS_ERR(bp)) {
+ error = PTR_ERR(bp);
+ spin_unlock(&pag->pag_buf_lock);
goto out_free_buf;
+ }
+ if (bp) {
+ /* found an existing buffer */
+ atomic_inc(&bp->b_hold);
+ spin_unlock(&pag->pag_buf_lock);
+ error = xfs_buf_find_lock(bp, flags);
+ if (error)
+ xfs_buf_rele(bp);
+ else
+ *bpp = bp;
+ goto out_free_buf;
+ }
+
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ *bpp = new_bp;
+ return 0;
+
+out_free_buf:
+ xfs_buf_free(new_bp);
+out_drop_pag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+int
+xfs_buf_get_map(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_perag *pag;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
+ int error;
+ int i;
- if (bp != new_bp)
- xfs_buf_free(new_bp);
+ for (i = 0; i < nmaps; i++)
+ cmap.bm_len += map[i].bm_len;
-found:
+ error = xfs_buf_map_verify(btp, &cmap);
+ if (error)
+ return error;
+
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+
+ error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ if (error && error != -ENOENT)
+ goto out_put_perag;
+
+ /* cache hits always outnumber misses by at least 10:1 */
+ if (unlikely(!bp)) {
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+ if (flags & XBF_INCORE)
+ goto out_put_perag;
+
+ /* xfs_buf_find_insert() consumes the perag reference. */
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ flags, &bp);
+ if (error)
+ return error;
+ } else {
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ xfs_perag_put(pag);
+ }
+
+ /* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn_ratelimited(target->bt_mount,
+ xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
@@ -700,12 +732,13 @@ found:
if (!(flags & XBF_READ))
xfs_buf_ioerror(bp, 0);
- XFS_STATS_INC(target->bt_mount, xb_get);
+ XFS_STATS_INC(btp->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
-out_free_buf:
- xfs_buf_free(new_bp);
+
+out_put_perag:
+ xfs_perag_put(pag);
return error;
}