diff options
author | Darrick J. Wong <djwong@kernel.org> | 2021-09-01 11:15:56 -0700 |
---|---|---|
committer | Darrick J. Wong <djwong@kernel.org> | 2021-09-17 18:55:12 -0700 |
commit | f7cac98834b0e1445645de34d0360171dc7a072e (patch) | |
tree | 28d5c04a3df88808e5b10eea1af85b620d1ad11b | |
parent | dd47b1682299880dcb15e3d621b4c342b0ede13e (diff) |
xfs: allow inode-based btrees to reserve space in the data device [reserve-rt-metadata-space_2021-09-17]
Create a new space reservation scheme so that btree metadata for the
realtime volume can reserve space in the data device to avoid space
underruns.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
-rw-r--r-- | fs/xfs/libxfs/xfs_ag_resv.c | 3 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_errortag.h | 4 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_imeta.c | 188 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_imeta.h | 12 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_types.h | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_error.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_rtalloc.c | 23 | ||||
-rw-r--r-- | fs/xfs/xfs_rtalloc.h | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.h | 45 |
11 files changed, 310 insertions, 1 deletions
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 887c8ecff813..d51b62494c2f 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -114,6 +114,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_IMETA: case XFS_AG_RESV_NONE: /* empty */ break; @@ -348,6 +349,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_IMETA: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -390,6 +392,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_IMETA: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index f5fa2151e05d..f61559a022ca 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -60,7 +60,8 @@ #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 #define XFS_ERRTAG_SWAPEXT_FINISH_ONE 39 -#define XFS_ERRTAG_MAX 40 +#define XFS_ERRTAG_IMETA_RESV_CRITICAL 40 +#define XFS_ERRTAG_MAX 41 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -105,5 +106,6 @@ #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 #define XFS_RANDOM_SWAPEXT_FINISH_ONE 1 +#define XFS_RANDOM_IMETA_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c index 4b383bc7d7cd..4b86280d6d37 100644 --- a/fs/xfs/libxfs/xfs_imeta.c +++ b/fs/xfs/libxfs/xfs_imeta.c @@ -25,6 +25,10 @@ #include "xfs_trans_space.h" #include "xfs_dir2.h" #include "xfs_ag.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" /* * Metadata Inode Number Management @@ -1034,3 +1038,187 @@ xfs_imeta_droplink( xfs_is_metadata_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; } + +/* + * Is the amount of space that could be allocated towards a given metadata + * file at or beneath a certain threshold? + */ +static inline bool +xfs_imeta_resv_can_cover( + struct xfs_inode *ip, + int64_t rhs) +{ + /* + * The amount of space that can be allocated to this metadata file is + * the global free block count + all remaining reservation for the + * metadata file. First take care of the trivial case so that we don't + * have to touch the per-cpu counter. + */ + if (ip->i_delayed_blks > rhs) + return false; + + /* + * There aren't enough blocks left in the inode's reservation, but it + * isn't critical unless there also isn't enough free space. + */ + return __percpu_counter_compare(&ip->i_mount->m_fdblocks, + rhs - ip->i_delayed_blks, 2048) <= 0; +} + +/* + * Is this metadata file critically low on blocks? For now we'll define that + * as the number of blocks we can get our hands on being less than 10% of what + * we reserved or less than some arbitrary number (maximum btree height). 
+ */ +bool +xfs_imeta_resv_critical( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + uint64_t asked_low_water; + xfs_extlen_t btree_maxlevels; + + if (!ip) + return false; + + ASSERT(xfs_is_metadata_inode(ip)); + trace_xfs_imeta_resv_critical(ip, 0); + + /* Critically low if less than 10% or max btree height remains. */ + asked_low_water = div_u64(ip->i_meta_resv_asked, 10); + btree_maxlevels = xfs_btree_maxlevels(mp, XFS_BTNUM_MAX); + return XFS_TEST_ERROR(xfs_imeta_resv_can_cover(ip, asked_low_water) || + xfs_imeta_resv_can_cover(ip, btree_maxlevels), + mp, XFS_ERRTAG_IMETA_RESV_CRITICAL); +} + +/* Allocate a block from the metadata file's reservation. */ +void +xfs_imeta_resv_alloc_extent( + struct xfs_inode *ip, + struct xfs_alloc_arg *args) +{ + int64_t len = args->len; + + ASSERT(xfs_is_metadata_inode(ip)); + ASSERT(args->resv == XFS_AG_RESV_IMETA); + + trace_xfs_imeta_resv_alloc_extent(ip, args->len); + + /* + * Allocate the blocks from the metadata inode's block reservation + * and update the ondisk sb counter. + */ + if (ip->i_delayed_blks > 0) { + int64_t from_resv; + + from_resv = min_t(int64_t, len, ip->i_delayed_blks); + ip->i_delayed_blks -= from_resv; + xfs_mod_delalloc(ip->i_mount, -from_resv); + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, + -from_resv); + len -= from_resv; + } + + /* + * Any allocation in excess of the reservation requires in-core and + * on-disk fdblocks updates. + */ + if (len) + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, -len); + + ip->i_nblocks += args->len; +} + +/* Free a block to the metadata file's reservation. */ +void +xfs_imeta_resv_free_extent( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_filblks_t len) +{ + int64_t to_resv; + + ASSERT(xfs_is_metadata_inode(ip)); + trace_xfs_imeta_resv_free_extent(ip, len); + + ip->i_nblocks -= len; + + /* + * Add the freed blocks back into the inode's delalloc reservation + * until it reaches the maximum size. Update the ondisk fdblocks only. 
+ */ + to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + if (to_resv > 0) { + to_resv = min_t(int64_t, to_resv, len); + ip->i_delayed_blks += to_resv; + xfs_mod_delalloc(ip->i_mount, to_resv); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); + len -= to_resv; + } + + /* + * Everything else goes back to the filesystem, so update the in-core + * and on-disk counters. + */ + if (len) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); +} + +/* Release a metadata file's space reservation. */ +void +xfs_imeta_resv_free_inode( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + if (!ip) + return; + + ASSERT(xfs_is_metadata_inode(ip)); + trace_xfs_imeta_resv_free(ip, 0); + + xfs_mod_delalloc(ip->i_mount, -ip->i_delayed_blks); + xfs_mod_fdblocks(ip->i_mount, ip->i_delayed_blks, true); + ip->i_delayed_blks = 0; + ip->i_meta_resv_asked = 0; +} + +/* Set up a metadata file's space reservation. */ +int +xfs_imeta_resv_init_inode( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_filblks_t ask) +{ + xfs_filblks_t hidden_space; + xfs_filblks_t used; + int error; + + if (!ip || ip->i_meta_resv_asked > 0) + return 0; + + ASSERT(xfs_is_metadata_inode(ip)); + + /* + * Space taken by all other metadata btrees are accounted on-disk as + * used space. We therefore only hide the space that is reserved but + * not used by the trees. 
+ */ + used = ip->i_nblocks; + if (used > ask) + ask = used; + hidden_space = ask - used; + + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + if (error) { + trace_xfs_imeta_resv_init_error(ip, error, _RET_IP_); + return error; + } + + xfs_mod_delalloc(mp, hidden_space); + ip->i_delayed_blks = hidden_space; + ip->i_meta_resv_asked = ask; + + trace_xfs_imeta_resv_init(ip, ask); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h index 9a5fc2e66036..9633351b8839 100644 --- a/fs/xfs/libxfs/xfs_imeta.h +++ b/fs/xfs/libxfs/xfs_imeta.h @@ -66,6 +66,18 @@ void xfs_imeta_droplink(struct xfs_inode *ip); unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp); unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp); +/* Space reservations for metadata inodes. */ +struct xfs_alloc_arg; + +bool xfs_imeta_resv_critical(struct xfs_mount *mp, struct xfs_inode *ip); +void xfs_imeta_resv_alloc_extent(struct xfs_inode *ip, + struct xfs_alloc_arg *args); +void xfs_imeta_resv_free_extent(struct xfs_inode *ip, struct xfs_trans *tp, + xfs_filblks_t len); +void xfs_imeta_resv_free_inode(struct xfs_mount *mp, struct xfs_inode *ip); +int xfs_imeta_resv_init_inode(struct xfs_mount *mp, struct xfs_inode *ip, + xfs_filblks_t ask); + /* Must be implemented by the libxfs client */ int xfs_imeta_iget(struct xfs_mount *mp, xfs_ino_t ino, unsigned char ftype, struct xfs_inode **ipp); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index d0afc3d11e37..ec8b1e941709 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -186,6 +186,13 @@ enum xfs_ag_resv_type { * altering fdblocks. If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_imeta_resv_* helpers. 
+ */ + XFS_AG_RESV_IMETA, }; /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index cd17682869e3..19b46636825e 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, XFS_RANDOM_SWAPEXT_FINISH_ONE, + XFS_RANDOM_IMETA_RESV_CRITICAL, }; struct xfs_errortag_attr { @@ -172,6 +173,7 @@ XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(swapext_finish_one, XFS_RANDOM_SWAPEXT_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(imeta_resv_critical, XFS_ERRTAG_IMETA_RESV_CRITICAL); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -214,6 +216,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), XFS_ERRORTAG_ATTR_LIST(swapext_finish_one), + XFS_ERRORTAG_ATTR_LIST(imeta_resv_critical), NULL, }; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 126f3474c1f6..e528a0cd845f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -20,6 +20,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" +#include "xfs_rtalloc.h" /* * Write new AG headers to disk. 
Non-transactional, but need to be @@ -580,6 +581,20 @@ xfs_fs_reserve_ag_blocks( xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", error); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + if (xfs_has_realtime(mp)) { + int err2 = xfs_rt_resv_init(mp); + + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + if (err2 && !error) + error = err2; } return error; @@ -595,6 +610,9 @@ xfs_fs_unreserve_ag_blocks( struct xfs_perag *pag; xfs_agnumber_t agno; + if (xfs_has_realtime(mp)) + xfs_rt_resv_free(mp); + for_each_perag(mp, agno, pag) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 5813b285303c..b2feeaf84f7f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -55,6 +55,9 @@ typedef struct xfs_inode { /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ uint64_t i_delayed_blks; /* count of delay alloc blks */ + /* Space that has been set aside to root a btree in this file. */ + uint64_t i_meta_resv_asked; + xfs_fsize_t i_disk_size; /* number of bytes in file */ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 51e3c57cef98..0fa5dd8c6433 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1138,6 +1138,14 @@ error_cancel: /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); + if (error) + goto out_free; + + /* Reset the rt metadata btree space reservations. */ + xfs_rt_resv_free(mp); + error = xfs_rt_resv_init(mp); + if (error == -ENOSPC) + error = 0; out_free: /* @@ -1283,6 +1291,21 @@ xfs_rtmount_init( return 0; } +/* Free space reservations for rt metadata inodes. 
*/ +void +xfs_rt_resv_free( + struct xfs_mount *mp) +{ +} + +/* Reserve space for rt metadata inodes' space expansion. */ +int +xfs_rt_resv_init( + struct xfs_mount *mp) +{ + return 0; +} + /* * Realtime metadata files are not quite regular files because userspace can't * access the realtime bitmap directly, and because we take the ILOCK of the rt diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index a0d0e161d804..7a459c250aa6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -79,6 +79,9 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ +void xfs_rt_resv_free(struct xfs_mount *mp); +int xfs_rt_resv_init(struct xfs_mount *mp); + /* * Pick an extent for allocation at the start of a new realtime file. * Use the sequence number stored in the atime field of the bitmap inode. @@ -168,6 +171,8 @@ xfs_rtmount_init( xfs_warn(mp, "Not built with CONFIG_XFS_RT"); return -ENOSYS; } +# define xfs_rt_resv_free(mp) ((void)0) +# define xfs_rt_resv_init(mp) (0) # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (ENOSYS)) # define xfs_rtunmount_inodes(m) # define xfs_rtfile_convert_unwritten(ip, pos, len) (0) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4d9bb3de862d..a695a523db58 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4633,6 +4633,51 @@ DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_created); DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_unlinked); DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_link); +/* metadata inode space reservations */ + +DECLARE_EVENT_CLASS(xfs_imeta_resv_class, + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), + TP_ARGS(ip, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long long, freeblks) + __field(unsigned long long, reserved) + __field(unsigned long long, asked) + __field(unsigned long long, used) + __field(unsigned long long, len) + ), + TP_fast_assign( + struct xfs_mount *mp = ip->i_mount; + + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); + __entry->reserved = ip->i_delayed_blks; + __entry->asked = ip->i_meta_resv_asked; + __entry->used = ip->i_nblocks; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->freeblks, + __entry->reserved, + __entry->asked, + __entry->used, + __entry->len) +) +#define DEFINE_IMETA_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_imeta_resv_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ + TP_ARGS(ip, len)) +DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_init); +DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free); +DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_alloc_extent); +DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent); +DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical); +DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |