diff options
Diffstat (limited to 'fs/xfs')
77 files changed, 2699 insertions, 2585 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4df61ea..35faf128f36d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 686ba6fb20dd..339c696bbc01 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) } void * -kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - xfs_km_flags_t flags) +kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { - void *new; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; - new = kmem_alloc(newsize, flags); - if (ptr) { - if (new) - memcpy(new, ptr, - ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr); - } - return new; + do { + ptr = krealloc(old, newsize, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", + current->comm, current->pid, + newsize, __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); } void * diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d1c66e465ca5..689f746224e7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38b494c..88c26b827a2d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -1839,19 +1839,8 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_ag_maxlevels = level; + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + (mp->m_sb.sb_agblocks + 1) / 2); } /* @@ -2658,55 +2647,79 @@ error0: return error; } -/* - * Free an extent. - * Just break up the extent address and hand off to xfs_free_ag_extent - * after fixing up the freelist. - */ -int /* error */ -xfs_free_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ +/* Ensure that the freelist is at full capacity. */ +int +xfs_free_extent_fix_freelist( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **agbp) { - xfs_alloc_arg_t args; - int error; + struct xfs_alloc_arg args; + int error; - ASSERT(len != 0); - memset(&args, 0, sizeof(xfs_alloc_arg_t)); + memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; + args.agno = agno; /* * validate that the block number is legal - the enables us to detect * and handle a silent filesystem corruption rather than crashing. */ - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - if (args.agbno >= args.mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto error0; + goto out; + + *agbp = args.agbp; +out: + xfs_perag_put(args.pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + + ASSERT(len != 0); + + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + + XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); /* validate the extent size is legal now we have the agf locked */ - if (args.agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = -EFSCORRUPTED; - goto error0; - } + XFS_WANT_CORRUPTED_GOTO(mp, + agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), + err); - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); - if (!error) - xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); -error0: - xfs_perag_put(args.pag); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + if (error) + goto err; + + xfs_extent_busy_insert(tp, agno, agbno, len, 0); + return 0; + +err: + xfs_trans_brelse(tp, agbp); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d24db7..cf268b2d0b6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -212,13 +212,6 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ - int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ @@ -236,5 +229,7 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **agbp); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fa3b948ef9c2..4e126f41a0aa 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -242,37 +242,21 @@ xfs_attr_set( return error; } - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (rsvd) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(args.trans, &tres, args.total, 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &tres, args.total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); + if (error) return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); @@ -429,31 +413,15 @@ xfs_attr_remove( return error; /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); - - /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (flags & ATTR_ROOT) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0, + (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, + &args.trans); + if (error) return error; - } xfs_ilock(dp, XFS_ILOCK_EXCL); /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d338891..4f2aed04f827 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ce41d7fe753c..2f2c85cc8117 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -570,14 +570,12 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_bmap_free *flist, /* list of extents */ xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ + xfs_filblks_t len) /* length of extent */ { - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ + struct xfs_bmap_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -597,17 +595,7 @@ xfs_bmap_add_free( new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); new->xbfi_startblock = bno; new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; + list_add(&new->xbfi_list, &flist->xbf_flist); flist->xbf_count++; } @@ -617,14 +605,10 @@ xfs_bmap_add_free( */ void xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ + struct xfs_bmap_free *flist, /* free item list header */ + struct xfs_bmap_free_item *free) /* list item to be freed */ { - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; + list_del(&free->xbfi_list); flist->xbf_count--; kmem_zone_free(xfs_bmap_free_item_zone, free); } @@ -634,17 +618,16 @@ xfs_bmap_del_free( */ void xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ + struct xfs_bmap_free *flist) /* list of bmap_free_items */ { - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; + struct xfs_bmap_free_item *free; /* free list item */ if (flist->xbf_count == 0) return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); + xfs_bmap_del_free(flist, free); } ASSERT(flist->xbf_count == 0); } @@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -1121,15 +1104,14 @@ xfs_bmap_add_attrfork( mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : @@ -5074,8 +5056,8 @@ xfs_bmap_del_extent( * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); + xfs_bmap_add_free(mp, flist, del->br_startblock, + del->br_blockcount); /* * Adjust inode # blocks in the file. */ @@ -6026,13 +6008,10 @@ xfs_bmap_split_extent( xfs_fsblock_t firstfsb; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 423a34e832bd..f1f3ae6c0a3f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -62,12 +62,12 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -typedef struct xfs_bmap_free_item +struct xfs_bmap_free_item { xfs_fsblock_t xbfi_startblock;/* starting fs block number */ xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ -} xfs_bmap_free_item_t; + struct list_head xbfi_list; +}; /* * Header for free extent list. @@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item */ typedef struct xfs_bmap_free { - xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + struct list_head xbf_flist; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; @@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { - ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ - (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); + INIT_LIST_HEAD(&flp->xbf_flist); + flp->xbf_count = 0; + flp->xbf_low = 0; + *fbp = NULLFSBLOCK; } /* @@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist, + xfs_fsblock_t bno, xfs_filblks_t len); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6282f6e708af..db0c71e470c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -526,7 +526,7 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1ce770f..07eeb0b4ca74 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -543,12 +543,12 @@ xfs_btree_ptr_addr( */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( - struct xfs_btree_cur *cur) + struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); - return (struct xfs_btree_block *)ifp->if_broot; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* @@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify( return true; } + +/* + * Calculate the number of btree levels needed to store a given number of + * records in a short-format btree. + */ +uint +xfs_btree_compute_maxlevels( + struct xfs_mount *mp, + uint *limits, + unsigned long len) +{ + uint level; + unsigned long maxblocks; + + maxblocks = (len + limits[0] - 1) / limits[0]; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + limits[1] - 1) / limits[1]; + return level; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2e874be70209..785a99682159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, + unsigned long len); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 097bf7717d80..0f1f165f4048 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -356,7 +356,6 @@ xfs_da3_split( struct xfs_da_state_blk *newblk; struct xfs_da_state_blk *addblk; struct xfs_da_intnode *node; - struct xfs_buf *bp; int max; int action = 0; int error; @@ -397,7 +396,9 @@ xfs_da3_split( break; } /* - * Entry wouldn't fit, split the leaf again. + * Entry wouldn't fit, split the leaf again. The new + * extrablk will be consumed by xfs_da3_node_split if + * the node is split. */ state->extravalid = 1; if (state->inleaf) { @@ -446,6 +447,14 @@ xfs_da3_split( return 0; /* + * xfs_da3_node_split() should have consumed any extra blocks we added + * during a double leaf split in the attr fork. This is guaranteed as + * we can't be here if the attr fork only has a single leaf block. + */ + ASSERT(state->extravalid == 0 || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + /* * Split the root node. */ ASSERT(state->path.active == 0); @@ -457,43 +466,33 @@ xfs_da3_split( } /* - * Update pointers to the node which used to be block 0 and - * just got bumped because of the addition of a new root node. - * There might be three blocks involved if a double split occurred, - * and the original block 0 could be at any position in the list. + * Update pointers to the node which used to be block 0 and just got + * bumped because of the addition of a new root node. Note that the + * original block 0 could be at any position in the list of blocks in + * the tree. * - * Note: the magic numbers and sibling pointers are in the same - * physical place for both v2 and v3 headers (by design). Hence it - * doesn't matter which version of the xfs_da_intnode structure we use - * here as the result will be the same using either structure. + * Note: the magic numbers and sibling pointers are in the same physical + * place for both v2 and v3 headers (by design). Hence it doesn't matter + * which version of the xfs_da_intnode structure we use here as the + * result will be the same using either structure. */ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } addblk->bp = NULL; return 0; diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 9d624a622946..f1e8d4dbb600 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -40,8 +40,7 @@ xfs_dir2_sf_entsize( int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ return count; } @@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) + __uint8_t *from) { if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + return get_unaligned_be64(from) & 0x00ffffffffffffffULL; else - return get_unaligned_be32(&from->i4.i); + return get_unaligned_be32(from); } static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, + __uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); + put_unaligned_be64(ino, to); else - put_unaligned_be32(ino, &to->i4.i); + put_unaligned_be32(ino, to); } static xfs_ino_t xfs_dir2_sf_get_parent_ino( struct xfs_dir2_sf_hdr *hdr) { - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); + return xfs_dir2_sf_get_ino(hdr, hdr->parent); } static void @@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); + xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); } /* @@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); } static void @@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); } static xfs_ino_t @@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); } static void @@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); } diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 8d4d8bce41bf..685f23b67056 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t; typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* * Offset in data space of a data entry. */ typedef __uint32_t xfs_dir2_dataptr_t; @@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t; */ typedef __uint32_t xfs_dir2_db_t; -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; +#define XFS_INO32_SIZE 4 +#define XFS_INO64_SIZE 8 +#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE) -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; #define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) /* @@ -246,39 +228,38 @@ typedef union { typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; + __uint8_t parent[8]; /* parent dir inode number */ +} __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 offset[2]; /* saved offset */ __u8 name[]; /* name, variable size */ /* * A single byte containing the file type field follows the inode * number for version 3 directory entries. * - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. + * A 64-bit or 32-bit inode number follows here, at a variable offset + * after the name. */ -} __arch_pack xfs_dir2_sf_entry_t; +} xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); + (i8count == 0) * XFS_INO64_DIFF; } static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { - return get_unaligned_be16(&sfep->offset.i); + return get_unaligned_be16(sfep->offset); } static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { - put_unaligned_be16(off, &sfep->offset.i); + put_unaligned_be16(off, sfep->offset); } static inline struct xfs_dir2_sf_entry * diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 974d62e677f4..c6809ff41197 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -126,13 +126,12 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = xfs_dir2_sf_hdr_size(i8count) + /* header */ - count + /* namelen */ - count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ - namelen + /* name */ - (i8count ? /* inumber */ - (uint)sizeof(xfs_dir2_ino8_t) * count : - (uint)sizeof(xfs_dir2_ino4_t) * count); + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count * 3 * sizeof(u8) + /* namelen + offset */ + namelen + /* name */ + (i8count ? /* inumber */ + count * XFS_INO64_SIZE : + count * XFS_INO32_SIZE); if (size > XFS_IFORK_DSIZE(dp)) return size; /* size value is a failure */ } @@ -257,15 +256,12 @@ xfs_dir2_block_to_sf( * * Convert the inode to local format and copy the data in. */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); + xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; - memcpy(dp->i_df.if_u1.if_data, dst, size); - dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); @@ -322,10 +318,7 @@ xfs_dir2_sf_addname( /* * Yes, adjust the inode size. old count + (parent + new) */ - incr_isize += - (sfp->count + 2) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + incr_isize += (sfp->count + 2) * XFS_INO64_DIFF; objchange = 1; } @@ -900,11 +893,7 @@ xfs_dir2_sf_replace( int error; /* error return value */ int newsize; /* new inode size */ - newsize = - dp->i_df.if_bytes + - (sfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ @@ -1025,10 +1014,7 @@ xfs_dir2_sf_toino4( /* * Compute the new inode size. */ - newsize = - oldsize - - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1051,7 +1037,7 @@ xfs_dir2_sf_toino4( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); @@ -1101,10 +1087,7 @@ xfs_dir2_sf_toino8( /* * Compute the new inode size (nb: entry count + 1 for parent) */ - newsize = - oldsize + - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1127,7 +1110,7 @@ xfs_dir2_sf_toino8( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index dc97eb21af07..adb204d40f22 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; * with the crc feature bit, and all accesses to them must be conditional on * that flag. */ +/* short form block header */ +struct xfs_btree_block_shdr { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; +}; + +/* long form block header */ +struct xfs_btree_block_lhdr { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ +}; + struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ + struct xfs_btree_block_shdr s; + struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* size of a short form block */ +#define XFS_BTREE_SBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_shdr, bb_blkno)) +/* size of a long form block */ +#define XFS_BTREE_LBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) +#define XFS_BTREE_SBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_shdr)) +#define XFS_BTREE_LBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d01bd9f..f5ec9c5ccae6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -521,12 +521,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 22297f9b0fd5..4b1e408169a8 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks); return; } @@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -2395,20 +2394,11 @@ void xfs_ialloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> - XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_inobt_mnr[0]; - minnoderecs = mp->m_inobt_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_in_maxlevels = level; + uint inodes; + + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + inodes); } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 11faf7df14c8..bbcc8c7a44b3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -231,6 +231,48 @@ xfs_iformat_fork( return error; } +void +xfs_init_local_fork( + struct xfs_inode *ip, + int whichfork, + const void *data, + int size) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int mem_size = size, real_size = 0; + bool zero_terminate; + + /* + * If we are using the local fork to store a symlink body we need to + * zero-terminate it so that we can pass it back to the VFS directly. + * Overallocate the in-memory fork by one for that and add a zero + * to terminate it below. + */ + zero_terminate = S_ISLNK(VFS_I(ip)->i_mode); + if (zero_terminate) + mem_size++; + + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(mem_size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + + if (size) { + memcpy(ifp->if_u1.if_data, data, size); + if (zero_terminate) + ifp->if_u1.if_data[size] = '\0'; + } + + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ifp->if_flags |= XFS_IFINLINE; +} + /* * The file is in-lined in the on-disk inode. * If it fits into if_inline_data, then copy @@ -248,8 +290,6 @@ xfs_iformat_local( int whichfork, int size) { - xfs_ifork_t *ifp; - int real_size; /* * If the size is unreasonable, then something @@ -265,22 +305,8 @@ xfs_iformat_local( ip->i_mount, dip); return -EFSCORRUPTED; } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; + + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -516,7 +542,6 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -660,7 +685,6 @@ xfs_idata_realloc( ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, real_size, - ifp->if_real_bytes, KM_SLEEP | KM_NOFS); } } else { @@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct( if (rnew_size != ifp->if_real_bytes) { ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); + rnew_size, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect( if (new_size == 0) { xfs_iext_destroy(ifp); } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); + ifp->if_u1.if_ext_irec = + kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); } } @@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct( } /* + * Remove all records from the indirection array. + */ +STATIC void +xfs_iext_irec_remove_all( + struct xfs_ifork *ifp) +{ + int nlists; + int i; + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = 0; i < nlists; i++) + kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; +} + +/* * Free incore file extents. */ void @@ -1504,14 +1544,7 @@ xfs_iext_destroy( xfs_ifork_t *ifp) /* inode fork pointer */ { if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; + xfs_iext_irec_remove_all(ifp); } else if (ifp->if_real_bytes) { kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7d3b1ed6dcbe..f95e072ae646 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); +void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index d54a8018b079..e8f49c029ff0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,6 +212,11 @@ typedef struct xfs_trans_header { #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ /* + * The only type valid for th_type in CIL-enabled file system logs: + */ +#define XFS_TRANS_CHECKPOINT 40 + +/* * Log item types. */ #define XFS_LI_EFI 0x1236 diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044e24e4..e2e1106c9fad 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a53eaa349f4..12ca86778e02 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -838,12 +838,10 @@ xfs_sync_sb( struct xfs_trans *tp; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, + XFS_TRANS_NO_WRITECOUNT, &tp); + if (error) return error; - } xfs_log_sb(tp); if (wait) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 81ac870834da..16002b5ec4eb 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; /* - * Transaction types. Used to distinguish types of buffers. These never reach - * the log. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_SB_CHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_FSYNC_TS 35 -#define XFS_TRANS_GROWFSRT_ALLOC 36 -#define XFS_TRANS_GROWFSRT_ZERO 37 -#define XFS_TRANS_GROWFSRT_FREE 38 -#define XFS_TRANS_SWAPEXT 39 -#define XFS_TRANS_CHECKPOINT 40 -#define XFS_TRANS_ICREATE 41 -#define XFS_TRANS_CREATE_TMPFILE 42 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_ICREATE, "ICREATE" }, \ - { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* * This structure is used to track log items associated with * a transaction. It points to the log item and keeps some * flags to track the state of the log item. It also tracks @@ -181,8 +84,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ +#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */ + /* * Field values for xfs_trans_mod_sb. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c535887c60a8..7575cfc3ad15 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -84,23 +84,80 @@ xfs_find_bdev_for_inode( } /* - * We're now finished for good with this ioend structure. - * Update the page state via the associated buffer_heads, - * release holds on the inode and bio, and finally free - * up memory. Do not use the ioend after this. + * We're now finished for good with this page. Update the page state via the + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + * + * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last + * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or + * the page at all, as we may be racing with memory reclaim and it can free both + * the bufferhead chain and the page as it will see the page as clean and + * unused. + */ +static void +xfs_finish_page_writeback( + struct inode *inode, + struct bio_vec *bvec, + int error) +{ + unsigned int end = bvec->bv_offset + bvec->bv_len - 1; + struct buffer_head *head, *bh, *next; + unsigned int off = 0; + unsigned int bsize; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT(end < PAGE_SIZE); + ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + + bh = head = page_buffers(bvec->bv_page); + + bsize = bh->b_size; + do { + next = bh->b_this_page; + if (off < bvec->bv_offset) + goto next_bh; + if (off > end) + break; + bh->b_end_io(bh, !error); +next_bh: + off += bsize; + } while ((bh = next) != head); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. */ STATIC void xfs_destroy_ioend( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend, + int error) { - struct buffer_head *bh, *next; + struct inode *inode = ioend->io_inode; + struct bio *last = ioend->io_bio; + struct bio *bio, *next; - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - bh->b_end_io(bh, !ioend->io_error); - } + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; + int i; - mempool_free(ioend, xfs_ioend_pool); + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. + */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; + + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bvec, bio, i) + xfs_finish_page_writeback(inode, bvec, error); + + bio_put(bio); + } } /* @@ -120,13 +177,9 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } ioend->io_append_trans = tp; @@ -174,7 +227,8 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend) + struct xfs_ioend *ioend, + int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_trans *tp = ioend->io_append_trans; @@ -188,53 +242,32 @@ xfs_setfilesize_ioend( __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ - if (ioend->io_error) { + if (error) { xfs_trans_cancel(tp); - return ioend->io_error; + return error; } return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* - * Schedule IO completion handling on the final put of an ioend. - * - * If there is no work to do we might as well call it a day and free the - * ioend right now. - */ -STATIC void -xfs_finish_ioend( - struct xfs_ioend *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - - if (ioend->io_type == XFS_IO_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else - xfs_destroy_ioend(ioend); - } -} - -/* * IO write completion. */ STATIC void xfs_end_io( struct work_struct *work) { - xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); - int error = 0; + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = ioend->io_bio->bi_error; /* * Set an error if the mount has shut down and proceed with end I/O * processing so it can perform whatever cleanups are necessary. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - ioend->io_error = -EIO; + error = -EIO; /* * For unwritten extents we need to issue transactions to convert a @@ -244,55 +277,33 @@ xfs_end_io( * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (ioend->io_error) + if (error) goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend); + error = xfs_setfilesize_ioend(ioend, error); } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: - if (error) - ioend->io_error = error; - xfs_destroy_ioend(ioend); + xfs_destroy_ioend(ioend, error); } -/* - * Allocate and initialise an IO completion structure. - * We need to track unwritten extent write completion here initially. - * We'll need to extend this for updating the ondisk inode size later - * (vs. incore size). - */ -STATIC xfs_ioend_t * -xfs_alloc_ioend( - struct inode *inode, - unsigned int type) +STATIC void +xfs_end_bio( + struct bio *bio) { - xfs_ioend_t *ioend; - - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); - - /* - * Set the count to 1 initially, which will prevent an I/O - * completion callback from happening before we have started - * all the I/O from calling the completion routine too early. - */ - atomic_set(&ioend->io_remaining, 1); - ioend->io_error = 0; - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; - ioend->io_inode = inode; - ioend->io_buffer_head = NULL; - ioend->io_buffer_tail = NULL; - ioend->io_offset = 0; - ioend->io_size = 0; - ioend->io_append_trans = NULL; + struct xfs_ioend *ioend = bio->bi_private; + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - INIT_WORK(&ioend->io_work, xfs_end_io); - return ioend; + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend, bio->bi_error); } STATIC int @@ -364,50 +375,6 @@ xfs_imap_valid( offset < imap->br_startoff + imap->br_blockcount; } -/* - * BIO completion handler for buffered IO. - */ -STATIC void -xfs_end_bio( - struct bio *bio) -{ - xfs_ioend_t *ioend = bio->bi_private; - - if (!ioend->io_error) - ioend->io_error = bio->bi_error; - - /* Toss bio and pass work off to an xfsdatad thread */ - bio->bi_private = NULL; - bio->bi_end_io = NULL; - bio_put(bio); - - xfs_finish_ioend(ioend); -} - -STATIC void -xfs_submit_ioend_bio( - struct writeback_control *wbc, - xfs_ioend_t *ioend, - struct bio *bio) -{ - atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; - bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); -} - -STATIC struct bio * -xfs_alloc_ioend_bio( - struct buffer_head *bh) -{ - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - - ASSERT(bio->bi_private == NULL); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - return bio; -} - STATIC void xfs_start_buffer_writeback( struct buffer_head *bh) @@ -452,28 +419,36 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for an ioend. We are only passed a single ioend at a - * time; the caller is responsible for chaining prior to submission. + * Submit the bio for an ioend. We are passed an ioend with a bio attached to + * it, and we submit that bio. The ioend may be used for multiple bio + * submissions, so we only want to allocate an append transaction for the ioend + * once. In the case of multiple bio submission, each bio will take an IO + * reference to the ioend to ensure that the ioend completion is only done once + * all bios have been submitted and the ioend is really done. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the ioend chain rather - * than submit it to IO. This typically only happens on a filesystem shutdown. + * and unlocked them. In this situation, we need to fail the bio and ioend + * rather than submit it to IO. This typically only happens on a filesystem + * shutdown. */ STATIC int xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend, + struct xfs_ioend *ioend, int status) { - struct buffer_head *bh; - struct bio *bio; - sector_t lastblock = 0; - /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend) && + !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); + + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = xfs_end_bio; + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -481,33 +456,73 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_error = status; - xfs_finish_ioend(ioend); + ioend->io_bio->bi_error = status; + bio_endio(ioend->io_bio); return status; } - bio = NULL; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + submit_bio(ioend->io_bio); + return 0; +} - if (!bio) { -retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static void +xfs_init_bio_from_bh( + struct bio *bio, + struct buffer_head *bh) +{ + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; +} - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static struct xfs_ioend * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type, + xfs_off_t offset, + struct buffer_head *bh) +{ + struct xfs_ioend *ioend; + struct bio *bio; - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - return 0; + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + xfs_init_bio_from_bh(bio, bh); + + ioend = container_of(bio, struct xfs_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + INIT_WORK(&ioend->io_work, xfs_end_io); + ioend->io_append_trans = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to do perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in xfs_destroy_ioend(). + */ +static void +xfs_chain_bio( + struct xfs_ioend *ioend, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + xfs_init_bio_from_bh(new, bh); + + bio_chain(ioend->io_bio, new); + bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + submit_bio(ioend->io_bio); + ioend->io_bio = new; } /* @@ -523,27 +538,24 @@ xfs_add_to_ioend( struct buffer_head *bh, xfs_off_t offset, struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1 || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - struct xfs_ioend *new; - if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - - new = xfs_alloc_ioend(inode, wpc->io_type); - new->io_offset = offset; - new->io_buffer_head = bh; - new->io_buffer_tail = bh; - wpc->ioend = new; - } else { - wpc->ioend->io_buffer_tail->b_private = bh; - wpc->ioend->io_buffer_tail = bh; + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); } - bh->b_private = NULL; + /* + * If the buffer doesn't fit into the bio we need to allocate a new + * one. This shouldn't happen more than once for a given buffer. + */ + while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) + xfs_chain_bio(wpc->ioend, wbc, bh); + wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; xfs_start_buffer_writeback(bh); @@ -803,7 +815,7 @@ xfs_writepage_map( lock_buffer(bh); if (wpc->io_type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); + xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); count++; } @@ -1038,6 +1050,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) @@ -1141,6 +1167,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1148,22 +1176,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1182,37 +1202,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1233,9 +1235,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1248,7 +1248,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1277,14 +1277,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; @@ -1334,7 +1327,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1391,13 +1384,10 @@ xfs_end_io_direct_write( trace_xfs_end_io_direct_write_append(ip, offset, size); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); - return error; - } - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, + &tp); + if (!error) + error = xfs_setfilesize(ip, tp, offset, size); } return error; @@ -1408,234 +1398,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); -} - -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. + * We just need the method present so that open/fcntl allow direct I/O. */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; + return -EINVAL; } STATIC sector_t @@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b4421177b68d..bf2d9a141a73 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern mempool_t *xfs_ioend_pool; +extern struct bio_set *xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. @@ -37,22 +37,19 @@ enum { { XFS_IO_OVERWRITE, "overwrite" } /* - * xfs_ioend struct manages large extent writes for XFS. - * It can manage several multi-page bio's at once. + * Structure for buffered I/O completions. */ -typedef struct xfs_ioend { +struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ - int io_error; /* I/O error code */ - atomic_t io_remaining; /* hold count */ struct inode *io_inode; /* file being written to */ - struct buffer_head *io_buffer_head;/* buffer linked list head */ - struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ -} xfs_ioend_t; + struct bio *io_bio; /* bio being built */ + struct bio io_inline_bio; /* MUST BE LAST! */ +}; extern const struct address_space_operations xfs_address_space_operations; @@ -63,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index dd4824589470..e3da5d448bcf 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ +/* Return 0 on success, or -errno; other state communicated via *context */ typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int, unsigned char *); + unsigned char *, int, int); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ @@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context { int firstu; /* first used byte in buffer */ int flags; /* from VOP call */ int resynch; /* T/F: resynch with cursor */ - int put_value; /* T/F: need value for listent */ put_listent_func_t put_listent; /* list output fmt function */ int index; /* index into output buffer */ } xfs_attr_list_context_t; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2bb959ada45b..be0b79d8900f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) @@ -405,21 +405,11 @@ xfs_attr_inactive( goto out_destroy_fork; xfs_iunlock(dp, lock_mode); - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ lock_mode = 0; - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans); if (error) - goto out_cancel; + goto out_destroy_fork; lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 4fa14820e2e2..25e76cd6c053 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; @@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sfe->flags, sfe->nameval, (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - + (int)sfe->valuelen); + if (error) + return error; /* * Either search callback finished early or * didn't fit it all in the buffer after all. */ if (context->seen_enough) break; - - if (error) - return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); @@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->flags, sbp->name, sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); + sbp->valuelen); if (error) { kmem_free(sbuf); return error; @@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int( */ retval = 0; for (; i < ichdr.count; entry++, i++) { + char *name; + int namelen, valuelen; + if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int( continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr3_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; + xfs_attr_leaf_name_local_t *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + valuelen = be16_to_cpu(name_loc->valuelen); } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr3_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.geo = context->dp->i_mount->m_attr_geo; - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.rmtvaluelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = xfs_attr3_rmt_blocks( - args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (!retval) - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; + xfs_attr_leaf_name_remote_t *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + valuelen = be32_to_cpu(name_rmt->valuelen); } + + retval = context->put_listent(context, entry->flags, + name, namelen, valuelen); + if (retval) + break; if (context->seen_enough) break; cursor->offset++; @@ -551,8 +519,7 @@ xfs_attr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -581,7 +548,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 1; + return 0; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3b6309865c65..cd4a850564f2 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,28 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); +} +/* Sort bmap items by AG. */ +static int +xfs_bmap_free_list_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_bmap_free_item *ra; + struct xfs_bmap_free_item *rb; + + ra = container_of(a, struct xfs_bmap_free_item, xbfi_list); + rb = container_of(b, struct xfs_bmap_free_item, xbfi_list); + return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock); } /* @@ -106,14 +116,15 @@ xfs_bmap_finish( int error; /* error return value */ int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ - struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) return 0; + list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) + list_for_each_entry(free, &flist->xbf_flist, xbfi_list) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); @@ -132,9 +143,7 @@ xfs_bmap_finish( if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + SHUTDOWN_META_IO_ERROR); } return error; } @@ -145,15 +154,15 @@ xfs_bmap_finish( * on error. */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); if (error) return error; - xfs_bmap_del_free(flist, NULL, free); + xfs_bmap_del_free(flist, free); } return 0; @@ -416,7 +425,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. */ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ @@ -806,7 +815,7 @@ xfs_bmap_punch_delalloc_range( if (error) break; - ASSERT(!flist.xbf_count && !flist.xbf_first); + ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist)); next_block: start_fsb++; remaining--; @@ -900,19 +909,15 @@ xfs_free_eofblocks( * Free them up now by truncating the file to * its current size. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return -EAGAIN; - } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1037,9 +1042,9 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + /* * Check for running out of space */ @@ -1048,7 +1053,6 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1101,99 +1105,120 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) { - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; + xfs_trans_ijoin(tp, ip, 0); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &free_list, done); + if (error) + goto out_bmap_cancel; - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) + goto out_bmap_cancel; - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; } - return error; + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; } int @@ -1202,24 +1227,10 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1227,143 +1238,45 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to ensure + * to zero the whole extent, otherwise we just need to take of block + * boundaries, and xfs_bunmapi will handle the rest. */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); + return error; } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); - - /* - * check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); - break; + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out: - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; + /* + * Now that we've unmap all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created. + */ + return xfs_zero_range(ip, offset, len, NULL); } /* @@ -1482,19 +1395,16 @@ xfs_shift_file_space( } while (!error && !done) { - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into * hole we found that adjacent extents can be merged which * may lead to freeing of a block during record update. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) break; - } xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, @@ -1747,12 +1657,9 @@ xfs_swap_extents( if (error) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_unlock; - } /* * Lock and join the inodes to the tansaction so that transaction commit diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a1dfb4..f20071432ca6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *prev, struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9a2191b91137..47a318ce82e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -80,6 +80,47 @@ xfs_buf_vmap_len( } /* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for sychronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg. + */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + +/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -102,6 +143,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -815,7 +864,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; @@ -866,63 +916,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. + */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -944,10 +1016,12 @@ xfs_buf_trylock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) + if (locked) { XB_SET_OWNER(bp); - - trace_xfs_buf_trylock(bp, _RET_IP_); + trace_xfs_buf_trylock(bp, _RET_IP_); + } else { + trace_xfs_buf_trylock_fail(bp, _RET_IP_); + } return locked; } @@ -1100,22 +1174,18 @@ xfs_bwrite( return error; } -STATIC void +static void xfs_buf_bio_end_io( struct bio *bio) { - xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) { - spin_lock(&bp->b_lock); - if (!bp->b_io_error) - bp->b_io_error = bio->bi_error; - spin_unlock(&bp->b_lock); - } + if (bio->bi_error) + cmpxchg(&bp->b_io_error, 0, bio->bi_error); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1131,7 +1201,8 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int rw) + int op, + int op_flags) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1161,16 +1232,14 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; + nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - + bio_set_op_attrs(bio, op, op_flags); for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1194,7 +1263,7 @@ next_chunk: flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); } - submit_bio(rw, bio); + submit_bio(bio); if (size) goto next_chunk; } else { @@ -1214,7 +1283,8 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int rw; + int op; + int op_flags = 0; int offset; int size; int i; @@ -1233,14 +1303,13 @@ _xfs_buf_ioapply( bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; if (bp->b_flags & XBF_WRITE) { + op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; + op_flags = WRITE_SYNC; if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; + op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; + op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If @@ -1270,13 +1339,14 @@ _xfs_buf_ioapply( } } } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; + op = REQ_OP_READ; + op_flags = REQ_RAHEAD; } else { - rw = READ; + op = REQ_OP_READ; } /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + op_flags |= REQ_META; /* * Walk all the vectors issuing IO on them. Set up the initial offset @@ -1288,7 +1358,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_io_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); if (bp->b_error) break; if (size <= 0) @@ -1343,6 +1413,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1528,13 +1599,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); drain_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1631,6 +1708,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1695,6 +1774,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1778,18 +1860,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1812,25 +1909,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. */ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1853,8 +1946,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1869,15 +1961,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4eb89bd4ee73..1c2e52b2d926 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ @@ -62,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -115,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; @@ -183,6 +188,26 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ int b_error; /* error code on I/O */ + + /* + * async write failure retry count. Initialised to zero on the first + * failure, then when it exceeds the maximum configured without a + * success the write is considered to be failed permanently and the + * iodone handler will take appropriate action. + * + * For retry timeouts, we record the jiffie of the first failure. This + * means that we can change the retry timeout for buffers already under + * I/O and thus avoid getting stuck in a retry loop with a long timeout. + * + * last_error is used to ensure that we are getting repeated errors, not + * different errors. e.g. a block device might change ENOSPC to EIO when + * a failure timeout occurs, so we want to re-initialise the error + * retry behaviour appropriately when that happens. + */ + int b_retries; + unsigned long b_first_retry_time; /* in jiffies */ + int b_last_error; + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 99e91a0e554e..e455f9098d49 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } @@ -949,6 +957,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1042,35 +1051,22 @@ xfs_buf_do_callbacks( } } -/* - * This is the iodone() function for buffers which have had callbacks - * attached to them by xfs_buf_attach_iodone(). It should remove each - * log item from the buffer's list and call the callback of each in turn. - * When done, the buffer's fsprivate field is set to NULL and the buffer - * is unlocked with a call to iodone(). - */ -void -xfs_buf_iodone_callbacks( +static bool +xfs_buf_iodone_callback_error( struct xfs_buf *bp) { struct xfs_log_item *lip = bp->b_fspriv; struct xfs_mount *mp = lip->li_mountp; static ulong lasttime; static xfs_buftarg_t *lasttarg; - - if (likely(!bp->b_error)) - goto do_callbacks; + struct xfs_error_cfg *cfg; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_item_iodone(bp, _RET_IP_); - goto do_callbacks; - } + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1079,45 +1075,93 @@ xfs_buf_iodone_callbacks( } lasttarg = bp->b_target; + /* synchronous writes will have callers process the error */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + ASSERT(bp->b_iodone != NULL); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the - * error. Clear the error state and write the buffer out again. - * - * XXX: This helps against transient write errors, but we need to find - * a way to shut the filesystem down if the writes keep failing. - * - * In practice we'll shut the filesystem down soon as non-transient - * errors tend to affect the whole device and a failing log write - * will make us give up. But we really ought to do better here. + * error. If this is the first failure of this type, clear the error + * state and write the buffer out again. This means we always retry an + * async write failure at least once, but we also need to set the buffer + * up to behave correctly now for repeated failures. */ - if (bp->b_flags & XBF_ASYNC) { - ASSERT(bp->b_iodone != NULL); + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || + bp->b_last_error != bp->b_error) { + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return true; + } - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + /* + * Repeated failure on an async write. Take action according to the + * error configuration we have been set up to use. + */ - xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + goto permanent_error; + if (cfg->retry_timeout && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + goto permanent_error; - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_submit(bp); - } else { - xfs_buf_relse(bp); - } + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; - return; - } + /* still a transient error, higher layers will retry */ + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; /* - * If the write of the buffer was synchronous, we want to make - * sure to return the error to the caller of xfs_bwrite(). + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. */ +permanent_error: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +/* + * This is the iodone() function for buffers which have had callbacks attached + * to them by xfs_buf_attach_iodone(). We need to iterate the items on the + * callback list, mark the buffer as having no more callbacks and then push the + * buffer through IO completion processing. + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + /* + * If there is an error, process it. Some errors require us + * to run callbacks after failure processing is done so we + * detect that and take appropriate action. + */ + if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + return; + + /* + * Successful IO or permanent error. Either way, we can clear the + * retry state here in preparation for the next error that may occur. + */ + bp->b_last_error = 0; + bp->b_retries = 0; -do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 316b2a1bdba5..ccb0811963b2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -74,6 +74,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); @@ -614,11 +615,10 @@ xfs_qm_dqread( trace_xfs_dqread(dqp); if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) - goto error1; + goto error0; } /* @@ -692,7 +692,7 @@ error0: * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ -int +static int xfs_dq_get_next_id( xfs_mount_t *mp, uint type, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff94e78f..2c7a1629e064 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a98fac5..ed7ee4e8af73 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, } int -xfs_errortag_add(int error_tag, xfs_mount_t *mp) +xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) { int i; int len; int64_t fsid; + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 4ed3042a0f16..2e4f67f68856 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa0153214f9..ab779460ecbf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,6 +40,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85ce3032f815..ed95e5bb04e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -80,61 +81,17 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +xfs_zero_range( + struct xfs_inode *ip, + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { - struct page *page; - struct address_space *mapping; - int status = 0; - - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } - pos += bytes; - count -= bytes; - } while (count); - - return status; + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int @@ -145,12 +102,10 @@ xfs_update_prealloc_flags( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -284,48 +239,35 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + loff_t isize = i_size_read(inode); + size_t count = iov_iter_count(to); + struct iov_iter data; + struct xfs_buftarg *target; ssize_t ret = 0; - int ioflags = 0; - xfs_fsize_t n; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) - return 0; - return -EINVAL; - } - } + if (!count) + return 0; /* skip atime */ - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; - - if (n < size) - size = n; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; + } /* * Locking is a bit tricky here. If we take an exclusive lock for direct @@ -338,7 +280,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -353,8 +295,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -365,20 +307,95 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + data = *to; + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +static noinline ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_aio_read(iocb, to); + else + ret = xfs_file_buffered_aio_read(iocb, to); + if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -391,18 +408,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -426,49 +439,6 @@ out: } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); -} - -/* * Zero any on disk space between the current EOF and the new, larger EOF. * * This handles the normal case of zeroing the remainder of the last block in @@ -486,94 +456,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* @@ -724,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -764,7 +650,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -791,10 +677,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -811,10 +699,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + ASSERT(ret < 0 || ret == count); + return ret; +} + +static noinline ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -841,9 +789,8 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -897,7 +844,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1553,9 +1502,9 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1587,7 +1536,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1624,8 +1573,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ee3aaa0a5317..7191c3878b4a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -198,14 +198,10 @@ xfs_growfs_data_private( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } /* * Write new AG headers to disk. Non-transactional, but written @@ -243,8 +239,8 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - agf->agf_flfirst = 0; - agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); @@ -671,8 +667,11 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta, fdblks_delta; + __int64_t lcounter, delta; + __int64_t fdblks_delta = 0; __uint64_t request; + __int64_t free; + int error = 0; /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { @@ -686,24 +685,23 @@ xfs_reserve_blocks( request = *inval; /* - * With per-cpu counters, this becomes an interesting - * problem. we needto work out if we are freeing or allocation - * blocks first, then we can do the modification as necessary. + * With per-cpu counters, this becomes an interesting problem. we need + * to work out if we are freeing or allocation blocks first, then we can + * do the modification as necessary. * - * We do this under the m_sb_lock so that if we are near - * ENOSPC, we will hold out any changes while we work out - * what to do. This means that the amount of free space can - * change while we do this, so we need to retry if we end up - * trying to reserve more space than is available. + * We do this under the m_sb_lock so that if we are near ENOSPC, we will + * hold out any changes while we work out what to do. This means that + * the amount of free space can change while we do this, so we need to + * retry if we end up trying to reserve more space than is available. */ -retry: spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, - * then move any unused blocks back to the free pool. + * then move any unused blocks back to the free pool. Modify the resblks + * counters directly since we shouldn't have any problems unreserving + * space. */ - fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ @@ -711,54 +709,67 @@ retry: mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; - } else { - __int64_t free; + if (fdblks_delta) { + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } + + goto out; + } + /* + * If the request is larger than the current reservation, reserve the + * blocks before we update the reserve counters. Sample m_fdblocks and + * perform a partial reservation if the request exceeds free space. + */ + error = -ENOSPC; + do { free = percpu_counter_sum(&mp->m_fdblocks) - XFS_ALLOC_SET_ASIDE(mp); if (!free) - goto out; /* ENOSPC and fdblks_delta = 0 */ + break; delta = request - mp->m_resblks; lcounter = free - delta; - if (lcounter < 0) { + if (lcounter < 0) /* We can't satisfy the request, just get what we can */ - mp->m_resblks += free; - mp->m_resblks_avail += free; - fdblks_delta = -free; - } else { - fdblks_delta = -delta; - mp->m_resblks = request; - mp->m_resblks_avail += delta; - } - } -out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); + fdblks_delta = free; + else + fdblks_delta = delta; - if (fdblks_delta) { /* - * If we are putting blocks back here, m_resblks_avail is - * already at its max so this will put it in the free pool. - * - * If we need space, we'll either succeed in getting it - * from the free block count or we'll get an enospc. If - * we get a ENOSPC, it means things changed while we were - * calculating fdblks_delta and so we should try again to - * see if there is anything left to reserve. + * We'll either succeed in getting space from the free block + * count or we'll get an ENOSPC. If we get a ENOSPC, it means + * things changed while we were calculating fdblks_delta and so + * we should try again to see if there is anything left to + * reserve. * * Don't set the reserved flag here - we don't want to reserve * the extra reserve blocks from the reserve..... */ - int error; - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); - if (error == -ENOSPC) - goto retry; + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } while (error == -ENOSPC); + + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error && fdblks_delta) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; } - return 0; + +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + + spin_unlock(&mp->m_sb_lock); + return error; } int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bf2d60749278..fb39a66914dd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,9 +37,6 @@ #include <linux/kthread.h> #include <linux/freezer.h> -STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, - struct xfs_perag *pag, struct xfs_inode *ip); - /* * Allocate and initialise an xfs_inode. */ @@ -94,13 +91,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -118,6 +108,25 @@ xfs_inode_free( ip->i_itemp = NULL; } + kmem_zone_free(xfs_inode_zone, ip); +} + +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -129,12 +138,123 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); + __xfs_inode_free(ip); +} - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +xfs_perag_set_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (pag->pag_ici_reclaimable++) + return; + + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(mp); + + trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + +static void +xfs_perag_clear_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (--pag->pag_ici_reclaimable) + return; + + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_set_reclaim_tag(pag); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +xfs_inode_clear_reclaim_tag( + struct xfs_perag *pag, + xfs_ino_t ino) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_clear_reclaim_tag(pag); } /* @@ -264,7 +384,7 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); @@ -645,7 +765,7 @@ restart: * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ -STATIC void +void xfs_queue_eofblocks( struct xfs_mount *mp) { @@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -static void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -STATIC void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. */ @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster has + * locked both the XFS_ILOCK and the flush lock that it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,9 +1028,9 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); + xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91c32cb..05bac99bef75 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); +void xfs_queue_eofblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 96f606deee31..8825bcfd314c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +667,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +740,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1030,7 +1022,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, 0); + code = xfs_trans_roll(&tp, NULL); if (committed != NULL) *committed = 1; @@ -1085,7 +1077,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1096,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1161,11 +1153,9 @@ xfs_create( rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_create; - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } /* @@ -1174,20 +1164,19 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; - + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -1337,17 +1326,16 @@ xfs_create_tmpfile( return error; resblks = XFS_IALLOC_SPACE_RES(mp); - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); - tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_reserve(tp, tres, resblks, 0); + + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1432,15 +1420,14 @@ xfs_link( if (error) goto std_return; - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); } if (error) - goto error_return; + goto std_return; xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1710,11 +1697,9 @@ xfs_inactive_truncate( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); return error; } @@ -1764,8 +1749,6 @@ xfs_inactive_ifree( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - /* * The ifree transaction might need to allocate blocks for record * insertion to the finobt. We don't want to fail here at ENOSPC, so @@ -1781,9 +1764,8 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1792,7 +1774,6 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp); return error; } @@ -2525,11 +2506,6 @@ xfs_remove( if (error) goto std_return; - if (is_dir) - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - else - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2540,14 +2516,15 @@ xfs_remove( * block from the directory. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, + &tp); } if (error) { ASSERT(error != -ENOSPC); - goto out_trans_cancel; + goto std_return; } xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); @@ -2855,6 +2832,7 @@ xfs_rename_alloc_whiteout( * and flag it as linkable. */ drop_nlink(VFS_I(tmpfile)); + xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -2910,15 +2888,15 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { spaceres = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_wip; /* * Attach the dquots to the inodes @@ -3155,6 +3133,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: xfs_trans_cancel(tp); +out_release_wip: if (wip) IRELE(wip); return error; @@ -3162,16 +3141,16 @@ out_trans_cancel: STATIC int xfs_iflush_cluster( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; - int ilist_size; - xfs_inode_t **ilist; - xfs_inode_t *iq; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3180,23 +3159,23 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); - if (!ilist) + cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) goto out_put; mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; for (i = 0; i < nr_found; i++) { - iq = ilist[i]; - if (iq == ip) + cip = cilist[i]; + if (cip == ip) continue; /* @@ -3205,20 +3184,30 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); + break; + } + spin_unlock(&cip->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) continue; /* @@ -3226,15 +3215,28 @@ xfs_iflush_cluster( * then this inode cannot be flushed and is skipped. */ - if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) continue; - if (!xfs_iflock_nowait(iq)) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(iq)) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); + continue; + } + + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3242,18 +3244,18 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - if (!xfs_inode_clean(iq)) { + if (!xfs_inode_clean(cip)) { int error; - error = xfs_iflush_int(iq, bp); + error = xfs_iflush_int(cip, bp); if (error) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); goto cluster_corrupt_out; } clcount++; } else { - xfs_ifunlock(iq); + xfs_ifunlock(cip); } - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); } if (clcount) { @@ -3263,7 +3265,7 @@ xfs_iflush_cluster( out_free: rcu_read_unlock(); - kmem_free(ilist); + kmem_free(cilist); out_put: xfs_perag_put(pag); return 0; @@ -3306,8 +3308,8 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq, false); - kmem_free(ilist); + xfs_iflush_abort(cip, false); + kmem_free(cilist); xfs_perag_put(pag); return -EFSCORRUPTED; } @@ -3327,7 +3329,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3369,14 +3371,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3404,7 +3414,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 43e1d51b15eb..8eb78ec4a6e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { @@ -434,12 +427,16 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); /* from xfs_iops.c */ +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -447,7 +444,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. */ -extern void xfs_setup_inode(struct xfs_inode *ip); static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); @@ -458,6 +454,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); + xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } @@ -476,14 +473,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c48b5b18d771..892c2aced207 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork( */ data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_real_bytes == 0 || - ip->i_df.if_real_bytes == data_bytes); + ip->i_df.if_real_bytes >= data_bytes); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, @@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork( */ data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_real_bytes == 0 || - ip->i_afp->if_real_bytes == data_bytes); + ip->i_afp->if_real_bytes >= data_bytes); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, @@ -479,6 +479,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) + __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; @@ -649,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcb6c19ce3ea..9a7c87809d3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -277,7 +277,6 @@ xfs_readlink_by_handle( { struct dentry *dentry; __u32 olen; - void *link; int error; if (!capable(CAP_SYS_ADMIN)) @@ -288,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!d_is_symlink(dentry)) { + if (!d_inode(dentry)->i_op->readlink) { error = -EINVAL; goto out_dput; } @@ -298,21 +297,8 @@ xfs_readlink_by_handle( goto out_dput; } - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - error = -ENOMEM; - goto out_dput; - } - - error = xfs_readlink(XFS_I(d_inode(dentry)), link); - if (error) - goto out_kfree; - error = readlink_copy(hreq->ohandle, olen, link); - if (error) - goto out_kfree; + error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); - out_kfree: - kfree(link); out_dput: dput(dentry); return error; @@ -334,12 +320,10 @@ xfs_set_dmattrs( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -611,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -642,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1141,10 +1124,9 @@ xfs_ioctl_setattr_get_trans( if (XFS_FORCED_SHUTDOWN(mp)) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_cancel; + return ERR_PTR(error); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); @@ -1481,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1496,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1592,6 +1573,17 @@ xfs_ioc_swapext( goto out_put_tmp_file; } + /* + * We need to ensure that the fds passed in point to XFS inodes + * before we cast and access them as XFS structures as we have no + * control over what the user passes us here. + */ + if (f.file->f_op != &xfs_file_operations || + tmp.file->f_op != &xfs_file_operations) { + error = -EINVAL; + goto out_put_tmp_file; + } + ip = XFS_I(file_inode(f.file)); tip = XFS_I(file_inode(tmp.file)); @@ -1636,12 +1628,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1660,7 +1648,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1719,7 +1707,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7900b6..8b52881bfd90 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8ae327d..321f57721b92 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d81bdc080370..620fc9120444 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -132,6 +133,7 @@ xfs_iomap_write_direct( int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -192,11 +194,6 @@ xfs_iomap_write_direct( return error; /* - * Allocate and setup the transaction - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - - /* * For DAX, we do not allocate unwritten extents, but instead we zero * the block before we commit the transaction. Ideally we'd like to do * this outside the transaction context, but if we commit and then crash @@ -209,23 +206,17 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. */ - if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (ISUNWRITTEN(imap)) { - tp->t_flags |= XFS_TRANS_RESERVE; + tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); - /* - * Check for running out of space, note: need lock to return - */ - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + tflags, &tp); + if (error) return error; - } lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); @@ -726,15 +717,13 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - nres, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -878,25 +867,18 @@ xfs_iomap_write_unwritten( do { /* - * set up a transaction to convert the range of extents + * Set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. * - * Note that we open code the transaction allocation here - * to pass KM_NOFS--we can't risk to recursing back into - * the filesystem here as we might be asked to write out - * the same inode that we complete here and might deadlock - * on the iolock. + * Note that we can't risk to recursing back into the filesystem + * here as we might be asked to write out the same inode that we + * complete here and might deadlock on the iolock. */ - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -959,3 +941,173 @@ error_on_bmapi_transaction: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e663d744..e066d045e2ff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +#include <linux/iomap.h> + struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fb7dc61f4a29..ab820f84ed50 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,12 +38,13 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/capability.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/slab.h> /* @@ -181,6 +182,8 @@ xfs_generic_create( } #endif + xfs_setup_iops(ip); + if (tmpfile) d_tmpfile(dentry, inode); else @@ -368,6 +371,8 @@ xfs_vn_symlink( if (unlikely(error)) goto out_cleanup_inode; + xfs_setup_iops(cip); + d_instantiate(dentry, inode); xfs_finish_inode_setup(cip); return 0; @@ -442,6 +447,16 @@ xfs_vn_get_link( return ERR_PTR(error); } +STATIC const char * +xfs_vn_get_link_inline( + struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + return XFS_I(inode)->i_df.if_u1.if_data; +} + STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -599,12 +614,12 @@ xfs_setattr_nonsize( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -633,12 +648,10 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_unlock; + goto out_cancel; } } - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,10 +735,9 @@ xfs_setattr_nonsize( return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_trans_cancel: +out_cancel: xfs_trans_cancel(tp); +out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; @@ -790,20 +802,30 @@ xfs_setattr_size( return error; /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -812,17 +834,14 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -834,25 +853,17 @@ xfs_setattr_size( * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -971,12 +982,9 @@ xfs_vn_update_time( trace_xfs_update_time(ip); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); if (flags & S_CTIME) @@ -991,51 +999,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1043,38 +1006,13 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int @@ -1167,6 +1105,18 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +static const struct inode_operations xfs_inline_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = xfs_vn_get_link_inline, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1193,7 +1143,7 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode and set up the operation vectors. + * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, * when creating a new inode it is called from xfs_ialloc after setting up the @@ -1232,32 +1182,12 @@ xfs_setup_inode( i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); - ip->d_ops = ip->i_mount->m_nondir_inode_ops; - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: + if (S_ISDIR(inode->i_mode)) { lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) - inode->i_op = &xfs_dir_ci_inode_operations; - else - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; ip->d_ops = ip->i_mount->m_dir_inode_ops; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (!(ip->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; + } else { + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } /* @@ -1277,3 +1207,35 @@ xfs_setup_inode( cache_no_acl(inode); } } + +void +xfs_setup_iops( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + if (ip->i_df.if_flags & XFS_IFINLINE) + inode->i_op = &xfs_inline_symlink_inode_operations; + else + inode->i_op = &xfs_symlink_inode_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index a8192dc797dc..b8d64d520e12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } -/* ARM old ABI has some weird alignment/padding */ -#if defined(__arm__) && !defined(__ARM_EABI__) -#define __arch_pack __attribute__((packed)) -#else -#define __arch_pack -#endif - #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b49ccf5c1d75..3b74fa011bb1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -435,8 +435,7 @@ xfs_log_reserve( int cnt, struct xlog_ticket **ticp, __uint8_t client, - bool permanent, - uint t_type) + bool permanent) { struct xlog *log = mp->m_log; struct xlog_ticket *tic; @@ -456,7 +455,6 @@ xfs_log_reserve( if (!tic) return -ENOMEM; - tic->t_trans_type = t_type; *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -790,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - error = xfs_log_reserve(mp, 600, 1, &tic, - XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (!error) { /* the data section must be 32 bit size aligned */ struct { @@ -1039,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1180,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1305,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. */ -void +static void xfs_log_worker( struct work_struct *work) { @@ -1418,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1457,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; @@ -2032,58 +2030,8 @@ xlog_print_tic_res( REG_TYPE_STR(ICREATE, "inode create") }; #undef REG_TYPE_STR -#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type - static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - TRANS_TYPE_STR(SETATTR_NOT_SIZE), - TRANS_TYPE_STR(SETATTR_SIZE), - TRANS_TYPE_STR(INACTIVE), - TRANS_TYPE_STR(CREATE), - TRANS_TYPE_STR(CREATE_TRUNC), - TRANS_TYPE_STR(TRUNCATE_FILE), - TRANS_TYPE_STR(REMOVE), - TRANS_TYPE_STR(LINK), - TRANS_TYPE_STR(RENAME), - TRANS_TYPE_STR(MKDIR), - TRANS_TYPE_STR(RMDIR), - TRANS_TYPE_STR(SYMLINK), - TRANS_TYPE_STR(SET_DMATTRS), - TRANS_TYPE_STR(GROWFS), - TRANS_TYPE_STR(STRAT_WRITE), - TRANS_TYPE_STR(DIOSTRAT), - TRANS_TYPE_STR(WRITEID), - TRANS_TYPE_STR(ADDAFORK), - TRANS_TYPE_STR(ATTRINVAL), - TRANS_TYPE_STR(ATRUNCATE), - TRANS_TYPE_STR(ATTR_SET), - TRANS_TYPE_STR(ATTR_RM), - TRANS_TYPE_STR(ATTR_FLAG), - TRANS_TYPE_STR(CLEAR_AGI_BUCKET), - TRANS_TYPE_STR(SB_CHANGE), - TRANS_TYPE_STR(DUMMY1), - TRANS_TYPE_STR(DUMMY2), - TRANS_TYPE_STR(QM_QUOTAOFF), - TRANS_TYPE_STR(QM_DQALLOC), - TRANS_TYPE_STR(QM_SETQLIM), - TRANS_TYPE_STR(QM_DQCLUSTER), - TRANS_TYPE_STR(QM_QINOCREATE), - TRANS_TYPE_STR(QM_QUOTAOFF_END), - TRANS_TYPE_STR(FSYNC_TS), - TRANS_TYPE_STR(GROWFSRT_ALLOC), - TRANS_TYPE_STR(GROWFSRT_ZERO), - TRANS_TYPE_STR(GROWFSRT_FREE), - TRANS_TYPE_STR(SWAPEXT), - TRANS_TYPE_STR(CHECKPOINT), - TRANS_TYPE_STR(ICREATE), - TRANS_TYPE_STR(CREATE_TMPFILE) - }; -#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); - xfs_warn(mp, " trans type = %s (%u)", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type]), - ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -3378,7 +3326,7 @@ xfs_log_force( { int error; - trace_xfs_log_force(mp, 0); + trace_xfs_log_force(mp, 0, _RET_IP_); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3527,7 +3475,7 @@ xfs_log_force_lsn( { int error; - trace_xfs_log_force(mp, lsn); + trace_xfs_log_force(mp, lsn, _RET_IP_); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3709,7 +3657,6 @@ xlog_ticket_alloc( tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; - tic->t_trans_type = 0; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index aa533a7d50f2..b5e71072fde5 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -161,15 +161,10 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - bool permanent, - uint t_type); + bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -179,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4e7649351f5a..a4ab192e1792 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -51,7 +51,6 @@ xlog_cil_ticket_alloc( tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, KM_SLEEP|KM_NOFS); - tic->t_trans_type = XFS_TRANS_CHECKPOINT; /* * set the current reservation to zero so we know to steal the basic @@ -79,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t. the CIL committing and removing the log + * vector buffers between the check and the formatting of the item into the + * log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * it's size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unecessary data. We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -101,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -134,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simple swap it out for the shadow lv. We don't free it - that is + * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a @@ -171,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -237,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! */ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -784,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ed8896310c00..765f084759b5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -175,7 +175,6 @@ typedef struct xlog_ticket { char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ - uint t_trans_type; /* transaction type : 4 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 396565f43247..835997843846 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4205,10 +4205,9 @@ xlog_recover_process_efi( } } - tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto abort_error; + return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { @@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket( int offset; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) - goto out_abort; + goto out_error; error = xfs_read_agi(mp, tp, agno, &agibp); if (error) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cfd4210dd015..970c19ba2f56 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -89,7 +89,6 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - xfs_uuid_table_size * sizeof(*xfs_uuid_table), KM_SLEEP); hole = xfs_uuid_table_size++; } @@ -273,13 +272,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -681,6 +682,9 @@ xfs_mountfs( xfs_set_maxicount(mp); + /* enable fail_at_unmount as default */ + mp->m_fail_unmount = 1; + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; @@ -690,10 +694,15 @@ xfs_mountfs( if (error) goto out_remove_sysfs; - error = xfs_uuid_mount(mp); + error = xfs_error_sysfs_init(mp); if (error) goto out_del_stats; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_error_sysfs; + /* * Set the minimum read and write sizes */ @@ -957,6 +966,7 @@ xfs_mountfs( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -968,6 +978,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_error_sysfs: + xfs_error_sysfs_del(mp); out_del_stats: xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: @@ -1006,6 +1018,14 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* + * We now need to tell the world we are unmounting. This will allow + * us to detect that the filesystem is going away and we should error + * out anything that we have been retrying in the background. This will + * prevent neverending retries in AIL pushing from hanging the unmount. + */ + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + /* * Flush all pending changes from the AIL. */ xfs_ail_push_all_sync(mp->m_ail); @@ -1056,6 +1076,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index eafe257b357a..c1b798c72126 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -37,6 +37,32 @@ enum { XFS_LOWSP_MAX, }; +/* + * Error Configuration + * + * Error classes define the subsystem the configuration belongs to. + * Error numbers define the errors that are configurable. + */ +enum { + XFS_ERR_METADATA, + XFS_ERR_CLASS_MAX, +}; +enum { + XFS_ERR_DEFAULT, + XFS_ERR_EIO, + XFS_ERR_ENOSPC, + XFS_ERR_ENODEV, + XFS_ERR_ERRNO_MAX, +}; + +#define XFS_ERR_RETRY_FOREVER -1 + +struct xfs_error_cfg { + struct xfs_kobj kobj; + int max_retries; + unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -127,6 +153,9 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xfs_kobj m_error_kobj; + struct xfs_kobj m_error_meta_kobj; + struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; @@ -148,6 +177,7 @@ typedef struct xfs_mount { */ __uint32_t m_generation; + bool m_fail_unmount; #ifdef DEBUG /* * DEBUG mode instrumentation to test and/or trigger delayed allocation @@ -166,6 +196,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); +struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, + int error_class, int error); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44effdd5..0cc8d8f74356 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -22,6 +22,11 @@ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ #structname ") is wrong, expected " #size) +#define XFS_CHECK_OFFSET(structname, member, off) \ + BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + "XFS: offsetof(" #structname ", " #member ") is wrong, " \ + "expected " #off) + static inline void __init xfs_check_ondisk_structs(void) { @@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); @@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 51ddaf2c2b8c..0f14b2e4bf6c 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" @@ -79,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. */ @@ -308,12 +283,9 @@ xfs_fs_commit_blocks( goto out_drop_iolock; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_drop_iolock; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be125e1758c1..a60d9e2739d1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -783,13 +783,10 @@ xfs_qm_qino_alloc( } } - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + if (error) return error; - } if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index f4d0e0a8f517..475a3882a81f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -436,12 +434,9 @@ xfs_qm_scall_setqlim( defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); + if (error) goto out_rele; - } xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); @@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end( int error; xfs_qoff_logitem_t *qoffi; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) return error; - } qoffi = xfs_trans_get_qoff_item(tp, startqoff, flags & XFS_ALL_QUOTA_ACCT); @@ -603,12 +594,9 @@ xfs_qm_log_quotaoff( *qoffstartp = NULL; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) goto out; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index abf44435d04a..3938b37d1043 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,15 +780,14 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* * Reserve space & log for one extent added to the file. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, - resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks, + 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the inode. */ @@ -823,14 +822,13 @@ xfs_growfs_rt_alloc( for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* * Reserve log for one block zeroing. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, + 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the bitmap inode. */ @@ -994,11 +992,10 @@ xfs_growfs_rt( /* * Start a transaction, get the log reservation. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0, + &tp); if (error) - goto error_cancel; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a9bb17..355dd9e1cb64 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 8686df6c7609..d266e835ecc3 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -128,7 +128,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file) } static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, .open = xqm_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 187e14b696c2..0303f1005f88 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -58,8 +58,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -static kmem_zone_t *xfs_ioend_zone; -mempool_t *xfs_ioend_pool; +struct bio_set *xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -350,6 +349,7 @@ xfs_parseargs( case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; + break; case Opt_gquota: case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | @@ -546,7 +546,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -928,7 +928,7 @@ xfs_fs_alloc_inode( /* * Now that the generic code is guaranteed not to be accessing - * the linux inode, we can reclaim the inode. + * the linux inode, we can inactivate and reclaim the inode. */ STATIC void xfs_fs_destroy_inode( @@ -938,9 +938,14 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(ip->i_mount, vn_reclaim); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); + + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* * We should never get here with one of the reclaim flags already set. @@ -987,24 +992,6 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -STATIC void -xfs_fs_evict_inode( - struct inode *inode) -{ - xfs_inode_t *ip = XFS_I(inode); - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - - trace_xfs_evict_inode(ip); - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); -} - /* * We do an unlocked check for XFS_IDONTCACHE here because we are already * serialised against cache hits here via the inode->i_lock and igrab() in @@ -1276,6 +1263,16 @@ xfs_fs_remount( return -EINVAL; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1297,6 +1294,7 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_queue_eofblocks(mp); } /* rw -> ro */ @@ -1309,6 +1307,13 @@ xfs_fs_remount( * return it to the same size. */ xfs_save_resvblks(mp); + + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1558,22 +1563,16 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { - xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1663,7 +1662,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, @@ -1688,23 +1686,19 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) + xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + offsetof(struct xfs_ioend, io_inline_bio)); + if (!xfs_ioend_bioset) goto out; - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_destroy_ioend_zone; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_destroy_ioend_pool; + goto out_free_ioend_bioset; - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_zone_init( + sizeof(struct xfs_bmap_free_item), + "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; @@ -1797,10 +1791,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_destroy_ioend_pool: - mempool_destroy(xfs_ioend_pool); - out_destroy_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); + out_free_ioend_bioset: + bioset_free(xfs_ioend_bioset); out: return -ENOMEM; } @@ -1826,9 +1818,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_ioend_zone); - + bioset_free(xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce4585f..529bce9fc37e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index b44284c1adda..08a46c6181fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -131,6 +131,8 @@ xfs_readlink( trace_xfs_readlink(ip); + ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -150,12 +152,7 @@ xfs_readlink( } - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } + error = xfs_readlink_bmap(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -221,7 +218,6 @@ xfs_symlink( if (error) return error; - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -231,13 +227,15 @@ xfs_symlink( else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); if (error == -ENOSPC && fs_blocks == 0) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -302,19 +300,11 @@ xfs_symlink( * If the symlink will fit into the inode, write it inline. */ if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + ip->i_d.di_size = pathlen; ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { int offset; @@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt( */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 6ced4f143494..79cfd3fc5324 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -17,10 +17,11 @@ */ #include "xfs.h" -#include "xfs_sysfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" @@ -362,3 +363,294 @@ struct kobj_type xfs_log_ktype = { .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; + +/* + * Metadata IO error configuration + * + * The sysfs structure here is: + * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> + * + * where <class> allows us to discriminate between data IO and metadata IO, + * and any other future type of IO (e.g. special inode or directory error + * handling) we care to support. + */ +static inline struct xfs_error_cfg * +to_error_cfg(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_error_cfg, kobj); +} + +static inline struct xfs_mount * +err_to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_mount, m_error_kobj); +} + +static ssize_t +max_retries_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); +} + +static ssize_t +max_retries_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < -1) + return -EINVAL; + + cfg->max_retries = val; + return count; +} +XFS_SYSFS_ATTR_RW(max_retries); + +static ssize_t +retry_timeout_seconds_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%ld\n", + jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); +} + +static ssize_t +retry_timeout_seconds_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + /* 1 day timeout maximum */ + if (val < 0 || val > 86400) + return -EINVAL; + + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + return count; +} +XFS_SYSFS_ATTR_RW(retry_timeout_seconds); + +static ssize_t +fail_at_unmount_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = err_to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount); +} + +static ssize_t +fail_at_unmount_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = err_to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 1) + return -EINVAL; + + mp->m_fail_unmount = val; + return count; +} +XFS_SYSFS_ATTR_RW(fail_at_unmount); + +static struct attribute *xfs_error_attrs[] = { + ATTR_LIST(max_retries), + ATTR_LIST(retry_timeout_seconds), + NULL, +}; + + +struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_error_attrs, +}; + +struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, +}; + +/* + * Error initialization tables. These need to be ordered in the same + * order as the enums used to index the array. All class init tables need to + * define a "default" behaviour as the first entry, all other entries can be + * empty. + */ +struct xfs_error_init { + char *name; + int max_retries; + int retry_timeout; /* in seconds */ +}; + +static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { + { .name = "default", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "EIO", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENOSPC", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENODEV", + .max_retries = 0, + }, +}; + +static int +xfs_error_sysfs_init_class( + struct xfs_mount *mp, + int class, + const char *parent_name, + struct xfs_kobj *parent_kobj, + const struct xfs_error_init init[]) +{ + struct xfs_error_cfg *cfg; + int error; + int i; + + ASSERT(class < XFS_ERR_CLASS_MAX); + + error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, + &mp->m_error_kobj, parent_name); + if (error) + return error; + + for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { + cfg = &mp->m_error_cfg[class][i]; + error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, + parent_kobj, init[i].name); + if (error) + goto out_error; + + cfg->max_retries = init[i].max_retries; + cfg->retry_timeout = msecs_to_jiffies( + init[i].retry_timeout * MSEC_PER_SEC); + } + return 0; + +out_error: + /* unwind the entries that succeeded */ + for (i--; i >= 0; i--) { + cfg = &mp->m_error_cfg[class][i]; + xfs_sysfs_del(&cfg->kobj); + } + xfs_sysfs_del(parent_kobj); + return error; +} + +int +xfs_error_sysfs_init( + struct xfs_mount *mp) +{ + int error; + + /* .../xfs/<dev>/error/ */ + error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, + &mp->m_kobj, "error"); + if (error) + return error; + + error = sysfs_create_file(&mp->m_error_kobj.kobject, + ATTR_LIST(fail_at_unmount)); + + if (error) + goto out_error; + + /* .../xfs/<dev>/error/metadata/ */ + error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, + "metadata", &mp->m_error_meta_kobj, + xfs_error_meta_init); + if (error) + goto out_error; + + return 0; + +out_error: + xfs_sysfs_del(&mp->m_error_kobj); + return error; +} + +void +xfs_error_sysfs_del( + struct xfs_mount *mp) +{ + struct xfs_error_cfg *cfg; + int i, j; + + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { + for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { + cfg = &mp->m_error_cfg[i][j]; + + xfs_sysfs_del(&cfg->kobj); + } + } + xfs_sysfs_del(&mp->m_error_meta_kobj); + xfs_sysfs_del(&mp->m_error_kobj); +} + +struct xfs_error_cfg * +xfs_error_get_cfg( + struct xfs_mount *mp, + int error_class, + int error) +{ + struct xfs_error_cfg *cfg; + + if (error < 0) + error = -error; + + switch (error) { + case EIO: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; + break; + case ENOSPC: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; + break; + case ENODEV: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; + break; + default: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; + break; + } + + return cfg; +} diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index be692e59938d..d04637181ef2 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -58,4 +58,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } +int xfs_error_sysfs_init(struct xfs_mount *mp); +void xfs_error_sysfs_del(struct xfs_mount *mp); + #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c8d58426008e..145169093fe0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); @@ -364,7 +365,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); @@ -944,7 +944,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) - __field(unsigned, trans_type) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) @@ -962,7 +961,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; - __entry->trans_type = tic->t_trans_type; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; @@ -980,14 +978,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u " "t_unit_res %u t_flags %s reserveq %s " "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), __entry->ocnt, __entry->cnt, __entry->curr_res, @@ -1053,19 +1050,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, ) TRACE_EVENT(xfs_log_force, - TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), - TP_ARGS(mp, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip), + TP_ARGS(mp, lsn, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->lsn = lsn; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx", + TP_printk("dev %d:%d lsn 0x%llx caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->lsn) + __entry->lsn, (void *)__entry->caller_ip) ) #define DEFINE_LOG_ITEM_EVENT(name) \ @@ -1136,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1152,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, @@ -1297,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 20c53666cb4b..5f3d33d16e67 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -47,47 +47,6 @@ xfs_trans_init( } /* - * This routine is called to allocate a transaction structure. - * The type parameter indicates the type of the transaction. These - * are enumerated in xfs_trans.h. - * - * Dynamically allocate the transaction structure from the transaction - * zone, initialize it, and return it to the caller. - */ -xfs_trans_t * -xfs_trans_alloc( - xfs_mount_t *mp, - uint type) -{ - xfs_trans_t *tp; - - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, type, KM_SLEEP); - tp->t_flags |= XFS_TRANS_FREEZE_PROT; - return tp; -} - -xfs_trans_t * -_xfs_trans_alloc( - xfs_mount_t *mp, - uint type, - xfs_km_flags_t memflags) -{ - xfs_trans_t *tp; - - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); - - tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_type = type; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - return tp; -} - -/* * Free the transaction structure. If there is more clean up * to do when the structure is freed, add it here. */ @@ -99,7 +58,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); - if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -125,7 +84,6 @@ xfs_trans_dup( * Initialize the new transaction structure. */ ntp->t_magic = XFS_TRANS_HEADER_MAGIC; - ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -135,9 +93,9 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_FREEZE_PROT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); /* We gave our writer reference to the new transaction */ - tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; @@ -165,7 +123,7 @@ xfs_trans_dup( * This does not do quota reservations. That typically is done by the * caller afterwards. */ -int +static int xfs_trans_reserve( struct xfs_trans *tp, struct xfs_trans_res *resp, @@ -219,7 +177,7 @@ xfs_trans_reserve( resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, - permanent, tp->t_type); + permanent); } if (error) @@ -268,6 +226,42 @@ undo_blocks: return error; } +int +xfs_trans_alloc( + struct xfs_mount *mp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents, + uint flags, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e7c49cf43fbc..9b2b9fa89331 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -52,6 +52,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; @@ -90,7 +91,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ @@ -148,10 +148,9 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. */ -xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, - uint, uint); +int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, + uint blocks, uint rtextents, uint flags, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index d111f691f313..ea62245fee26 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } @@ -146,7 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ - return 1; + return 0; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -166,8 +167,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { char *prefix; int prefix_len; @@ -221,11 +221,15 @@ xfs_xattr_put_listent( } ssize_t -xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +xfs_vn_listxattr( + struct dentry *dentry, + char *data, + size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(dentry); + int error; /* * First read the regular on-disk attributes. @@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - xfs_attr_list_int(&context); + error = xfs_attr_list_int(&context); + if (error) + return error; if (context.count < 0) return -ERANGE; |