summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_log.c')
-rw-r--r--fs/xfs/xfs_log.c1015
1 files changed, 471 insertions, 544 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c19a82adea1e..63e2358f160a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,21 +47,16 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
-xlog_state_switch_iclogs(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int eventual_size);
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog);
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
xlog_verify_dest_ptr(
@@ -94,6 +89,62 @@ xlog_iclogs_empty(
static int
xfs_log_cover(struct xfs_mount *);
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the caller is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -326,30 +377,6 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
-}
-
bool
xfs_log_writable(
struct xfs_mount *mp)
@@ -399,8 +426,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -438,10 +463,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -449,15 +473,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -513,10 +535,11 @@ __xlog_state_release_iclog(
* Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state.
*/
-static int
+int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
lockdep_assert_held(&log->l_icloglock);
@@ -526,30 +549,13 @@ xlog_state_release_iclog(
if (atomic_dec_and_test(&iclog->ic_refcnt) &&
__xlog_state_release_iclog(log, iclog)) {
spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
+ xlog_sync(log, iclog, ticket);
spin_lock(&log->l_icloglock);
}
return 0;
}
-void
-xfs_log_release_iclog(
- struct xlog_in_core *iclog)
-{
- struct xlog *log = iclog->ic_log;
- bool sync = false;
-
- if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
- if (iclog->ic_state != XLOG_STATE_IOERROR)
- sync = __xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- }
-
- if (sync)
- xlog_sync(log, iclog);
-}
-
/*
* Mount a log filesystem
*
@@ -786,10 +792,12 @@ xfs_log_mount_cancel(
}
/*
- * Wait for the iclog to be written disk, or return an error if the log has been
- * shut down.
+ * Wait for the iclog and all prior iclogs to be written to disk as required by the
+ * log force state machine. Waiting on ic_force_wait ensures iclog completions
+ * have been ordered and callbacks run before we are woken here, hence
+ * guaranteeing that all the iclogs up to this one are on stable storage.
*/
-static int
+int
xlog_wait_on_iclog(
struct xlog_in_core *iclog)
__releases(iclog->ic_log->l_icloglock)
@@ -818,26 +826,49 @@ xlog_wait_on_iclog(
static int
xlog_write_unmount_record(
struct xlog *log,
- struct xlog_ticket *ticket,
- xfs_lsn_t *lsn,
- uint flags)
+ struct xlog_ticket *ticket)
{
- struct xfs_unmount_log_format ulf = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &ulf,
- .i_len = sizeof(ulf),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ LIST_HEAD(lv_chain);
+ INIT_LIST_HEAD(&vec.lv_list);
+ list_add(&vec.lv_list, &lv_chain);
+
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
/* account for space used by record data */
- ticket->t_curr_res -= sizeof(ulf);
- return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+ ticket->t_curr_res -= sizeof(unmount_rec);
+
+ /*
+ * For external log devices, we need to flush the data device cache
+ * first to ensure all metadata writeback is on stable storage before we
+ * stamp the tail LSN into the unmount record.
+ */
+ if (log->l_targ != log->l_mp->m_ddev_targp)
+ blkdev_issue_flush(log->l_targ->bt_bdev);
+ return xlog_write(log, &lv_chain, ticket, NULL, NULL, reg.i_len);
}
/*
@@ -851,15 +882,13 @@ xlog_unmount_write(
struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
- xfs_lsn_t lsn;
- uint flags = XLOG_UNMOUNT_TRANS;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
- error = xlog_write_unmount_record(log, tic, &lsn, flags);
+ error = xlog_write_unmount_record(log, tic);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -876,7 +905,12 @@ out_err:
else
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
iclog->ic_state == XLOG_STATE_IOERROR);
- error = xlog_state_release_iclog(log, iclog);
+ /*
+ * Ensure the journal is fully flushed and on stable storage once the
+ * iclog containing the unmount record is written.
+ */
+ iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
+ error = xlog_state_release_iclog(log, iclog, tic);
xlog_wait_on_iclog(iclog);
if (tic) {
@@ -1401,6 +1435,11 @@ xlog_alloc_log(
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
+ else
+ log->l_iclog_roundoff = BBSIZE;
+
xlog_grant_head_init(&log->l_reserve_head);
xlog_grant_head_init(&log->l_write_head);
@@ -1532,9 +1571,14 @@ xlog_commit_record(
struct xlog_in_core **iclog,
xfs_lsn_t *lsn)
{
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
.i_type = XLOG_REG_TYPE_COMMIT,
};
struct xfs_log_vec vec = {
@@ -1542,12 +1586,16 @@ xlog_commit_record(
.lv_iovecp = &reg,
};
int error;
+ LIST_HEAD(lv_chain);
+ INIT_LIST_HEAD(&vec.lv_list);
+ list_add(&vec.lv_list, &lv_chain);
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
- false);
+ /* account for space used by record data */
+ ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, &lv_chain, ticket, lsn, iclog, reg.i_len);
if (error)
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
@@ -1753,8 +1801,7 @@ xlog_write_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
uint64_t bno,
- unsigned int count,
- bool need_flush)
+ unsigned int count)
{
ASSERT(bno < log->l_logBBsize);
@@ -1792,10 +1839,12 @@ xlog_write_iclog(
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
- REQ_IDLE | REQ_FUA;
- if (need_flush)
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+ if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
+ iclog->ic_bio.bi_opf |= REQ_FUA;
+ iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1854,29 +1903,15 @@ xlog_calc_iclog_size(
uint32_t *roundoff)
{
uint32_t count_init, count;
- bool use_lsunit;
-
- use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1;
/* Add for LR header */
count_init = log->l_iclog_hsize + iclog->ic_offset;
+ count = roundup(count_init, log->l_iclog_roundoff);
- /* Round out the log write size */
- if (use_lsunit) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
-
- ASSERT(count >= count_init);
*roundoff = count - count_init;
- if (use_lsunit)
- ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
- else
- ASSERT(*roundoff < BBTOB(1));
+ ASSERT(count >= count_init);
+ ASSERT(*roundoff < log->l_iclog_roundoff);
return count;
}
@@ -1906,24 +1941,32 @@ xlog_calc_iclog_size(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
unsigned int count; /* byte count of bwrite */
unsigned int roundoff; /* roundoff to BB or stripe */
uint64_t bno;
unsigned int size;
- bool need_flush = true, split = false;
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
count = xlog_calc_iclog_size(log, iclog, &roundoff);
- /* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
- xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ /*
+ * If we have a ticket, account for the roundoff via the ticket
+ * reservation to avoid touching the hot grant heads needlessly.
+ * Otherwise, we have to move grant heads directly.
+ */
+ if (ticket) {
+ ticket->t_curr_res -= roundoff;
+ } else {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ }
/* put cycle number in every block */
- xlog_pack_data(log, iclog, roundoff);
+ xlog_pack_data(log, iclog, roundoff);
/* real byte length */
size = iclog->ic_offset;
@@ -1937,10 +1980,8 @@ xlog_sync(
bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
/* Do we need to split this write into 2 parts? */
- if (bno + BTOBB(count) > log->l_logBBsize) {
+ if (bno + BTOBB(count) > log->l_logBBsize)
xlog_split_iclog(log, &iclog->ic_header, bno, count);
- split = true;
- }
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
@@ -1961,22 +2002,8 @@ xlog_sync(
be64_to_cpu(iclog->ic_header.h_lsn));
}
#endif
-
- /*
- * Flush the data device before flushing the log to make sure all meta
- * data written back from the AIL actually made it to disk before
- * stamping the new log tail LSN into the log buffer. For an external
- * log we need to issue the flush explicitly, and unfortunately
- * synchronously here; for an internal log we can simply use the block
- * layer state machine for preflushes.
- */
- if (log->l_targ != log->l_mp->m_ddev_targp || split) {
- xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- need_flush = false;
- }
-
xlog_verify_iclog(log, iclog, count);
- xlog_write_iclog(log, iclog, bno, count, need_flush);
+ xlog_write_iclog(log, iclog, bno, count);
}
/*
@@ -2040,63 +2067,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2150,191 +2125,255 @@ xlog_print_trans(
}
/*
- * Calculate the potential space needed by the log vector. We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write whole log vectors into a single iclog which is guaranteed to have
+ * either sufficient space for the entire log vector chain to be written or
+ * exclusive access to the remaining space in the iclog.
+ *
+ * Return the number of iovecs and data written into the iclog, as well as
+ * a pointer to the logvec that doesn't fit in the log (or NULL if we hit the
+ * end of the chain).
*/
-static int
-xlog_write_calc_vec_length(
- struct xlog_ticket *ticket,
+static struct xfs_log_vec *
+xlog_write_single(
+ struct list_head *lv_chain,
struct xfs_log_vec *log_vector,
- bool need_start_rec)
+ struct xlog_ticket *ticket,
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
struct xfs_log_vec *lv;
- int headers = need_start_rec ? 1 : 0;
- int len = 0;
- int i;
+ void *ptr;
+ int index;
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+ ptr = iclog->ic_datap + *log_offset;
+ for (lv = log_vector;
+ !list_entry_is_head(lv, lv_chain, lv_list);
+ lv = list_next_entry(lv, lv_list)) {
+ /*
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
+ */
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - *log_offset)
+ break;
- headers += lv->lv_niovecs;
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ ASSERT(reg->i_len % sizeof(int32_t) == 0);
+ ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(reg->i_len -
+ sizeof(struct xlog_op_header));
+ memcpy(ptr, reg->i_addr, reg->i_len);
+ xlog_write_adv_cnt(&ptr, len, log_offset, reg->i_len);
+ (*record_cnt)++;
+ *data_cnt += reg->i_len;
}
}
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
+ if (list_entry_is_head(lv, lv_chain, lv_list))
+ lv = NULL;
+ ASSERT(*len == 0 || lv);
+ return lv;
}
-static void
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
-{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
-}
-
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
+static int
+xlog_write_get_more_iclog_space(
struct xlog *log,
- struct xlog_op_header *ophdr,
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
*/
-static int
-xlog_write_setup_copy(
+static struct xfs_log_vec *
+xlog_write_partial(
+ struct xlog *log,
+ struct list_head *lv_chain,
+ struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *reg;
+ struct xlog_op_header *ophdr;
+ void *ptr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ /* walk the logvec, copying until we run out of space in the iclog */
+ ptr = iclog->ic_datap + *log_offset;
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ uint32_t reg_offset = 0;
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ reg = &lv->lv_iovecp[index];
+ ASSERT(reg->i_len % sizeof(int32_t) == 0);
- return sizeof(struct xlog_op_header);
-}
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(log, ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return ERR_PTR(error);
+ ptr = iclog->ic_datap + *log_offset;
+ }
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset,
- struct xlog_in_core **commit_iclog)
-{
- int error;
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
+
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
+ xlog_verify_dest_ptr(log, ptr);
+ memcpy(ptr, reg->i_addr, rlen);
+ xlog_write_adv_cnt(&ptr, len, log_offset, rlen);
+ (*record_cnt)++;
+ *data_cnt += rlen;
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. If we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Account for the continuation opheader before we get
+ * a new iclog. This is necessary so that we reserve
+ * space in the iclog for it.
+ */
+ *len += sizeof(struct xlog_op_header);
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
- *partial_copy = 0;
- *partial_copy_len = 0;
+ error = xlog_write_get_more_iclog_space(log, ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return ERR_PTR(error);
+ ptr = iclog->ic_datap + *log_offset;
- if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ ophdr = ptr;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- if (!commit_iclog)
- goto release_iclog;
- spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- }
+ xlog_write_adv_cnt(&ptr, len, log_offset,
+ sizeof(struct xlog_op_header));
+ *data_cnt += sizeof(struct xlog_op_header);
- return 0;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_verify_dest_ptr(log, ptr);
+ memcpy(ptr, reg->i_addr + reg_offset, rlen);
+ xlog_write_adv_cnt(&ptr, len, log_offset, rlen);
+ (*record_cnt)++;
+ *data_cnt += rlen;
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec so return the next log vec to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
+ lv = list_next_entry(lv, lv_list);
+ if (list_entry_is_head(lv, lv_chain, lv_list))
+ return NULL;
+ return lv;
}
/*
@@ -2380,32 +2419,19 @@ release_iclog:
int
xlog_write(
struct xlog *log,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags,
- bool need_start_rec)
+ uint32_t len)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_vec *lv = log_vector;
- struct xfs_log_iovec *vecp = lv->lv_iovecp;
- int index = 0;
- int len;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
+ struct xfs_log_vec *lv;
int record_cnt = 0;
int data_cnt = 0;
int error = 0;
+ int log_offset;
- /*
- * If this is a commit or unmount transaction, we don't need a start
- * record to be written. We do, however, have to account for the
- * commit or unmount header that gets written. Hence we always have
- * to account for an extra xlog_op_header here.
- */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2413,144 +2439,44 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
- *start_lsn = 0;
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
-
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
-
- /* start_lsn is the first lsn written to. That's all we need. */
- if (!*start_lsn)
- *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
-
- /*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
- */
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int copy_len;
- int copy_off;
- bool ordered = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- /*
- * Before we start formatting log vectors, we need to
- * write a start record. Only do this for the first
- * iclog we write to.
- */
- if (need_start_rec) {
- xlog_write_start_rec(ptr, ticket);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
- if (!ophdr)
- return -EIO;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
+ /* start_lsn is the LSN of the first iclog written to. */
+ if (start_lsn)
+ *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- if (need_start_rec) {
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- need_start_rec = false;
- }
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, flags,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset,
- commit_iclog);
- if (error)
- return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
+ lv = list_first_entry_or_null(lv_chain, struct xfs_log_vec, lv_list);
+ while (lv) {
+ lv = xlog_write_single(lv_chain, lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
+ if (!lv)
+ break;
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
- }
+ lv = xlog_write_partial(log, lv_chain, lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (IS_ERR_OR_NULL(lv)) {
+ error = PTR_ERR_OR_ZERO(lv);
+ break;
}
}
+ ASSERT((len == 0 && !lv) || error);
- ASSERT(len == 0);
-
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (commit_iclog) {
- ASSERT(flags & XLOG_COMMIT_TRANS);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
+ if (commit_iclog)
*commit_iclog = iclog;
- } else {
- error = xlog_state_release_iclog(log, iclog);
- }
+ else
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
return error;
@@ -2946,7 +2872,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -2981,9 +2906,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3012,7 +2934,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3025,13 +2947,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3063,7 +2982,6 @@ xfs_log_ticket_regrant(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
@@ -3074,7 +2992,6 @@ xfs_log_ticket_regrant(
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
}
xfs_log_ticket_put(ticket);
@@ -3131,7 +3048,7 @@ xfs_log_ticket_ungrant(
* This routine will mark the current iclog in the ring as WANT_SYNC and move
* the current iclog pointer to the next iclog in the ring.
*/
-STATIC void
+void
xlog_state_switch_iclogs(
struct xlog *log,
struct xlog_in_core *iclog,
@@ -3151,10 +3068,9 @@ xlog_state_switch_iclogs(
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
/* Round up to next log-sunit */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1) {
- uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
- log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
+ if (log->l_iclog_roundoff > BBSIZE) {
+ log->l_curr_block = roundup(log->l_curr_block,
+ BTOBB(log->l_iclog_roundoff));
}
if (log->l_curr_block >= log->l_logBBsize) {
@@ -3246,7 +3162,7 @@ xfs_log_force(
atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_state_release_iclog(log, iclog, NULL))
goto out_error;
if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
@@ -3279,15 +3195,28 @@ out_error:
return -EIO;
}
+/*
+ * Force the log to a specific LSN.
+ *
+ * If an iclog with that lsn can be found:
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a wait queue. All callers trying
+ * to force a given lsn to disk must wait on the queue attached to the
+ * specific in-core log. When the given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the queue.
+ */
static int
-__xfs_log_force_lsn(
- struct xfs_mount *mp,
+xlog_force_lsn(
+ struct xlog *log,
xfs_lsn_t lsn,
uint flags,
int *log_flushed,
bool already_slept)
{
- struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
spin_lock(&log->l_icloglock);
@@ -3320,15 +3249,13 @@ __xfs_log_force_lsn(
if (!already_slept &&
(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
- XFS_STATS_INC(mp, xs_log_force_sleep);
-
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
return -EAGAIN;
}
atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_state_release_iclog(log, iclog, NULL))
goto out_error;
if (log_flushed)
*log_flushed = 1;
@@ -3345,39 +3272,38 @@ out_error:
}
/*
- * Force the in-core log to disk for a specific LSN.
- *
- * Find in-core log with lsn.
- * If it is in the DIRTY state, just return.
- * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
- * state and go to sleep or return.
- * If it is in any other state, go to sleep or return.
+ * Force the log to a specific checkpoint sequence.
*
- * Synchronous forces are implemented with a wait queue. All callers trying
- * to force a given lsn to disk must wait on the queue attached to the
- * specific in-core log. When given in-core log finally completes its write
- * to disk, that thread will wake up all threads waiting on the queue.
+ * First force the CIL so that all the required changes have been flushed to the
+ * iclogs. If the CIL force completed it will return a commit LSN that indicates
+ * the iclog that needs to be flushed to stable storage. If the caller needs
+ * a synchronous log force, we will wait on the iclog with the LSN returned by
+ * xlog_cil_force_seq() to be completed.
*/
int
-xfs_log_force_lsn(
+xfs_log_force_seq(
struct xfs_mount *mp,
- xfs_lsn_t lsn,
+ xfs_csn_t seq,
uint flags,
int *log_flushed)
{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
int ret;
- ASSERT(lsn != 0);
+ ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force);
- trace_xfs_log_force(mp, lsn, _RET_IP_);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ lsn = xlog_cil_force_seq(log, seq);
if (lsn == NULLCOMMITLSN)
return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
- if (ret == -EAGAIN)
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
return ret;
}
@@ -3406,12 +3332,12 @@ xfs_log_ticket_get(
* Figure out the total log space unit (in bytes) that would be
* required for a log ticket.
*/
-int
-xfs_log_calc_unit_res(
- struct xfs_mount *mp,
- int unit_bytes)
+static int
+xlog_calc_unit_res(
+ struct xlog *log,
+ int unit_bytes,
+ int *niclogs)
{
- struct xlog *log = mp->m_log;
int iclog_space;
uint num_headers;
@@ -3487,18 +3413,22 @@ xfs_log_calc_unit_res(
/* for commit-rec LR header - note: padding will subsume the ophdr */
unit_bytes += log->l_iclog_hsize;
- /* for roundoff padding for transaction data and one for commit record */
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
- /* log su roundoff */
- unit_bytes += 2 * mp->m_sb.sb_logsunit;
- } else {
- /* BB roundoff */
- unit_bytes += 2 * BBSIZE;
- }
+ /* roundoff padding: one for transaction data and one for the commit record */
+ unit_bytes += 2 * log->l_iclog_roundoff;
+ if (niclogs)
+ *niclogs = num_headers;
return unit_bytes;
}
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
+{
+ return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
+}
+
/*
* Allocate and initialise a new log ticket.
*/
@@ -3507,7 +3437,6 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
bool permanent)
{
struct xlog_ticket *tic;
@@ -3515,7 +3444,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+ unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
@@ -3525,12 +3454,9 @@ xlog_ticket_alloc(
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
- tic->t_clientid = client;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
@@ -3698,11 +3624,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;