From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Nov 2023 19:47:40 +0800 Subject: jbd2: increase the journal IO's priority Current jbd2 only add REQ_SYNC for descriptor block, metadata log buffer, commit buffer and superblock buffer, the submitted IO could be throttled by writeback throttle in block layer, that could lead to priority inversion in some cases. The log IO looks like a kind of high priority metadata IO, so it should not be throttled by WBT like QOS policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from writeback throttle, and also add REQ_META together indicates it's a metadata IO. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8d6f934c3d95..9bdb377a348f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; struct timespec64 now; - blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC; + blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; *cbh = NULL; @@ -395,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, - REQ_SYNC); + journal->j_tail, 0); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd2_debug(3, "superblock not updated\n"); @@ -715,6 +714,7 @@ start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* * Compute checksum. */ @@ -727,7 +727,8 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, + bh); } cond_resched(); -- cgit v1.2.3 From 6c02757c936063f0631b4e43fe156f8c8f1f351f Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 11 Dec 2023 19:25:44 +0800 Subject: jbd2: fix soft lockup in journal_finish_inode_data_buffers() There's issue when do io test: WARN: soft lockup - CPU#45 stuck for 11s! [jbd2/dm-2-8:4170] CPU: 45 PID: 4170 Comm: jbd2/dm-2-8 Kdump: loaded Tainted: G OE Call trace: dump_backtrace+0x0/0x1a0 show_stack+0x24/0x30 dump_stack+0xb0/0x100 watchdog_timer_fn+0x254/0x3f8 __hrtimer_run_queues+0x11c/0x380 hrtimer_interrupt+0xfc/0x2f8 arch_timer_handler_phys+0x38/0x58 handle_percpu_devid_irq+0x90/0x248 generic_handle_irq+0x3c/0x58 __handle_domain_irq+0x68/0xc0 gic_handle_irq+0x90/0x320 el1_irq+0xcc/0x180 queued_spin_lock_slowpath+0x1d8/0x320 jbd2_journal_commit_transaction+0x10f4/0x1c78 [jbd2] kjournald2+0xec/0x2f0 [jbd2] kthread+0x134/0x138 ret_from_fork+0x10/0x18 Analyzed informations from vmcore as follows: (1) There are about 5k+ jbd2_inode in 'commit_transaction->t_inode_list'; (2) Now is processing the 855th jbd2_inode; (3) JBD2 task has TIF_NEED_RESCHED flag; (4) There's no pags in address_space around the 855th jbd2_inode; (5) There are some process is doing drop caches; (6) Mounted with 'nodioread_nolock' option; (7) 128 CPUs; According to informations from vmcore we know 'journal->j_list_lock' spin lock competition is fierce. So journal_finish_inode_data_buffers() maybe process slowly. Theoretically, there is scheduling point in the filemap_fdatawait_range_keep_errors(). However, if inode's address_space has no pages which taged with PAGECACHE_TAG_WRITEBACK, will not call cond_resched(). So may lead to soft lockup. journal_finish_inode_data_buffers filemap_fdatawait_range_keep_errors __filemap_fdatawait_range while (index <= end) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, PAGECACHE_TAG_WRITEBACK); if (!nr_pages) break; --> If 'nr_pages' is equal zero will break, then will not call cond_resched() for (i = 0; i < nr_pages; i++) wait_on_page_writeback(page); cond_resched(); To solve above issue, add scheduling point in the journal_finish_inode_data_buffers(); Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231211112544.3879780-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9bdb377a348f..5e122586e06e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -270,6 +270,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, if (!ret) ret = err; } + cond_resched(); spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); -- cgit v1.2.3