From 32c80b32c053dc52712dedac5e4d0aa7c93fc353 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 13 Aug 2011 12:30:59 -0400 Subject: ext4: Resolve the hang of direct i/o read in handling EXT4_IO_END_UNWRITTEN. EXT4_IO_END_UNWRITTEN flag set and the increase of i_aiodio_unwritten should be done simultaneously since ext4_end_io_nolock always clear the flag and decrease the counter in the same time. We don't increase i_aiodio_unwritten when setting EXT4_IO_END_UNWRITTEN so it will go nagative and causes some process to wait forever. Part of the patch came from Eric in his e-mail, but it doesn't fix the problem met by Michael actually. http://marc.info/?l=linux-ext4&m=131316851417460&w=2 Reported-and-Tested-by: Michael Tokarev Signed-off-by: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/page-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 430c401d0895..78839af7ce29 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -334,8 +334,10 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh)) - io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.2.3 From 8c0bec2151a47906bf779c6715a10ce04453ab77 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Wed, 31 Aug 2011 11:50:51 -0400 Subject: ext4: remove i_mutex lock in ext4_evict_inode to fix lockdep complaining The i_mutex lock and flush_completed_IO() added by commit 2581fdc810 in ext4_evict_inode() causes lockdep complaining about potential deadlock in several places. In most/all of these LOCKDEP complaints it looks like it's a false positive, since many of the potential circular locking cases can't take place by the time the ext4_evict_inode() is called; but since at the very least it may mask real problems, we need to address this. This change removes the flush_completed_IO() and i_mutex lock in ext4_evict_inode(). Instead, we take a different approach to resolve the software lockup that commit 2581fdc810 intends to fix. Rather than having ext4-dio-unwritten thread wait for grabing the i_mutex lock of an inode, we use mutex_trylock() instead, and simply requeue the work item if we fail to grab the inode's i_mutex lock. This should speed up work queue processing in general and also prevents the following deadlock scenario: During page fault, shrink_icache_memory is called that in turn evicts another inode B. Inode B has some pending io_end work so it calls ext4_ioend_wait() that waits for inode B's i_ioend_count to become zero. However, inode B's ioend work was queued behind some of inode A's ioend work on the same cpu's ext4-dio-unwritten workqueue. As the ext4-dio-unwritten thread on that cpu is processing inode A's ioend work, it tries to grab inode A's i_mutex lock. Since the i_mutex lock of inode A is still hold before the page fault happened, we enter a deadlock. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 3 --- fs/ext4/page-io.c | 18 +++++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e717dfd2f2b4..b7d7bd0f066e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -175,6 +175,7 @@ struct mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 struct ext4_io_page { struct page *p_page; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c4da98a959ae..18d2558b7624 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -121,9 +121,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); ext4_ioend_wait(inode); if (inode->i_nlink) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 78839af7ce29..92f38ee13f8a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; - mutex_lock(&inode->i_mutex); + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. + */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } ret = ext4_end_io_nolock(io); if (ret < 0) { mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From d73d5046a72467d4510825b99e2269e09ad80e15 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 30 Oct 2011 18:26:08 -0400 Subject: ext4: Use correct locking for ext4_end_io_nolock() We must hold i_completed_io_lock when manipulating anything on the i_completed_io_list linked list. This includes io->lock, which we were checking in ext4_end_io_nolock(). So move this check to ext4_end_io_work(). This also has the bonus of avoiding extra work if it is already done without needing to take the mutex. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 3 --- fs/ext4/page-io.c | 14 +++++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c942924a0645..851ac5b3cec9 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -83,9 +83,6 @@ int ext4_flush_completed_IO(struct inode *inode) int ret = 0; int ret2 = 0; - if (list_empty(&ei->i_completed_io_list)) - return ret; - dump_completed_IO(inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&ei->i_completed_io_list)){ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 92f38ee13f8a..aed40966f342 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -87,6 +87,9 @@ void ext4_free_io_end(ext4_io_end_t *io) /* * check a range of space and convert unwritten extents to written. + * + * Called with inode->i_mutex; we depend on this when we manipulate + * io->flag, since we could otherwise race with ext4_flush_completed_IO() */ int ext4_end_io_nolock(ext4_io_end_t *io) { @@ -100,9 +103,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (list_empty(&io->list)) - return ret; - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) return ret; @@ -142,6 +142,13 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&io->list)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + goto free; + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + if (!mutex_trylock(&inode->i_mutex)) { /* * Requeue the work instead of waiting so that the work @@ -170,6 +177,7 @@ static void ext4_end_io_work(struct work_struct *work) list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); +free: ext4_free_io_end(io); } -- cgit v1.2.3 From 4e298021216727cc27017c5032ade86167c66256 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Oct 2011 18:41:19 -0400 Subject: ext4: remove unnecessary call to waitqueue_active() The usage of waitqueue_active() is not necessary, and introduces (I believe) a hard-to-hit race. Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index aed40966f342..4fa1d709b604 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page) void ext4_free_io_end(ext4_io_end_t *io) { int i; - wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -78,10 +77,8 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && - waitqueue_active(wq)) - wake_up_all(wq); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); } @@ -96,7 +93,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; - wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," @@ -121,11 +117,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->flag & EXT4_IO_END_UNWRITTEN) { io->flag &= ~EXT4_IO_END_UNWRITTEN; /* Wake up anyone waiting on unwritten extent conversion */ - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); } return ret; -- cgit v1.2.3 From b82e384c7bb9a19036b4daf58fa216df7cd48aa0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Oct 2011 10:56:32 -0400 Subject: ext4: optimize locking for end_io extent conversion Now that we are doing the locking correctly, we need to grab the i_completed_io_lock() twice per end_io. We can clean this up by removing the structure from the i_complted_io_list, and use this as the locking mechanism to prevent ext4_flush_completed_IO() racing against ext4_end_io_work(), instead of clearing the EXT4_IO_END_UNWRITTEN in io->flag. In addition, if the ext4_convert_unwritten_extents() returns an error, we no longer keep the end_io structure on the linked list. This doesn't help, because it tends to lock up the file system and wedges the system. That's one way to call attention to the problem, but it doesn't help the overall robustness of the system. Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 5 ++--- fs/ext4/page-io.c | 37 +++++++++++-------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 851ac5b3cec9..00a2cb753efd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,6 +88,7 @@ int ext4_flush_completed_IO(struct inode *inode) while (!list_empty(&ei->i_completed_io_list)){ io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); + list_del_init(&io->list); /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -104,11 +105,9 @@ int ext4_flush_completed_IO(struct inode *inode) */ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; - else - list_del_init(&io->list); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4fa1d709b604..9eebf44646f6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,28 +99,21 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) - return ret; - ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten " - "extents to written extents, error is %d " - "io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); } if (io->iocb) aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - if (io->flag & EXT4_IO_END_UNWRITTEN) { - io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); - } + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -133,16 +126,15 @@ static void ext4_end_io_work(struct work_struct *work) struct inode *inode = io->inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned long flags; - int ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); if (!mutex_trylock(&inode->i_mutex)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work * items queued after this can be processed. @@ -159,16 +151,9 @@ static void ext4_end_io_work(struct work_struct *work) io->flag |= EXT4_IO_END_QUEUED; return; } - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); + list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + (void) ext4_end_io_nolock(io); mutex_unlock(&inode->i_mutex); free: ext4_free_io_end(io); -- cgit v1.2.3 From 0edeb71dc9133bfb505d3bf59642e07cd936613e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 31 Oct 2011 17:30:44 -0400 Subject: ext4: Create helper function for EXT4_IO_END_UNWRITTEN and i_aiodio_unwritten EXT4_IO_END_UNWRITTEN flag set and the increase of i_aiodio_unwritten should be done simultaneously since ext4_end_io_nolock always clear the flag and decrease the counter in the same time. We have found some bugs that the flag is set while leaving i_aiodio_unwritten unchanged(commit 32c80b32c053d). So this patch just tries to create a helper function to wrap them to avoid any future bug. The idea is inspired by Eric. Cc: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 9 +++++++++ fs/ext4/extents.c | 18 ++++++------------ fs/ext4/inode.c | 5 +---- fs/ext4/page-io.c | 6 ++---- 4 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 657e82649fa5..604c200216c1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1275,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 36a0f177ecad..a5c8caaaa099 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3472,12 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -4030,12 +4027,9 @@ got_allocated_blocks: * that we need to perform conversion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e4b26faac5ff..60af5126eb05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2831,10 +2831,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) * but being more careful is always safe for the future change. */ inode = io_end->inode; - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + ext4_set_io_unwritten_flag(inode, io_end); /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9eebf44646f6..7ce1d0b19c94 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -336,10 +336,8 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + if (buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.2.3