From 495a276e1ca96af622edb67ad0f85431935b20d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:34 +0100 Subject: block_dev: get bdev inode bdi directly from the block device Directly grab the backing_dev_info from the request_queue instead of detouring through the address_space. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5fbfea..e8116a44cc29 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -69,10 +69,10 @@ EXPORT_SYMBOL(writeback_in_progress); static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; - +#ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return inode->i_mapping->backing_dev_info; - + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif return sb->s_bdi; } -- cgit v1.2.3 From de1414a654e66b81b5348dbc5259ecf2fb61655e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:36 +0100 Subject: fs: export inode_to_bdi and use it in favor of mapping->backing_dev_info Now that we got rid of the bdi abuse on character devices we can always use sb->s_bdi to get at the backing_dev_info for a file, except for the block device special case. Export inode_to_bdi and replace uses of mapping->backing_dev_info with it to prepare for the removal of mapping->backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/file.c | 2 +- fs/ceph/file.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fs-writeback.c | 3 ++- fs/fuse/file.c | 10 +++++----- fs/gfs2/aops.c | 2 +- fs/gfs2/super.c | 2 +- fs/nfs/filelayout/filelayout.c | 2 +- fs/nfs/write.c | 6 +++--- fs/ntfs/file.c | 3 ++- fs/ocfs2/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/backing-dev.h | 6 ++++-- include/trace/events/writeback.h | 6 +++--- mm/fadvise.c | 4 ++-- mm/filemap.c | 4 ++-- mm/filemap_xip.c | 3 ++- mm/page-writeback.c | 29 +++++++++++++---------------- mm/readahead.c | 4 ++-- mm/truncate.c | 2 +- mm/vmscan.c | 4 ++-- 22 files changed, 52 insertions(+), 50 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..835c04a874fd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - current->backing_dev_info = inode->i_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce74b394b49d..905986dd4c3c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7d66fb0e4cca..6c14bb8322fa 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode) struct ext2_group_desc * gdp; struct backing_dev_info *bdi; - bdi = inode->i_mapping->backing_dev_info; + bdi = inode_to_bdi(inode); if (bdi_read_congested(bdi)) return; if (bdi_write_congested(bdi)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 74c5f53595fb..ad88e601a6cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e8116a44cc29..a20b1145f4d5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -66,7 +66,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; #ifdef CONFIG_BLOCK @@ -75,6 +75,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) #endif return sb->s_bdi; } +EXPORT_SYMBOL_GPL(inode_to_bdi); static inline struct inode *wb_inode(struct list_head *head) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..19d80b82d344 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) { struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; list_del(&req->writepages_entry); @@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || old_req->state == FUSE_REQ_PENDING)) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); @@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37fed638..4ad4f94edebe 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -289,7 +289,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_writepage(page, wbc); if (unlikely(ret)) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b327f837de7..1666382b198d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); - struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7afb52f6a25a..51aa889611cf 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1081,7 +1081,7 @@ mds_commit: spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af3af685a9e3..298abcc5281b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -853,7 +853,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -1564,7 +1564,7 @@ void nfs_retry_commit(struct list_head *page_list, nfs_mark_request_commit(req, lseg, cinfo); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa44f22b..1da9b2d184dc 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -19,6 +19,7 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3950693dd0f6..abe7d98d6178 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2363,7 +2363,7 @@ relock: goto out_dio; } } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889..5684ac3e7d18 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -699,7 +699,7 @@ xfs_file_buffered_aio_write( iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 478f95d92d73..ed59dee03a71 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,6 +106,8 @@ struct backing_dev_info { #endif }; +struct backing_dev_info *inode_to_bdi(struct inode *inode); + int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -303,12 +305,12 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { - return bdi_cap_writeback_dirty(mapping->backing_dev_info); + return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); } static inline bool mapping_cap_account_dirty(struct address_space *mapping) { - return bdi_cap_account_dirty(mapping->backing_dev_info); + return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); } static inline int bdi_sched_wait(void *word) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..74f5207bd090 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -47,7 +47,7 @@ TRACE_EVENT(writeback_dirty_page, TP_fast_assign( strncpy(__entry->name, - mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32); + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ), TP_fast_assign( - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strncpy(__entry->name, @@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_fast_assign( strncpy(__entry->name, - dev_name(inode->i_mapping->backing_dev_info->dev), 32); + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; ), diff --git a/mm/fadvise.c b/mm/fadvise.c index 2ad7adf4f0a4..fac23ecf8d72 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = mapping->backing_dev_info; + bdi = inode_to_bdi(mapping->host); switch (advice) { case POSIX_FADV_NORMAL: @@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(mapping->backing_dev_info)) + if (!bdi_write_congested(bdi)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..5d7c23c26f81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); } } @@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..26897fbfbe19 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -410,7 +411,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, count = len; /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..d4cbb4bd7d1c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long task_ratelimit; unsigned long dirty_ratelimit; unsigned long pos_ratio; - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); int ratelimit; int *p; @@ -1929,7 +1929,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2295,7 +2297,7 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); return 1; } @@ -2315,7 +2317,7 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2352,7 +2354,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2406,12 +2408,7 @@ EXPORT_SYMBOL(mapping_tagged); */ void wait_for_stable_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; - - if (!bdi_cap_stable_pages_required(bdi)) - return; - - wait_on_page_writeback(page); + if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -27,7 +27,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(inode_to_bdi(mapping->host))) return; /* do read-ahead */ diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); diff --git a/mm/vmscan.c b/mm/vmscan.c index ab2505c3ef54..e00a16393f21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -497,7 +497,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -876,7 +876,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(mapping->backing_dev_info)) || + bdi_write_congested(inode_to_bdi(mapping->host))) || (writeback && PageReclaim(page))) nr_congested++; -- cgit v1.2.3 From b520252aa287b14e1f39a51e20051775b273b82a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2015 08:13:17 -0700 Subject: fs: make inode_to_bdi() handle NULL inode Running a heavy fs workload, I ran into a situation where we pass down a page for writeback/swap that doesn't have an inode mapping: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] inode_to_bdi+0xf/0x50 PGD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: wl(O) tun cfg80211 btusb joydev hid_apple hid_generic usbhid hid bcm5974 usb_storage nouveau snd_hda_codec_hdmi snd_hda_codec_cirrus snd_hda_codec_generic x86_pkg_temp_thermal snd_hda_intel kvm_intel snd_hda_controller snd_hda_codec kvm snd_hwdep snd_pcm applesmc input_polldev snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_timer snd_seq_device snd xhci_pci xhci_hcd ttm thunderbolt soundcore apple_gmux apple_bl bluetooth binfmt_misc fuse nls_iso8859_1 nls_cp437 vfat fat [last unloaded: wl] CPU: 4 PID: 50 Comm: kswapd0 Tainted: G U O 3.19.0-rc5+ #60 Hardware name: Apple Inc. MacBookPro11,3/Mac-2BD1B31983FE1663, BIOS MBP112.88Z.0138.B02.1310181745 10/18/2013 task: ffff880462e917f0 ti: ffff880462edc000 task.ti: ffff880462edc000 RIP: 0010:[] [] inode_to_bdi+0xf/0x50 RSP: 0000:ffff880462edf8e8 EFLAGS: 00010282 RAX: ffffffff81c4cd80 RBX: ffffea0001b3abc0 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff880462edf8f8 R08: 00000000001e8500 R09: ffff880460f7cb68 R10: ffff880462edfa00 R11: 0000000000000101 R12: 0000000000000000 R13: ffffffff81c4cd98 R14: 0000000000000000 R15: ffff880460f7c9c0 FS: 0000000000000000(0000) GS:ffff88047f300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000028 CR3: 00000002b6341000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffffea0001b3abc0 ffffffff81c4cd80 ffff880462edf948 ffffffff811244aa ffffffff811565b0 ffff880460f7c9c0 ffff880462edf948 ffffea0001b3abc0 0000000000000001 ffff880462edfb40 ffff880008b999c0 ffff880460f7c9c0 Call Trace: [] __test_set_page_writeback+0x3a/0x170 [] ? SyS_madvise+0x790/0x790 [] __swap_writepage+0x216/0x280 [] ? radix_tree_insert+0x32/0xe0 [] ? swap_info_get+0x61/0xf0 [] ? page_swapcount+0x4c/0x60 [] swap_writepage+0x2d/0x50 [] shmem_writepage+0x198/0x2c0 [] shrink_page_list+0x464/0xa00 [] shrink_inactive_list+0x266/0x500 [] shrink_lruvec+0x5d5/0x720 [] shrink_zone+0x5b/0x190 [] kswapd+0x48f/0x8d0 [] ? try_to_free_pages+0x4c0/0x4c0 [] kthread+0xd2/0xf0 [] ? workqueue_congested+0x30/0x80 [] ? kthread_create_on_node+0x180/0x180 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x180/0x180 Code: 00 48 c7 c7 8d 8d a4 81 e8 3f 62 eb ff e9 fc fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 54 49 89 fc 53 <48> 8b 5f 28 48 89 df e8 15 f8 00 00 85 c0 75 11 48 8b 83 d8 00 RIP [] inode_to_bdi+0xf/0x50 RSP CR2: 0000000000000028 ---[ end trace eb0e21aa7dad3ddf ]--- Handle this in inode_to_bdi() by punting it to noop_backing_dev_info, if mapping->host is NULL. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a20b1145f4d5..c399152de397 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -68,7 +68,12 @@ EXPORT_SYMBOL(writeback_in_progress); struct backing_dev_info *inode_to_bdi(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return blk_get_backing_dev_info(I_BDEV(inode)); -- cgit v1.2.3 From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:00 -0500 Subject: vfs: add support for a lazytime mount option Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) if the inode needs to be updated for some non-time related change, (b) if userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests via a fsync(2) call. For workloads which feature a large number of random write to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++ fs/fs-writeback.c | 62 +++++++++++++++++++++++++++++++++------- fs/gfs2/file.c | 4 +-- fs/inode.c | 56 +++++++++++++++++++++++++----------- fs/jfs/file.c | 2 +- fs/libfs.c | 2 +- fs/proc_namespace.c | 1 + fs/sync.c | 8 ++++++ include/linux/backing-dev.h | 1 + include/linux/fs.h | 5 ++++ include/trace/events/writeback.h | 60 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/fs.h | 4 ++- mm/backing-dev.c | 10 +++++-- 13 files changed, 186 insertions(+), 35 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..628df5ba44a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5fbfea..004686191354 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } +#define EXPIRE_DIRTY_ATIME 0x0001 + /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, + int flags, struct wb_writeback_work *work) { + unsigned long *older_than_this = NULL; + unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if ((work->reason == WB_REASON_SYNC) == 0) { + expire_time = jiffies - (HZ * 86400); + older_than_this = &expire_time; + } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (work->older_than_this && - inode_dirtied_after(inode, *work->older_than_this)) + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -309,9 +322,12 @@ out: static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); trace_writeback_queue_io(wb, work, moved); } @@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * updates after data IO completion. */ redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ list_del_init(&inode->i_wb_list); @@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~I_DIRTY; + if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && + (inode->i_state & I_DIRTY_TIME)) || + (inode->i_state & I_DIRTY_TIME_EXPIRED)) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows @@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_unlock(&inode->i_lock); + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; @@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY) && + if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; @@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) list_del_init(&inode->i_wb_list); spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb, wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); @@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); /* * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode(inode, flags); } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, dirtytime ? + &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..15c44cf457cc 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode->i_state & I_DIRTY_ALL; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~I_DIRTY_SYNC; + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); if (sync_state) { ret = sync_inode_metadata(inode, 1); diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..4feb85cc125f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include /* for inode_has_buffers */ #include #include +#include #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. - */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 33aa0cc1f8b8..10815f8dfd8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; mutex_lock(&inode->i_mutex); - if (!(inode->i_state & I_DIRTY) || + if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); diff --git a/fs/libfs.c b/fs/libfs.c index 005843ce5dbd..b2ffdb045be4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 0f96f71ab32b..8db932da4009 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, + { MS_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/sync.c b/fs/sync.c index 01d9f18a70b5..fbc98ee62044 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd) */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { + struct inode *inode = file->f_mapping->host; + if (!file->f_op->fsync) return -EINVAL; + if (!datasync && (inode->i_state & I_DIRTY_TIME)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012b7a14..4cdf7336f64a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -55,6 +55,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..cd027ce2c705 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1746,8 +1746,12 @@ struct super_operations { #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) +#define I_DIRTY_TIME (1 << 11) +#define __I_DIRTY_TIME_EXPIRED 12 +#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) @@ -1910,6 +1914,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +extern int generic_update_time(struct inode *, struct timespec *, int); static inline struct inode *file_inode(const struct file *f) { diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..5ecb4c234625 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -18,6 +18,8 @@ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ + {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ + {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) @@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) __field(unsigned long, ino) + __field(unsigned long, state) __field(unsigned long, flags) ), @@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, strncpy(__entry->name, bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; + __entry->state = inode->i_state; __entry->flags = flags; ), - TP_printk("bdi %s: ino=%lu flags=%s", + TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, __entry->ino, + show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); +DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, + + TP_PROTO(struct inode *inode, int flags), + + TP_ARGS(inode, flags) +); + DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), @@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_ARGS(inode, wbc, nr_to_write) ); +DECLARE_EVENT_CLASS(writeback_lazytime_template, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, ino ) + __field(unsigned long, state ) + __field( __u16, mode ) + __field(unsigned long, dirtied_when ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->mode = inode->i_mode; + __entry->dirtied_when = inode->dirtied_when; + ), + + TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->dirtied_when, + show_inode_state(__entry->state), __entry->mode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 3735fa0a6784..9b964a5920af 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,6 +90,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define MS_NOSEC (1<<28) @@ -100,7 +101,8 @@ struct inodes_stat_t { /* * Superblock flags that can be altered by MS_REMOUNT */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) /* * Old magic mount flag and mask diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..915feea94c66 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty, nr_io, nr_more_io, + nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } -- cgit v1.2.3 From eb6ef3df4faa5424cf2a24b4e4f3eeceb1482a8e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 Feb 2015 20:19:35 +0300 Subject: trylock_super(): replacement for grab_super_passive() I've noticed significant locking contention in memory reclaimer around sb_lock inside grab_super_passive(). Grab_super_passive() is called from two places: in icache/dcache shrinkers (function super_cache_scan) and from writeback (function __writeback_inodes_wb). Both are required for progress in memory allocator. Grab_super_passive() acquires sb_lock to increment sb->s_count and check sb->s_instances. It seems sb->s_umount locked for read is enough here: super-block deactivation always runs under sb->s_umount locked for write. Protecting super-block itself isn't a problem: in super_cache_scan() sb is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers are still active. Inside writeback super-block comes from inode from bdi writeback list under wb->list_lock. This patch removes locking sb_lock and checks s_instances under s_umount: generic_shutdown_super() unlinks it under sb->s_umount locked for write. New variant is called trylock_super() and since it only locks semaphore, callers must call up_read(&sb->s_umount) instead of drop_super(sb) when they're done. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Al Viro --- fs/fs-writeback.c | 6 +++--- fs/internal.h | 2 +- fs/super.c | 40 ++++++++++++++++++---------------------- 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 073657f755d4..e907052eeadb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -769,9 +769,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!grab_super_passive(sb)) { + if (!trylock_super(sb)) { /* - * grab_super_passive() may fail consistently due to + * trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ @@ -779,7 +779,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, continue; } wrote += writeback_sb_inodes(sb, wb, work); - drop_super(sb); + up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { diff --git a/fs/internal.h b/fs/internal.h index 30459dab409d..01dce1d1476b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,7 +84,7 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); -extern bool grab_super_passive(struct super_block *sb); +extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); diff --git a/fs/super.c b/fs/super.c index 65a53efc1cf4..2b7dc90ccdbb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -71,7 +71,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!grab_super_passive(sb)) + if (!trylock_super(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) @@ -105,7 +105,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - drop_super(sb); + up_read(&sb->s_umount); return freed; } @@ -118,7 +118,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call grab_super_passive as it is a potential + * Don't call trylock_super as it is a potential * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. * Call to super_cache_count with shrinker_rwsem held @@ -348,35 +348,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock) } /* - * grab_super_passive - acquire a passive reference + * trylock_super - try to grab ->s_umount shared * @sb: reference we are trying to grab * - * Tries to acquire a passive reference. This is used in places where we + * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the - * superblock does not go away while we are working on it. It returns - * false if a reference was not gained, and returns true with the s_umount - * lock held in read mode if a reference is gained. On successful return, - * the caller must drop the s_umount lock and the passive reference when - * done. + * filesystem is not shut down while we are working on it. It returns + * false if we cannot acquire s_umount or if we lose the race and + * filesystem already got into shutdown, and returns true with the s_umount + * lock held in read mode in case of success. On successful return, + * the caller must drop the s_umount lock when done. + * + * Note that unlike get_super() et.al. this one does *not* bump ->s_count. + * The reason why it's safe is that we are OK with doing trylock instead + * of down_read(). There's a couple of places that are OK with that, but + * it's very much not a general-purpose interface. */ -bool grab_super_passive(struct super_block *sb) +bool trylock_super(struct super_block *sb) { - spin_lock(&sb_lock); - if (hlist_unhashed(&sb->s_instances)) { - spin_unlock(&sb_lock); - return false; - } - - sb->s_count++; - spin_unlock(&sb_lock); - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (!hlist_unhashed(&sb->s_instances) && + sb->s_root && (sb->s_flags & MS_BORN)) return true; up_read(&sb->s_umount); } - put_super(sb); return false; } -- cgit v1.2.3