From 4f2ac93c175c4922bdddbfec6cad94b32cea0070 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 15:23:14 -0700 Subject: block: Remove bi_idx references For immutable bvecs, all bi_idx usage needs to be audited - so here we're removing all the unnecessary uses. Most of these are places where it was being initialized on a bio that was just allocated, a few others are conversions to standard macros. Signed-off-by: Kent Overstreet CC: Jens Axboe --- mm/page_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32ee486..8d3c0c088105 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -35,7 +35,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } -- cgit v1.2.3 From 6bc454d150047fcfbf53346412e64cdf3bf61a79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:30:37 -0700 Subject: bounce: Refactor __blk_queue_bounce to not use bi_io_vec A bunch of what __blk_queue_bounce() was doing was problematic for the immutable bvec work; this cleans that up and the code is quite a bit smaller, too. The __bio_for_each_segment() in copy_to_high_bio_irq() was changed because that one's looping over the original bio, not the bounce bio - a later patch renames __bio_for_each_segment() -> bio_for_each_segment_all(), and documents that bio_for_each_segment_all() is only for code that owns the bio. Signed-off-by: Kent Overstreet CC: Jens Axboe --- mm/bounce.c | 73 ++++++++++++++++--------------------------------------------- 1 file changed, 19 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/bounce.c b/mm/bounce.c index 5f8901768602..55f512af50c7 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) struct bio_vec *tovec, *fromvec; int i; - __bio_for_each_segment(tovec, to, i, 0) { + bio_for_each_segment(tovec, to, i) { fromvec = from->bi_io_vec + i; /* @@ -218,78 +218,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool, int force) { - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); + struct bio *bio; + int rw = bio_data_dir(*bio_orig); struct bio_vec *to, *from; + unsigned i; - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; + bio_for_each_segment(from, *bio_orig, i) + if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + goto bounce; - /* - * is destination page below bounce pfn? - */ - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) - continue; + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - /* - * irk, bounce it - */ - if (!bio) { - unsigned int cnt = (*bio_orig)->bi_vcnt; - - bio = bio_alloc(GFP_NOIO, cnt); - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); - } - + bio_for_each_segment(to, bio, i) { + struct page *page = to->bv_page; - to = bio->bi_io_vec + i; + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); if (rw == WRITE) { char *vto, *vfrom; - flush_dcache_page(from->bv_page); + flush_dcache_page(page); + vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); + kunmap_atomic(vfrom); } } - /* - * no pages bounced - */ - if (!bio) - return; - trace_block_bio_bounce(q, *bio_orig); - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; if (pool == page_pool) { bio->bi_end_io = bounce_end_io_write; -- cgit v1.2.3 From d74c6d514fe314b8bdab58b487b25992291577ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2013 12:23:11 -0800 Subject: block: Add bio_for_each_segment_all() __bio_for_each_segment() iterates bvecs from the specified index instead of bio->bv_idx. Currently, the only usage is to walk all the bvecs after the bio has been advanced by specifying 0 index. For immutable bvecs, we need to split these apart; bio_for_each_segment() is going to have a different implementation. This will also help document the intent of code that's using it - bio_for_each_segment_all() is only legal to use for code that owns the bio. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Neil Brown CC: Boaz Harrosh --- drivers/block/rbd.c | 2 +- drivers/md/raid1.c | 2 +- fs/bio.c | 12 ++++++------ fs/exofs/ore.c | 2 +- fs/exofs/ore_raid.c | 2 +- include/linux/bio.h | 17 ++++++++++++++--- mm/bounce.c | 2 +- 7 files changed, 25 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c81a4c040b9..11e179826b60 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -952,7 +952,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, /* Find first affected segment... */ resid = offset; - __bio_for_each_segment(bv, bio_src, idx, 0) { + bio_for_each_segment(bv, bio_src, idx) { if (resid < bv->bv_len) break; resid -= bv->bv_len; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0a3988a25aab..853482015d3d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1291,7 +1291,7 @@ read_again: * know the original bi_idx, so we just free * them all */ - __bio_for_each_segment(bvec, mbio, j, 0) + bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); diff --git a/fs/bio.c b/fs/bio.c index e437f9aae67d..618f9044c414 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -961,7 +961,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -1143,7 +1143,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1357,7 +1357,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1463,7 +1463,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1503,7 +1503,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1789,7 +1789,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/include/linux/bio.h b/include/linux/bio.h index 90d36c65cb70..be2efa09f9bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -137,16 +137,27 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define bio_io_error(bio) bio_endio((bio), -EIO) /* - * drivers should not use the __ version unless they _really_ want to - * run through the entire bio and not just pending pieces + * drivers should not use the __ version unless they _really_ know what + * they're doing */ #define __bio_for_each_segment(bvl, bio, i, start_idx) \ for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ i < (bio)->bi_vcnt; \ bvl++, i++) +/* + * drivers should _never_ use the all version - the bio may have been split + * before it got to the driver and the driver won't own all of it + */ +#define bio_for_each_segment_all(bvl, bio, i) \ + for (i = 0; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) + #define bio_for_each_segment(bvl, bio, i) \ - __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) + for (i = (bio)->bi_idx; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) /* * get a reference to a bio, so it won't disappear. the intended use is diff --git a/mm/bounce.c b/mm/bounce.c index 55f512af50c7..2ee1b6fef44a 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; -- cgit v1.2.3 From cb34e057ad22a1c2c6f2cb6cd1cbd05cc2f28f28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Sep 2012 15:22:02 -0700 Subject: block: Convert some code to bio_for_each_segment_all() More prep work for immutable bvecs: A few places in the code were either open coding or using the wrong version - fix. After we introduce the bvec iter, it'll no longer be possible to modify the biovec through bio_for_each_segment_all() - it doesn't increment a pointer to the current bvec, you pass in a struct bio_vec (not a pointer) which is updated with what the current biovec would be (taking into account bi_bvec_done and bi_size). So because of that it's more worthwhile to be consistent about bio_for_each_segment()/bio_for_each_segment_all() usage. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Alexander Viro --- drivers/md/dm-crypt.c | 3 +-- drivers/md/raid1.c | 10 +++------- fs/bio.c | 20 ++++++++++---------- fs/direct-io.c | 8 ++++---- mm/bounce.c | 2 +- 5 files changed, 19 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 13c15480d940..6d2d41ae9e32 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) unsigned int i; struct bio_vec *bv; - for (i = 0; i < clone->bi_vcnt; i++) { - bv = bio_iovec_idx(clone, i); + bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 853482015d3d..a7ea954abe1d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -925,7 +925,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) if (unlikely(!bvecs)) return; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { bvecs[i] = *bvec; bvecs[i].bv_page = alloc_page(GFP_NOIO); if (unlikely(!bvecs[i].bv_page)) @@ -1284,12 +1284,8 @@ read_again: struct bio_vec *bvec; int j; - /* Yes, I really want the '__' version so that - * we clear any unused pointer in the io_vec, rather - * than leave them unchanged. This is important - * because when we come to free the pages, we won't - * know the original bi_idx, so we just free - * them all + /* + * We trimmed the bio, so _all is legit */ bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; diff --git a/fs/bio.c b/fs/bio.c index 618f9044c414..fe3aee90c988 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1548,11 +1548,11 @@ EXPORT_SYMBOL(bio_copy_kern); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page && !PageCompound(page)) set_page_dirty_lock(page); @@ -1561,11 +1561,11 @@ void bio_set_pages_dirty(struct bio *bio) static void bio_release_pages(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page) put_page(page); @@ -1614,16 +1614,16 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int nr_clean_pages = 0; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { page_cache_release(page); - bvec[i].bv_page = NULL; + bvec->bv_page = NULL; } else { nr_clean_pages++; } diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..38484b08a39a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -441,8 +441,8 @@ static struct bio *dio_await_one(struct dio *dio) static int dio_bio_complete(struct dio *dio, struct bio *bio) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec; - int page_no; + struct bio_vec *bvec; + unsigned i; if (!uptodate) dio->io_error = -EIO; @@ -450,8 +450,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { - struct page *page = bvec[page_no].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (dio->rw == READ && !PageCompound(page)) set_page_dirty_lock(page); diff --git a/mm/bounce.c b/mm/bounce.c index 2ee1b6fef44a..f5326b24d65d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -231,7 +231,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bounce: bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - bio_for_each_segment(to, bio, i) { + bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) -- cgit v1.2.3 From 181387da2d64c3129e5b5186c4dd388bc5041d53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: remove unused bdi_pending_list There's no user left. Remove it. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu --- include/linux/backing-dev.h | 1 - mm/backing-dev.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 350459910fe1..a5ef27f5411a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,7 +130,6 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41733c5dc820..657569b3fcf6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,11 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { -- cgit v1.2.3 From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. there's no reason for writeback to implement its own worker pool when using unbound workqueue instead is much simpler and more efficient. This patch replaces custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the followings are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- fs/fs-writeback.c | 102 +++++----------- include/linux/backing-dev.h | 15 +-- include/trace/events/writeback.h | 5 - mm/backing-dev.c | 255 +++++---------------------------------- 4 files changed, 65 insertions(+), 312 deletions(-) (limited to 'mm') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,6 +37,9 @@ static struct class *bdi_class; DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -255,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. - */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. - */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. */ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); -- cgit v1.2.3 From b5c872ddb7083c7909fb76a170c3807e04564bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: expose the bdi_wq workqueue There are cases where userland wants to tweak the priority and affinity of writeback flushers. Expose bdi_wq to userland by setting WQ_SYSFS. It appears under /sys/bus/workqueue/devices/writeback/ and allows adjusting maximum concurrency level, cpumask and nice level. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer Cc: Kay Sievers Cc: Greg Kroah-Hartman --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2857d4f6bca4..502517492258 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -259,7 +259,7 @@ static int __init default_bdi_init(void) int err; bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND, 0); + WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; -- cgit v1.2.3