summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2009-08-13 13:53:31 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2009-08-13 13:53:31 +1000
commit0bf659a9c123837734cbdd83d8871f8cec95f251 (patch)
tree040519982358c2c47b44fca53a07371eea23ed9e
parentec96d9947e85adb78d1c89f6acca210c7a6dec52 (diff)
parente1a27d56b6994ce540cdf94a7ed3e579a8e6c0bc (diff)
Merge commit 'block/for-next'
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-core.c157
-rw-r--r--block/blk-iopoll.c227
-rw-r--r--block/blk-merge.c49
-rw-r--r--block/blk.h1
-rw-r--r--block/cfq-iosched.c69
-rw-r--r--block/elevator.c16
-rw-r--r--block/genhd.c18
-rw-r--r--drivers/block/aoe/aoeblk.c1
-rw-r--r--drivers/block/loop.c2
-rw-r--r--drivers/block/osdblk.c2
-rw-r--r--drivers/char/mem.c1
-rw-r--r--drivers/md/dm-raid1.c2
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm.c12
-rw-r--r--drivers/md/linear.c2
-rw-r--r--drivers/md/multipath.c4
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/md/raid1.c14
-rw-r--r--drivers/md/raid10.c6
-rw-r--r--drivers/md/raid5.c2
-rw-r--r--drivers/scsi/scsi_lib.c6
-rw-r--r--drivers/staging/dst/dcore.c5
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/char_dev.c1
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/fs-writeback.c766
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/hugetlbfs/inode.c1
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/super.c3
-rw-r--r--fs/sync.c2
-rw-r--r--fs/sysfs/inode.c1
-rw-r--r--fs/ubifs/super.c1
-rw-r--r--include/linux/backing-dev.h71
-rw-r--r--include/linux/bio.h69
-rw-r--r--include/linux/blk-iopoll.h48
-rw-r--r--include/linux/blkdev.h32
-rw-r--r--include/linux/fs.h11
-rw-r--r--include/linux/interrupt.h1
-rw-r--r--include/linux/writeback.h15
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/sysctl.c10
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c533
-rw-r--r--mm/page-writeback.c157
-rw-r--r--mm/pdflush.c269
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/vmscan.c2
53 files changed, 1851 insertions, 760 deletions
diff --git a/block/Makefile b/block/Makefile
index 6c54ed0ff755..ba74ca6bfa14 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a77a0d8..5b59592b8bf7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+ q->backing_dev_info.name = "block";
err = bdi_init(&q->backing_dev_info);
if (err) {
@@ -1111,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio)
req->cmd_type = REQ_TYPE_FS;
/*
- * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+ * Inherit FAILFAST from bio (for read-ahead, and explicit
+ * FAILFAST). FAILFAST flags are identical for req and bio.
*/
- if (bio_rw_ahead(bio))
- req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
- REQ_FAILFAST_DRIVER);
- if (bio_failfast_dev(bio))
- req->cmd_flags |= REQ_FAILFAST_DEV;
- if (bio_failfast_transport(bio))
- req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
- if (bio_failfast_driver(bio))
- req->cmd_flags |= REQ_FAILFAST_DRIVER;
-
- if (unlikely(bio_discard(bio))) {
+ if (bio_rw_flagged(bio, BIO_RW_AHEAD))
+ req->cmd_flags |= REQ_FAILFAST_MASK;
+ else
+ req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
+
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
req->cmd_flags |= REQ_DISCARD;
- if (bio_barrier(bio))
+ if (bio_rw_flagged(bio, BIO_RW_BARRIER))
req->cmd_flags |= REQ_SOFTBARRIER;
req->q->prepare_discard_fn(req->q, req);
- } else if (unlikely(bio_barrier(bio)))
+ } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
req->cmd_flags |= REQ_HARDBARRIER;
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
req->cmd_flags |= REQ_RW_SYNC;
- if (bio_rw_meta(bio))
+ if (bio_rw_flagged(bio, BIO_RW_META))
req->cmd_flags |= REQ_RW_META;
- if (bio_noidle(bio))
+ if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
req->cmd_flags |= REQ_NOIDLE;
req->errors = 0;
@@ -1150,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
*/
static inline bool queue_should_plug(struct request_queue *q)
{
- return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+ return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
}
static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1159,11 +1156,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
int el_ret;
unsigned int bytes = bio->bi_size;
const unsigned short prio = bio_prio(bio);
- const int sync = bio_sync(bio);
- const int unplug = bio_unplug(bio);
+ const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+ const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
+ const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
int rw_flags;
- if (bio_barrier(bio) && bio_has_data(bio) &&
+ if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
(q->next_ordered == QUEUE_ORDERED_NONE)) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
@@ -1177,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
spin_lock_irq(q->queue_lock);
- if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
goto get_rq;
el_ret = elv_merge(q, &req, bio);
@@ -1190,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
trace_block_bio_backmerge(q, bio);
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bytes;
@@ -1209,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
trace_block_bio_frontmerge(q, bio);
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
+ blk_rq_set_mixed_merge(req);
+ req->cmd_flags &= ~REQ_FAILFAST_MASK;
+ req->cmd_flags |= ff;
+ }
+
bio->bi_next = req->bio;
req->bio = bio;
@@ -1464,7 +1471,8 @@ static inline void __generic_make_request(struct bio *bio)
if (bio_check_eod(bio, nr_sectors))
goto end_io;
- if (bio_discard(bio) && !q->prepare_discard_fn) {
+ if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+ !q->prepare_discard_fn) {
err = -EOPNOTSUPP;
goto end_io;
}
@@ -1653,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+/**
+ * blk_rq_err_bytes - determine number of bytes till the next failure boundary
+ * @rq: request to examine
+ *
+ * Description:
+ * A request could be merge of IOs which require different failure
+ * handling. This function determines the number of bytes which
+ * can be failed from the beginning of the request without
+ * crossing into area which need to be retried further.
+ *
+ * Return:
+ * The number of bytes to fail.
+ *
+ * Context:
+ * queue_lock must be held.
+ */
+unsigned int blk_rq_err_bytes(const struct request *rq)
+{
+ unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+ unsigned int bytes = 0;
+ struct bio *bio;
+
+ if (!(rq->cmd_flags & REQ_MIXED_MERGE))
+ return blk_rq_bytes(rq);
+
+ /*
+ * Currently the only 'mixing' which can happen is between
+ * different fastfail types. We can safely fail portions
+ * which have all the failfast bits that the first one has -
+ * the ones which are at least as eager to fail as the first
+ * one.
+ */
+ for (bio = rq->bio; bio; bio = bio->bi_next) {
+ if ((bio->bi_rw & ff) != ff)
+ break;
+ bytes += bio->bi_size;
+ }
+
+ /* this could lead to infinite loop */
+ BUG_ON(blk_rq_bytes(rq) && !bytes);
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
@@ -1806,8 +1858,15 @@ void blk_dequeue_request(struct request *rq)
* and to it is freed is accounted as io that is in progress at
* the driver side.
*/
- if (blk_account_rq(rq))
+ if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]++;
+ /*
+ * Mark this device as supporting hardware queuing, if
+ * we have more IOs in flight than 4.
+ */
+ if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
+ set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
+ }
}
/**
@@ -1999,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (blk_fs_request(req) || blk_discard_rq(req))
req->__sector += total_bytes >> 9;
+ /* mixed attributes always follow the first bio */
+ if (req->cmd_flags & REQ_MIXED_MERGE) {
+ req->cmd_flags &= ~REQ_FAILFAST_MASK;
+ req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
+ }
+
/*
* If total number of sectors is less than the first segment
* size, something has gone terribly wrong.
@@ -2178,6 +2243,25 @@ bool blk_end_request_cur(struct request *rq, int error)
EXPORT_SYMBOL(blk_end_request_cur);
/**
+ * blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ * Complete @rq till the next failure boundary.
+ *
+ * Return:
+ * %false - we are done with this request
+ * %true - still buffers pending for this request
+ */
+bool blk_end_request_err(struct request *rq, int error)
+{
+ WARN_ON(error >= 0);
+ return blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_err);
+
+/**
* __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
* @error: %0 for success, < %0 for error
@@ -2236,12 +2320,31 @@ bool __blk_end_request_cur(struct request *rq, int error)
}
EXPORT_SYMBOL(__blk_end_request_cur);
+/**
+ * __blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ * Complete @rq till the next failure boundary. Must be called
+ * with queue lock held.
+ *
+ * Return:
+ * %false - we are done with this request
+ * %true - still buffers pending for this request
+ */
+bool __blk_end_request_err(struct request *rq, int error)
+{
+ WARN_ON(error >= 0);
+ return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_err);
+
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
- we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
- rq->cmd_flags |= (bio->bi_rw & 3);
+ /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
+ rq->cmd_flags |= bio->bi_rw & REQ_RW;
if (bio_has_data(bio)) {
rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 000000000000..ca564202ed7a
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,227 @@
+/*
+ * Functions related to interrupt-poll handling in the block layer. This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blk-iopoll.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+
+int blk_iopoll_enabled = 1;
+EXPORT_SYMBOL(blk_iopoll_enabled);
+
+static unsigned int blk_iopoll_budget __read_mostly = 256;
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * blk_iopoll_sched - Schedule a run of the iopoll handler
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ * Add this blk_iopoll structure to the pending poll list and trigger the
+ * raise of the blk iopoll softirq. The driver must already have gotten a
+ * succesful return from blk_iopoll_sched_prep() before calling this.
+ **/
+void blk_iopoll_sched(struct blk_iopoll *iop)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+ __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_sched);
+
+/**
+ * __blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ * See blk_iopoll_complete(). This function must be called with interrupts
+ * disabled.
+ **/
+void __blk_iopoll_complete(struct blk_iopoll *iop)
+{
+ list_del(&iop->list);
+ smp_mb__before_clear_bit();
+ clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(__blk_iopoll_complete);
+
+/**
+ * blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ * If a driver consumes less than the assigned budget in its run of the
+ * iopoll handler, it'll end the polled mode by calling this function. The
+ * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
+ * is called.
+ **/
+void blk_iopoll_complete(struct blk_iopoll *iopoll)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __blk_iopoll_complete(iopoll);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_complete);
+
+static void blk_iopoll_softirq(struct softirq_action *h)
+{
+ struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+ int rearm = 0, budget = blk_iopoll_budget;
+ unsigned long start_time = jiffies;
+
+ local_irq_disable();
+
+ while (!list_empty(list)) {
+ struct blk_iopoll *iop;
+ int work, weight;
+
+ /*
+ * If softirq window is exhausted then punt.
+ */
+ if (budget <= 0 || time_after(jiffies, start_time)) {
+ rearm = 1;
+ break;
+ }
+
+ local_irq_enable();
+
+ /* Even though interrupts have been re-enabled, this
+ * access is safe because interrupts can only add new
+ * entries to the tail of this list, and only ->poll()
+ * calls can remove this head entry from the list.
+ */
+ iop = list_entry(list->next, struct blk_iopoll, list);
+
+ weight = iop->weight;
+ work = 0;
+ if (test_bit(IOPOLL_F_SCHED, &iop->state))
+ work = iop->poll(iop, weight);
+
+ budget -= work;
+
+ local_irq_disable();
+
+ /*
+ * Drivers must not modify the iopoll state, if they
+ * consume their assigned weight (or more, some drivers can't
+ * easily just stop processing, they have to complete an
+ * entire mask of commands).In such cases this code
+ * still "owns" the iopoll instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (work >= weight) {
+ if (blk_iopoll_disable_pending(iop))
+ __blk_iopoll_complete(iop);
+ else
+ list_move_tail(&iop->list, list);
+ }
+ }
+
+ if (rearm)
+ __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+
+ local_irq_enable();
+}
+
+/**
+ * blk_iopoll_disable - Disable iopoll on this @iop
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ * Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void blk_iopoll_disable(struct blk_iopoll *iop)
+{
+ set_bit(IOPOLL_F_DISABLE, &iop->state);
+ while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
+ msleep(1);
+ clear_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_disable);
+
+/**
+ * blk_iopoll_enable - Enable iopoll on this @iop
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ * Enable iopoll on this @iop. Note that the handler run will not be
+ * scheduled, it will only mark it as active.
+ **/
+void blk_iopoll_enable(struct blk_iopoll *iop)
+{
+ BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
+ smp_mb__before_clear_bit();
+ clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_enable);
+
+/**
+ * blk_iopoll_init - Initialize this @iop
+ * @iop: The parent iopoll structure
+ * @weight: The default weight (or command completion budget)
+ * @poll_fn: The handler to invoke
+ *
+ * Description:
+ * Initialize this blk_iopoll structure. Before being actively used, the
+ * driver must call blk_iopoll_enable().
+ **/
+void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
+{
+ memset(iop, 0, sizeof(*iop));
+ INIT_LIST_HEAD(&iop->list);
+ iop->weight = weight;
+ iop->poll = poll_fn;
+ set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_init);
+
+static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+ &__get_cpu_var(blk_cpu_iopoll));
+ __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
+ .notifier_call = blk_iopoll_cpu_notify,
+};
+
+static __init int blk_iopoll_setup(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+ open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
+ register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
+ return 0;
+}
+subsys_initcall(blk_iopoll_setup);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e1999679a4d5..b0de8574fdc8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
return 1;
}
+/**
+ * blk_rq_set_mixed_merge - mark a request as mixed merge
+ * @rq: request to mark as mixed merge
+ *
+ * Description:
+ * @rq is about to be mixed merged. Make sure the attributes
+ * which can be mixed are set in each bio and mark @rq as mixed
+ * merged.
+ */
+void blk_rq_set_mixed_merge(struct request *rq)
+{
+ unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+ struct bio *bio;
+
+ if (rq->cmd_flags & REQ_MIXED_MERGE)
+ return;
+
+ /*
+ * @rq will no longer represent mixable attributes for all the
+ * contained bios. It will just track those of the first one.
+ * Distributes the attributs to each bio.
+ */
+ for (bio = rq->bio; bio; bio = bio->bi_next) {
+ WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
+ (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
+ bio->bi_rw |= ff;
+ }
+ rq->cmd_flags |= REQ_MIXED_MERGE;
+}
+
static void blk_account_io_merge(struct request *req)
{
if (blk_do_io_stat(req)) {
@@ -350,12 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (blk_integrity_rq(req) != blk_integrity_rq(next))
return 0;
- /* don't merge requests of different failfast settings */
- if (blk_failfast_dev(req) != blk_failfast_dev(next) ||
- blk_failfast_transport(req) != blk_failfast_transport(next) ||
- blk_failfast_driver(req) != blk_failfast_driver(next))
- return 0;
-
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
@@ -366,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
return 0;
/*
+ * If failfast settings disagree or any of the two is already
+ * a mixed merge, mark both as mixed before proceeding. This
+ * makes sure that all involved bios have mixable attributes
+ * set properly.
+ */
+ if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
+ (req->cmd_flags & REQ_FAILFAST_MASK) !=
+ (next->cmd_flags & REQ_FAILFAST_MASK)) {
+ blk_rq_set_mixed_merge(req);
+ blk_rq_set_mixed_merge(next);
+ }
+
+ /*
* At this point we have either done a back merge
* or front merge. We need the smaller start_time of
* the merged requests to be the current request
diff --git a/block/blk.h b/block/blk.h
index 3fae6add5430..5ee3d7e72feb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
void blk_recalc_rq_segments(struct request *rq);
+void blk_rq_set_mixed_merge(struct request *rq);
void blk_queue_congestion_threshold(struct request_queue *q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080ed7935..f75a54bfc547 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -134,13 +134,8 @@ struct cfq_data {
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
- /*
- * Used to track any pending rt requests so we can pre-empt current
- * non-RT cfqq in service when this value is non-zero.
- */
- unsigned int busy_rt_queues;
- int rq_in_driver;
+ int rq_in_driver[2];
int sync_flight;
/*
@@ -191,7 +186,6 @@ enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
- CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
@@ -218,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
-CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
@@ -239,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
+static inline int rq_in_driver(struct cfq_data *cfqd)
+{
+ return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
+}
+
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
int is_sync)
{
@@ -257,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
*/
static inline int cfq_bio_sync(struct bio *bio)
{
- if (bio_data_dir(bio) == READ || bio_sync(bio))
+ if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
return 1;
return 0;
@@ -648,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
- if (cfq_class_rt(cfqq))
- cfqd->busy_rt_queues++;
cfq_resort_rr_list(cfqd, cfqq);
}
@@ -673,8 +669,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
- if (cfq_class_rt(cfqq))
- cfqd->busy_rt_queues--;
}
/*
@@ -760,9 +754,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- cfqd->rq_in_driver++;
+ cfqd->rq_in_driver[rq_is_sync(rq)]++;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
- cfqd->rq_in_driver);
+ rq_in_driver(cfqd));
cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}
@@ -770,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
+ const int sync = rq_is_sync(rq);
- WARN_ON(!cfqd->rq_in_driver);
- cfqd->rq_in_driver--;
+ WARN_ON(!cfqd->rq_in_driver[sync]);
+ cfqd->rq_in_driver[sync]--;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
- cfqd->rq_in_driver);
+ rq_in_driver(cfqd));
}
static void cfq_remove_request(struct request *rq)
@@ -1080,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
/*
* still requests with the driver, don't idle
*/
- if (cfqd->rq_in_driver)
+ if (rq_in_driver(cfqd))
return;
/*
@@ -1179,20 +1174,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
goto expire;
/*
- * If we have a RT cfqq waiting, then we pre-empt the current non-rt
- * cfqq.
- */
- if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
- /*
- * We simulate this as cfqq timed out so that it gets to bank
- * the remaining of its time slice.
- */
- cfq_log_cfqq(cfqd, cfqq, "preempt");
- cfq_slice_expired(cfqd, 1);
- goto new_queue;
- }
-
- /*
* The active queue has requests and isn't expired, allow it to
* dispatch.
*/
@@ -1312,6 +1293,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
return 0;
/*
+ * Drain async requests before we start sync IO
+ */
+ if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ return 0;
+
+ /*
* If this is an async queue and we have sync IO in flight, let it wait
*/
if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
@@ -2130,11 +2117,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
*/
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
- if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
- cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+ if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+ cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
- cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+ rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
return;
if (cfqd->hw_tag_samples++ < 50)
@@ -2161,9 +2148,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfq_update_hw_tag(cfqd);
- WARN_ON(!cfqd->rq_in_driver);
+ WARN_ON(!cfqd->rq_in_driver[sync]);
WARN_ON(!cfqq->dispatched);
- cfqd->rq_in_driver--;
+ cfqd->rq_in_driver[sync]--;
cfqq->dispatched--;
if (cfq_cfqq_sync(cfqq))
@@ -2197,7 +2184,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfq_arm_slice_timer(cfqd);
}
- if (!cfqd->rq_in_driver)
+ if (!rq_in_driver(cfqd))
cfq_schedule_dispatch(cfqd);
}
@@ -2229,8 +2216,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
- if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
- !cfq_cfqq_must_alloc_slice(cfqq)) {
+ if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
cfq_mark_cfqq_must_alloc_slice(cfqq);
return ELV_MQUEUE_MUST;
}
@@ -2317,7 +2303,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
}
cfqq->allocated[rw]++;
- cfq_clear_cfqq_must_alloc(cfqq);
atomic_inc(&cfqq->ref);
spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/elevator.c b/block/elevator.c
index 2d511f9105e1..51bb66236ebb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
/*
* Don't merge file system requests and discard requests
*/
- if (bio_discard(bio) != bio_discard(rq->bio))
+ if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
+ bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
return 0;
/*
@@ -101,16 +102,11 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
return 0;
/*
- * Don't merge if failfast settings don't match.
- *
- * FIXME: The negation in front of each condition is necessary
- * because bio and request flags use different bit positions
- * and the accessors return those bits directly. This
- * ugliness will soon go away.
+ * Don't merge if failfast settings don't match. Just check the
+ * first four bits, they have identical mappings in the bio->bi_rw
+ * and rq->cmd_flags bits.
*/
- if (!bio_failfast_dev(bio) != !blk_failfast_dev(rq) ||
- !bio_failfast_transport(bio) != !blk_failfast_transport(rq) ||
- !bio_failfast_driver(bio) != !blk_failfast_driver(rq))
+ if ((bio->bi_rw & BIO_RW_RQ_MASK) != (rq->cmd_flags & BIO_RW_RQ_MASK))
return 0;
if (!elv_iosched_allow_merge(rq, bio))
diff --git a/block/genhd.c b/block/genhd.c
index f4c64c2b303a..b89328eceee2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1215,6 +1215,16 @@ void put_disk(struct gendisk *disk)
EXPORT_SYMBOL(put_disk);
+static void set_disk_ro_uevent(struct gendisk *gd, int ro)
+{
+ char event[] = "DISK_RO=1";
+ char *envp[] = { event, NULL };
+
+ if (!ro)
+ event[8] = '0';
+ kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
+}
+
void set_device_ro(struct block_device *bdev, int flag)
{
bdev->bd_part->policy = flag;
@@ -1227,8 +1237,12 @@ void set_disk_ro(struct gendisk *disk, int flag)
struct disk_part_iter piter;
struct hd_struct *part;
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0);
+ if (disk->part0.policy != flag) {
+ set_disk_ro_uevent(disk, flag);
+ disk->part0.policy = flag;
+ }
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
part->policy = flag;
disk_part_iter_exit(&piter);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 2307a271bdc9..0efb8fccb619 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -265,6 +265,7 @@ aoeblk_gdalloc(void *vp)
}
blk_queue_make_request(&d->blkq, aoeblk_make_request);
+ d->blkq.backing_dev_info.name = "aoe";
if (bdi_init(&d->blkq.backing_dev_info))
goto err_mempool;
spin_lock_irqsave(&d->lock, flags);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5757188cd1fb..bbb79441d895 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -475,7 +475,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
if (bio_rw(bio) == WRITE) {
- int barrier = bio_barrier(bio);
+ bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
struct file *file = lo->lo_backing_file;
if (barrier) {
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 13c1aee6aa3f..28f1f25f0f63 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -442,7 +442,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
* sleep when allocating a lower-request and therefore cannot be
* bouncing.
*/
- blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
+ blk_stack_limits(&q->limits, &osd_request_queue(osdev->osd)->limits, 0);
blk_queue_prep_rq(q, blk_queue_start_tag);
blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index afa8813e737a..645237bda682 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -822,6 +822,7 @@ static const struct file_operations zero_fops = {
* - permits private mappings, "copies" are taken of the source of zeros
*/
static struct backing_dev_info zero_bdi = {
+ .name = "char/mem",
.capabilities = BDI_CAP_MAP_COPY,
};
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9726577cde49..76811fd94e9e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1123,7 +1123,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
if (error == -EOPNOTSUPP)
goto out;
- if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+ if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
goto out;
if (unlikely(error)) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4e0e5937e42a..5aa30d1b2d6e 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
if (!error)
return 0; /* I/O complete */
- if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+ if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
return error;
if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a311ea0d441..82350f590d98 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error)
*/
spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md)) {
- if (!bio_barrier(io->bio))
+ if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
bio_list_add_head(&md->deferred,
io->bio);
} else
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error)
io_error = io->error;
bio = io->bio;
- if (bio_barrier(bio)) {
+ if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
/*
* There can be just one barrier request so we use
* a per-device variable for error reporting.
@@ -1204,7 +1204,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.map = dm_get_table(md);
if (unlikely(!ci.map)) {
- if (!bio_barrier(bio))
+ if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
bio_io_error(bio);
else
if (!md->barrier_error)
@@ -1316,7 +1316,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
* we have to queue this io for later.
*/
if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
- unlikely(bio_barrier(bio))) {
+ unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
up_read(&md->io_lock);
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1339,7 +1339,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
{
struct mapped_device *md = q->queuedata;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
@@ -2159,7 +2159,7 @@ static void dm_wq_work(struct work_struct *work)
if (dm_request_based(md))
generic_make_request(c);
else {
- if (bio_barrier(c))
+ if (bio_rw_flagged(c, BIO_RW_BARRIER))
process_barrier(md, c);
else
__split_and_process_bio(md, c);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5fe39c2a3d2b..ea4842905444 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,7 +288,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
sector_t start_sector;
int cpu;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7140909f6662..89e76819f61f 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error)
if (uptodate)
multipath_end_bh_io(mp_bh, 0);
- else if (!bio_rw_ahead(bio)) {
+ else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
/*
* oops, IO error:
*/
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
const int rw = bio_data_dir(bio);
int cpu;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 898e2bdfee47..f845ed98fec9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -448,7 +448,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
const int rw = bio_data_dir(bio);
int cpu;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8726fd7ebce5..ff7ed3335995 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -782,8 +782,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
struct bio_list bl;
struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
- const int do_sync = bio_sync(bio);
- int cpu, do_barriers;
+ const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+ int cpu;
+ bool do_barriers;
mdk_rdev_t *blocked_rdev;
/*
@@ -797,7 +798,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
md_write_start(mddev, bio); /* wait on superblock update early */
- if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+ if (unlikely(!mddev->barriers_work &&
+ bio_rw_flagged(bio, BIO_RW_BARRIER))) {
if (rw == WRITE)
md_write_end(mddev);
bio_endio(bio, -EOPNOTSUPP);
@@ -925,7 +927,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
atomic_set(&r1_bio->remaining, 0);
atomic_set(&r1_bio->behind_remaining, 0);
- do_barriers = bio_barrier(bio);
+ do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
if (do_barriers)
set_bit(R1BIO_Barrier, &r1_bio->state);
@@ -1600,7 +1602,7 @@ static void raid1d(mddev_t *mddev)
* We already have a nr_pending reference on these rdevs.
*/
int i;
- const int do_sync = bio_sync(r1_bio->master_bio);
+ const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
clear_bit(R1BIO_Barrier, &r1_bio->state);
for (i=0; i < conf->raid_disks; i++)
@@ -1654,7 +1656,7 @@ static void raid1d(mddev_t *mddev)
(unsigned long long)r1_bio->sector);
raid_end_bio_io(r1_bio);
} else {
- const int do_sync = bio_sync(r1_bio->master_bio);
+ const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
r1_bio->bios[r1_bio->read_disk] =
mddev->ro ? IO_BLOCKED : NULL;
r1_bio->read_disk = disk;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3d9020cf6f6e..d0a2152e064f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -796,12 +796,12 @@ static int make_request(struct request_queue *q, struct bio * bio)
int i;
int chunk_sects = conf->chunk_mask + 1;
const int rw = bio_data_dir(bio);
- const int do_sync = bio_sync(bio);
+ const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
struct bio_list bl;
unsigned long flags;
mdk_rdev_t *blocked_rdev;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
@@ -1610,7 +1610,7 @@ static void raid10d(mddev_t *mddev)
raid_end_bio_io(r10_bio);
bio_put(bio);
} else {
- const int do_sync = bio_sync(r10_bio->master_bio);
+ const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
bio_put(bio);
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2b521ee67dfa..3233b98bdd9d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3606,7 +3606,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
const int rw = bio_data_dir(bi);
int cpu, remaining;
- if (unlikely(bio_barrier(bi))) {
+ if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
bio_endio(bi, -EOPNOTSUPP);
return 0;
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 662024d86949..5987da857103 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -898,8 +898,10 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
scsi_print_sense("", cmd);
scsi_print_command(cmd);
}
- blk_end_request_all(req, -EIO);
- scsi_next_command(cmd);
+ if (blk_end_request_err(req, -EIO))
+ scsi_requeue_command(q, cmd);
+ else
+ scsi_next_command(cmd);
break;
case ACTION_REPREP:
/* Unprep the request and put it back at the head of the queue.
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
index 84724187ec3e..ac8577358ba0 100644
--- a/drivers/staging/dst/dcore.c
+++ b/drivers/staging/dst/dcore.c
@@ -112,8 +112,9 @@ static int dst_request(struct request_queue *q, struct bio *bio)
* I worked with.
*
* Empty barriers are not allowed anyway, see 51fd77bd9f512
- * for example, although later it was changed to bio_discard()
- * only, which does not work in this case.
+ * for example, although later it was changed to
+ * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not
+ * work in this case.
*/
//err = -EOPNOTSUPP;
err = 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..15831d5c7367 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
+ bdi->name = "btrfs";
bdi->capabilities = BDI_CAP_MAP_COPY;
err = bdi_init(bdi);
if (err)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
num_run++;
batch_run++;
- if (bio_sync(cur))
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..2a01b2bc27ba 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_pdflush(1024);
+ wakeup_flusher_threads(1024);
yield();
for_each_online_node(nid) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..7c27a8ebef6a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
* - no readahead or I/O queue unplugging required
*/
struct backing_dev_info directly_mappable_cdev_bdi = {
+ .name = "char",
.capabilities = (
#ifdef CONFIG_MMU
/* permit private copies of the data to be taken */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
};
static struct backing_dev_info configfs_backing_dev_info = {
+ .name = "configfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..dfb4767d7ca4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,49 +19,507 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue. Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own. Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+ struct super_block *sb,
+ struct writeback_control *wbc);
+
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
*/
-static int writeback_acquire(struct backing_dev_info *bdi)
+int nr_pdflush_threads;
+
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+ struct super_block *sb,
+ struct writeback_control *wbc);
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+ struct list_head list;
+ struct list_head wait_list;
+ struct rcu_head rcu_head;
+
+ unsigned long seen;
+ atomic_t pending;
+
+ unsigned long sb_data;
+ unsigned long nr_pages;
+ enum writeback_sync_modes sync_mode;
+
+ unsigned long state;
+};
+
+static struct super_block *bdi_work_sb(struct bdi_work *work)
+{
+ return (struct super_block *) (work->sb_data & ~1UL);
+}
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
+{
+ return work->sb_data & 1UL;
+}
+
+static inline void bdi_work_init(struct bdi_work *work, struct super_block *sb,
+ unsigned long nr_pages,
+ enum writeback_sync_modes sync_mode)
{
- return !test_and_set_bit(BDI_pdflush, &bdi->state);
+ INIT_RCU_HEAD(&work->rcu_head);
+ work->sb_data = (unsigned long) sb;
+ work->nr_pages = nr_pages;
+ work->sync_mode = sync_mode;
+ work->state = 1;
+}
+
+static inline void bdi_work_init_on_stack(struct bdi_work *work,
+ struct super_block *sb,
+ unsigned long nr_pages,
+ enum writeback_sync_modes sync_mode)
+{
+ bdi_work_init(work, sb, nr_pages, sync_mode);
+ work->sb_data |= 1UL;
}
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
*
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
*/
int writeback_in_progress(struct backing_dev_info *bdi)
{
- return test_bit(BDI_pdflush, &bdi->state);
+ return !list_empty(&bdi->work_list);
}
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
+static void bdi_work_clear(struct bdi_work *work)
+{
+ clear_bit(0, &work->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&work->state, 0);
+}
+
+static void bdi_work_free(struct rcu_head *head)
+{
+ struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
+
+ if (!bdi_work_on_stack(work))
+ kfree(work);
+ else
+ bdi_work_clear(work);
+}
+
+static void wb_work_complete(struct bdi_work *work)
+{
+ const enum writeback_sync_modes sync_mode = work->sync_mode;
+
+ /*
+ * For allocated work, we can clear the done/seen bit right here.
+ * For on-stack work, we need to postpone both the clear and free
+ * to after the RCU grace period, since the stack could be invalidated
+ * as soon as bdi_work_clear() has done the wakeup.
+ */
+ if (!bdi_work_on_stack(work))
+ bdi_work_clear(work);
+ if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+ call_rcu(&work->rcu_head, bdi_work_free);
+}
+
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
+ /*
+ * The caller has retrieved the work arguments from this work,
+ * drop our reference. If this is the last ref, delete and free it
+ */
+ if (atomic_dec_and_test(&work->pending)) {
+ struct backing_dev_info *bdi = wb->bdi;
+
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&work->list);
+ spin_unlock(&bdi->wb_lock);
+
+ wb_work_complete(work);
+ }
+}
+
+static void wb_start_writeback(struct bdi_writeback *wb, struct bdi_work *work)
+{
+ /*
+ * If we failed allocating the bdi work item, wake up the wb thread
+ * always. As a safety precaution, it'll flush out everything
+ */
+ if (!wb_has_dirty_io(wb) && work)
+ wb_clear_pending(wb, work);
+ else if (wb->task)
+ wake_up_process(wb->task);
+}
+
+static void bdi_sched_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+ if (!bdi_wblist_needs_lock(bdi))
+ wb_start_writeback(&bdi->wb, work);
+ else {
+ struct bdi_writeback *wb;
+ int idx;
+
+ idx = srcu_read_lock(&bdi->srcu);
+
+ list_for_each_entry_rcu(wb, &bdi->wb_list, list)
+ wb_start_writeback(wb, work);
+
+ srcu_read_unlock(&bdi->srcu, idx);
+ }
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+ if (work) {
+ work->seen = bdi->wb_mask;
+ BUG_ON(!work->seen);
+ atomic_set(&work->pending, bdi->wb_cnt);
+ BUG_ON(!bdi->wb_cnt);
+
+ /*
+ * Make sure stores are seen before it appears on the list
+ */
+ smp_mb();
+
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&work->list, &bdi->work_list);
+ spin_unlock(&bdi->wb_lock);
+ }
+
+ /*
+ * If the default thread isn't there, make sure we add it. When
+ * it gets created and wakes up, we'll run this work.
+ */
+ if (unlikely(list_empty_careful(&bdi->wb_list)))
+ wake_up_process(default_backing_dev_info.wb.task);
+ else
+ bdi_sched_work(bdi, work);
+}
+
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+ wait_on_bit(&work->state, 0, bdi_sched_wait, TASK_UNINTERRUPTIBLE);
+}
+
+static struct bdi_work *bdi_alloc_work(struct super_block *sb, long nr_pages,
+ enum writeback_sync_modes sync_mode)
+{
+ struct bdi_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work)
+ bdi_work_init(work, sb, nr_pages, sync_mode);
+
+ return work;
+}
+
+void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+ long nr_pages, enum writeback_sync_modes sync_mode)
+{
+ const bool must_wait = sync_mode == WB_SYNC_ALL;
+ struct bdi_work work_stack, *work = NULL;
+
+ if (!must_wait)
+ work = bdi_alloc_work(sb, nr_pages, sync_mode);
+
+ if (!work) {
+ work = &work_stack;
+ bdi_work_init_on_stack(work, sb, nr_pages, sync_mode);
+ }
+
+ bdi_queue_work(bdi, work);
+
+ /*
+ * If the sync mode is WB_SYNC_ALL, block waiting for the work to
+ * complete. If not, we only need to wait for the work to be started,
+ * if we allocated it on-stack. We use the same mechanism, if the
+ * wait bit is set in the bdi_work struct, then threads will not
+ * clear pending until after they are done.
+ *
+ * Note that work == &work_stack if must_wait is true, so we don't
+ * need to do call_rcu() here ever, since the completion path will
+ * have done that for us.
+ */
+ if (must_wait || work == &work_stack) {
+ bdi_wait_on_work_clear(work);
+ if (work != &work_stack)
+ call_rcu(&work->rcu_head, bdi_work_free);
+ }
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
+static inline bool over_bground_thresh(void)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+ return (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
+ struct super_block *sb,
+ enum writeback_sync_modes sync_mode, int for_kupdate)
+{
+ struct writeback_control wbc = {
+ .bdi = wb->bdi,
+ .sync_mode = sync_mode,
+ .older_than_this = NULL,
+ .for_kupdate = for_kupdate,
+ .range_cyclic = 1,
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
+
+ if (wbc.for_kupdate) {
+ wbc.older_than_this = &oldest_jif;
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ }
+
+ for (;;) {
+ if (sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
+ !over_bground_thresh())
+ break;
+
+ wbc.more_io = 0;
+ wbc.encountered_congestion = 0;
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.pages_skipped = 0;
+ generic_sync_wb_inodes(wb, sb, &wbc);
+ nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ /*
+ * If we ran out of stuff to write, bail unless more_io got set
+ */
+ if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+ if (wbc.more_io && !wbc.for_kupdate)
+ continue;
+ break;
+ }
+ }
+
+ return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct bdi_work *work, *ret = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(work, &bdi->work_list, list) {
+ if (!test_and_clear_bit(wb->nr, &work->seen))
+ continue;
+
+ ret = work;
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+ struct backing_dev_info *bdi = wb->bdi;
+ struct bdi_work *work;
+ long nr_pages, wrote = 0;
+
+ while ((work = get_next_work_item(bdi, wb)) != NULL) {
+ struct super_block *sb = bdi_work_sb(work);
+ enum writeback_sync_modes sync_mode;
+
+ nr_pages = work->nr_pages;
+
+ /*
+ * Override sync mode, in case we must wait for completion
+ */
+ if (force_wait)
+ work->sync_mode = sync_mode = WB_SYNC_ALL;
+ else
+ sync_mode = work->sync_mode;
+
+ /*
+ * If this isn't a data integrity operation, just notify
+ * that we have seen this work and we are now starting it.
+ */
+ if (sync_mode == WB_SYNC_NONE)
+ wb_clear_pending(wb, work);
+
+ wrote += wb_writeback(wb, nr_pages, sb, sync_mode, 0);
+
+ /*
+ * This is a data integrity writeback, so only do the
+ * notification when we have completed the work.
+ */
+ if (sync_mode == WB_SYNC_ALL)
+ wb_clear_pending(wb, work);
+ }
+
+ /*
+ * Check for periodic writeback, kupdated() style
+ */
+ if (!wrote) {
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ wrote = wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+ }
+
+ return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
*/
-static void writeback_release(struct backing_dev_info *bdi)
+int bdi_writeback_task(struct bdi_writeback *wb)
{
- BUG_ON(!writeback_in_progress(bdi));
- clear_bit(BDI_pdflush, &bdi->state);
+ unsigned long last_active = jiffies;
+ unsigned long wait_jiffies = -1UL;
+ long pages_written;
+
+ while (!kthread_should_stop()) {
+ pages_written = wb_do_writeback(wb, 0);
+
+ if (pages_written)
+ last_active = jiffies;
+ else if (wait_jiffies != -1UL) {
+ unsigned long max_idle;
+
+ /*
+ * Longest period of inactivity that we tolerate. If we
+ * see dirty data again later, the task will get
+ * recreated automatically.
+ */
+ max_idle = max(5UL * 60 * HZ, wait_jiffies);
+ if (time_after(jiffies, max_idle + last_active) &&
+ wb_is_default_task(wb))
+ break;
+ }
+
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(wait_jiffies);
+ try_to_freeze();
+ }
+
+ return 0;
+}
+
+/*
+ * Schedule writeback for all backing devices. Expensive! If this is a data
+ * integrity operation, writeback will be complete when this returns. If
+ * we are simply called for WB_SYNC_NONE, then writeback will merely be
+ * scheduled to run.
+ */
+void bdi_writeback_all(struct super_block *sb, struct writeback_control *wbc)
+{
+ const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+ struct backing_dev_info *bdi;
+ struct bdi_work *work;
+ LIST_HEAD(list);
+
+restart:
+ spin_lock(&bdi_lock);
+
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ struct bdi_work *work;
+
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+
+ /*
+ * If work allocation fails, do the writes inline. We drop
+ * the lock and restart the list writeout. This should be OK,
+ * since this happens rarely and because the writeout should
+ * eventually make more free memory available.
+ */
+ work = bdi_alloc_work(sb, wbc->nr_to_write, wbc->sync_mode);
+ if (!work) {
+ struct writeback_control __wbc = *wbc;
+
+ /*
+ * Not a data integrity writeout, just continue
+ */
+ if (!must_wait)
+ continue;
+
+ spin_unlock(&bdi_lock);
+ __wbc = *wbc;
+ __wbc.bdi = bdi;
+ generic_sync_bdi_inodes(sb, &__wbc);
+ goto restart;
+ }
+ if (must_wait)
+ list_add_tail(&work->wait_list, &list);
+
+ bdi_queue_work(bdi, work);
+ }
+
+ spin_unlock(&bdi_lock);
+
+ /*
+ * If this is for WB_SYNC_ALL, wait for pending work to complete
+ * before returning.
+ */
+ while (!list_empty(&list)) {
+ work = list_entry(list.next, struct bdi_work, wait_list);
+ list_del(&work->wait_list);
+ bdi_wait_on_work_clear(work);
+ call_rcu(&work->rcu_head, bdi_work_free);
+ }
}
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -86,6 +544,21 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
}
}
+/*
+ * If the filesystem didn't provide a way to map an inode to a dedicated
+ * flusher thread, it doesn't support more than 1 thread. So we know it's
+ * the default thread, return that.
+ */
+static inline struct bdi_writeback *inode_get_wb(struct inode *inode)
+{
+ const struct super_operations *sop = inode->i_sb->s_op;
+
+ if (!sop->inode_get_wb)
+ return &inode_to_bdi(inode)->wb;
+
+ return sop->inode_get_wb(inode);
+}
+
/**
* __mark_inode_dirty - internal function
* @inode: inode to mark
@@ -165,12 +638,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
goto out;
/*
- * If the inode was already on s_dirty/s_io/s_more_io, don't
- * reposition it (that would break s_dirty time-ordering).
+ * If the inode was already on b_dirty/b_io/b_more_io, don't
+ * reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
+ struct bdi_writeback *wb = inode_get_wb(inode);
+ struct backing_dev_info *bdi = wb->bdi;
+
+ if (bdi_cap_writeback_dirty(bdi) &&
+ !test_bit(BDI_registered, &bdi->state)) {
+ WARN_ON(1);
+ printk("bdi-%s not registered\n", bdi->name);
+ }
+
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &wb->b_dirty);
}
}
out:
@@ -191,31 +673,32 @@ static int write_inode(struct inode *inode, int sync)
* furthest end of its superblock's dirty-inode list.
*
* Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct bdi_writeback *wb = inode_get_wb(inode);
- if (!list_empty(&sb->s_dirty)) {
- struct inode *tail_inode;
+ if (!list_empty(&wb->b_dirty)) {
+ struct inode *tail;
- tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
- if (time_before(inode->dirtied_when,
- tail_inode->dirtied_when))
+ tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &wb->b_dirty);
}
/*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode)
{
- list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ struct bdi_writeback *wb = inode_get_wb(inode);
+
+ list_move(&inode->i_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -262,21 +745,12 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
* Queue all expired dirty inodes for io, eldest first.
*/
-static void queue_io(struct super_block *sb,
- unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- list_splice_init(&sb->s_more_io, sb->s_io.prev);
- move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+ list_splice_init(&wb->b_more_io, wb->b_io.prev);
+ move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
-int sb_has_dirty_inodes(struct super_block *sb)
-{
- return !list_empty(&sb->s_dirty) ||
- !list_empty(&sb->s_io) ||
- !list_empty(&sb->s_more_io);
-}
-EXPORT_SYMBOL(sb_has_dirty_inodes);
-
/*
* Wait for writeback on an inode to complete.
*/
@@ -322,11 +796,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_state & I_SYNC) {
/*
* If this inode is locked for writeback and we are not doing
- * writeback-for-data-integrity, move it to s_more_io so that
+ * writeback-for-data-integrity, move it to b_more_io so that
* writeback can proceed with the other inodes on s_io.
*
* We'll have another go at writing back this inode when we
- * completed a full scan of s_io.
+ * completed a full scan of b_io.
*/
if (!wait) {
requeue_io(inode);
@@ -371,11 +845,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
- * the inode; Move it from s_io onto s_more_io/s_dirty.
+ * the inode; Move it from b_io onto b_more_io/b_dirty.
*/
/*
* akpm: if the caller was the kupdate function we put
- * this inode at the head of s_dirty so it gets first
+ * this inode at the head of b_dirty so it gets first
* consideration. Otherwise, move it to the tail, for
* the reasons described there. I'm not really sure
* how much sense this makes. Presumably I had a good
@@ -385,7 +859,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->for_kupdate) {
/*
* For the kupdate function we move the inode
- * to s_more_io so it will get more writeout as
+ * to b_more_io so it will get more writeout as
* soon as the queue becomes uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +907,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
-/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
-void generic_sync_sb_inodes(struct super_block *sb,
- struct writeback_control *wbc)
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+ struct super_block *sb,
+ struct writeback_control *wbc)
{
+ const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
- int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&sb->s_io))
- queue_io(sb, wbc->older_than_this);
- while (!list_empty(&sb->s_io)) {
- struct inode *inode = list_entry(sb->s_io.prev,
+ if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ queue_io(wb, wbc->older_than_this);
+
+ while (!list_empty(&wb->b_io)) {
+ struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
- struct address_space *mapping = inode->i_mapping;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
- if (!bdi_cap_writeback_dirty(bdi)) {
+ /*
+ * super block given and doesn't match, skip this inode
+ */
+ if (sb && sb != inode->i_sb) {
redirty_tail(inode);
- if (sb_is_blkdev_sb(sb)) {
+ continue;
+ }
+
+ if (!bdi_cap_writeback_dirty(wb->bdi)) {
+ redirty_tail(inode);
+ if (is_blkdev_sb) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
@@ -497,21 +954,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
continue;
}
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
wbc->encountered_congestion = 1;
- if (!sb_is_blkdev_sb(sb))
+ if (!is_blkdev_sb)
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
- if (wbc->bdi && bdi != wbc->bdi) {
- if (!sb_is_blkdev_sb(sb))
- break; /* fs has the wrong queue */
- requeue_io(inode);
- continue; /* blockdev has wrong queue */
- }
-
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -519,16 +969,10 @@ void generic_sync_sb_inodes(struct super_block *sb,
if (inode_dirtied_after(inode, start))
break;
- /* Is another pdflush already flushing this queue? */
- if (current_is_pdflush() && !writeback_acquire(bdi))
- break;
-
BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- if (current_is_pdflush())
- writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -544,13 +988,71 @@ void generic_sync_sb_inodes(struct super_block *sb,
wbc->more_io = 1;
break;
}
- if (!list_empty(&sb->s_more_io))
+ if (!list_empty(&wb->b_more_io))
wbc->more_io = 1;
}
- if (sync) {
+ spin_unlock(&inode_lock);
+ /* Leave any unwritten inodes on b_io */
+}
+
+void generic_sync_bdi_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = wbc->bdi;
+ struct bdi_writeback *wb;
+
+ /*
+ * Common case is just a single wb thread and that is embedded in
+ * the bdi, so it doesn't need locking
+ */
+ if (!bdi_wblist_needs_lock(bdi))
+ generic_sync_wb_inodes(&bdi->wb, sb, wbc);
+ else {
+ int idx;
+
+ idx = srcu_read_lock(&bdi->srcu);
+
+ list_for_each_entry_rcu(wb, &bdi->wb_list, list)
+ generic_sync_wb_inodes(wb, sb, wbc);
+
+ srcu_read_unlock(&bdi->srcu, idx);
+ }
+}
+
+/*
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+void generic_sync_sb_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
+{
+ if (wbc->bdi)
+ bdi_start_writeback(wbc->bdi, sb, wbc->nr_to_write, wbc->sync_mode);
+ else
+ bdi_writeback_all(sb, wbc);
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
struct inode *inode, *old_inode = NULL;
+ spin_lock(&inode_lock);
+
/*
* Data integrity sync. Must wait for all pages under writeback,
* because there may have been pages dirtied before our sync
@@ -588,10 +1090,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
}
spin_unlock(&inode_lock);
iput(old_inode);
- } else
- spin_unlock(&inode_lock);
+ }
- return; /* Leave any unwritten inodes on s_io */
}
EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -602,58 +1102,6 @@ static void sync_sb_inodes(struct super_block *sb,
}
/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
- *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
- *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
- *
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
- */
-void
-writeback_inodes(struct writeback_control *wbc)
-{
- struct super_block *sb;
-
- might_sleep();
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry_reverse(sb, &super_blocks, s_list) {
- if (sb_has_dirty_inodes(sb)) {
- /* we're making our own get_super here */
- sb->s_count++;
- spin_unlock(&sb_lock);
- /*
- * If we can't get the readlock, there's no sense in
- * waiting around, most of the time the FS is going to
- * be unmounted by the time it is released.
- */
- if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
- sync_sb_inodes(sb, wbc);
- up_read(&sb->s_umount);
- }
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- if (wbc->nr_to_write <= 0)
- break;
- }
- spin_unlock(&sb_lock);
-}
-
-/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait.
*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9aa6f46d0c32..a9a9cfff30ca 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -815,6 +815,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ fc->bdi.name = "fuse";
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
/* fuse does it's own writeback accounting */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10b..2d8abaf3b8a8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
static struct backing_dev_info hugetlbfs_backing_dev_info = {
+ .name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d36925f9b952..e350bd6a2334 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -882,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
if (server->wsize > max_rpc_payload)
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
}
static struct backing_dev_info dlmfs_backing_dev_info = {
+ .name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
static struct backing_dev_info ramfs_backing_dev_info = {
+ .name = "ramfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..0d22ce3be4aa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
s = NULL;
goto out;
}
- INIT_LIST_HEAD(&s->s_dirty);
- INIT_LIST_HEAD(&s->s_io);
- INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..bf03fc75b392 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -118,7 +118,7 @@ restart:
*/
SYSCALL_DEFINE0(sync)
{
- wakeup_pdflush(0);
+ wakeup_flusher_threads(0);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e57f98e54fce 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -29,6 +29,7 @@ static const struct address_space_operations sysfs_aops = {
};
static struct backing_dev_info sysfs_backing_dev_info = {
+ .name = "sysfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b541bd75bd1f..22b2db3cad23 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1950,6 +1950,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
*
* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
*/
+ c->bdi.name = "ubifs",
c->bdi.capabilities = BDI_CAP_MAP_COPY;
c->bdi.unplug_io_fn = default_unplug_io_fn;
err = bdi_init(&c->bdi);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 1d52425a6118..47ed0f22d020 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,9 @@
#include <linux/proportions.h>
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/srcu.h>
+#include <linux/writeback.h>
#include <asm/atomic.h>
struct page;
@@ -23,9 +26,12 @@ struct dentry;
* Bits in backing_dev_info.state
*/
enum bdi_state {
- BDI_pdflush, /* A pdflush thread is working this device */
+ BDI_pending, /* On its way to being activated */
+ BDI_wb_alloc, /* Default embedded wb allocated */
+ BDI_wblist_lock, /* bdi->wb_list now needs locking */
BDI_async_congested, /* The async (write) queue is getting full */
BDI_sync_congested, /* The sync queue is getting full */
+ BDI_registered, /* bdi_register() was done */
BDI_unused, /* Available bits start here */
};
@@ -39,7 +45,23 @@ enum bdi_stat_item {
#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+struct bdi_writeback {
+ struct list_head list; /* hangs off the bdi */
+
+ struct backing_dev_info *bdi; /* our parent bdi */
+ unsigned int nr;
+
+ struct task_struct *task; /* writeback task */
+ struct list_head b_dirty; /* dirty inodes */
+ struct list_head b_io; /* parked for writeback */
+ struct list_head b_more_io; /* parked for more writeback */
+};
+
+#define BDI_MAX_FLUSHERS 32
+
struct backing_dev_info {
+ struct srcu_struct srcu; /* for wb_list read side protection */
+ struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
@@ -48,6 +70,8 @@ struct backing_dev_info {
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
+ char *name;
+
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
struct prop_local_percpu completions;
@@ -56,6 +80,14 @@ struct backing_dev_info {
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
+ struct bdi_writeback wb; /* default writeback info for this bdi */
+ spinlock_t wb_lock; /* protects update side of wb_list */
+ struct list_head wb_list; /* the flusher threads hanging off this bdi */
+ unsigned long wb_mask; /* bitmask of registered tasks */
+ unsigned int wb_cnt; /* number of registered tasks */
+
+ struct list_head work_list;
+
struct device *dev;
#ifdef CONFIG_DEBUG_FS
@@ -71,6 +103,32 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
void bdi_unregister(struct backing_dev_info *bdi);
+void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+ long nr_pages, enum writeback_sync_modes sync_mode);
+int bdi_writeback_task(struct bdi_writeback *wb);
+void bdi_writeback_all(struct super_block *sb, struct writeback_control *wbc);
+void bdi_add_flusher_task(struct backing_dev_info *bdi);
+int bdi_has_dirty_io(struct backing_dev_info *bdi);
+
+extern spinlock_t bdi_lock;
+extern struct list_head bdi_list;
+
+static inline int wb_is_default_task(struct bdi_writeback *wb)
+{
+ return wb == &wb->bdi->wb;
+}
+
+static inline int bdi_wblist_needs_lock(struct backing_dev_info *bdi)
+{
+ return test_bit(BDI_wblist_lock, &bdi->state);
+}
+
+static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+{
+ return !list_empty(&wb->b_dirty) ||
+ !list_empty(&wb->b_io) ||
+ !list_empty(&wb->b_more_io);
+}
static inline void __add_bdi_stat(struct backing_dev_info *bdi,
enum bdi_stat_item item, s64 amount)
@@ -261,6 +319,11 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
return bdi->capabilities & BDI_CAP_SWAP_BACKED;
}
+static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
+{
+ return bdi == &default_backing_dev_info;
+}
+
static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
{
return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -276,4 +339,10 @@ static inline bool mapping_cap_swap_backed(struct address_space *mapping)
return bdi_cap_swap_backed(mapping->backing_dev_info);
}
+static inline int bdi_sched_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
#endif /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2892b710771c..5be93f18d842 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -142,56 +142,51 @@ struct bio {
*
* bit 0 -- data direction
* If not set, bio is a read from device. If set, it's a write to device.
- * bit 1 -- rw-ahead when set
- * bit 2 -- barrier
+ * bit 1 -- fail fast device errors
+ * bit 2 -- fail fast transport errors
+ * bit 3 -- fail fast driver errors
+ * bit 4 -- rw-ahead when set
+ * bit 5 -- barrier
* Insert a serialization point in the IO queue, forcing previously
* submitted IO to be completed before this one is issued.
- * bit 3 -- synchronous I/O hint.
- * bit 4 -- Unplug the device immediately after submitting this bio.
- * bit 5 -- metadata request
+ * bit 6 -- synchronous I/O hint.
+ * bit 7 -- Unplug the device immediately after submitting this bio.
+ * bit 8 -- metadata request
* Used for tracing to differentiate metadata and data IO. May also
* get some preferential treatment in the IO scheduler
- * bit 6 -- discard sectors
+ * bit 9 -- discard sectors
* Informs the lower level device that this range of sectors is no longer
* used by the file system and may thus be freed by the device. Used
* for flash based storage.
- * bit 7 -- fail fast device errors
- * bit 8 -- fail fast transport errors
- * bit 9 -- fail fast driver errors
* Don't want driver retries for any fast fail whatever the reason.
* bit 10 -- Tell the IO scheduler not to wait for more requests after this
one has been submitted, even if it is a SYNC request.
*/
-#define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
-#define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
-#define BIO_RW_BARRIER 2
-#define BIO_RW_SYNCIO 3
-#define BIO_RW_UNPLUG 4
-#define BIO_RW_META 5
-#define BIO_RW_DISCARD 6
-#define BIO_RW_FAILFAST_DEV 7
-#define BIO_RW_FAILFAST_TRANSPORT 8
-#define BIO_RW_FAILFAST_DRIVER 9
-#define BIO_RW_NOIDLE 10
-
-#define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag)))
+enum bio_rw_flags {
+ BIO_RW,
+ BIO_RW_FAILFAST_DEV,
+ BIO_RW_FAILFAST_TRANSPORT,
+ BIO_RW_FAILFAST_DRIVER,
+ /* above flags must match REQ_* */
+ BIO_RW_AHEAD,
+ BIO_RW_BARRIER,
+ BIO_RW_SYNCIO,
+ BIO_RW_UNPLUG,
+ BIO_RW_META,
+ BIO_RW_DISCARD,
+ BIO_RW_NOIDLE,
+};
/*
- * Old defines, these should eventually be replaced by direct usage of
- * bio_rw_flagged()
+ * First four bits must match between bio->bi_rw and rq->cmd_flags, make
+ * that explicit here.
*/
-#define bio_barrier(bio) bio_rw_flagged(bio, BIO_RW_BARRIER)
-#define bio_sync(bio) bio_rw_flagged(bio, BIO_RW_SYNCIO)
-#define bio_unplug(bio) bio_rw_flagged(bio, BIO_RW_UNPLUG)
-#define bio_failfast_dev(bio) bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV)
-#define bio_failfast_transport(bio) \
- bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT)
-#define bio_failfast_driver(bio) \
- bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER)
-#define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD)
-#define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META)
-#define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD)
-#define bio_noidle(bio) bio_rw_flagged(bio, BIO_RW_NOIDLE)
+#define BIO_RW_RQ_MASK 0xf
+
+static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
+{
+ return (bio->bi_rw & (1 << flag)) != 0;
+}
/*
* upper 16 bits of bi_rw define the io priority of this bio
@@ -216,7 +211,7 @@ struct bio {
#define bio_offset(bio) bio_iovec((bio))->bv_offset
#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
#define bio_sectors(bio) ((bio)->bi_size >> 9)
-#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
+#define bio_empty_barrier(bio) (bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD))
static inline unsigned int bio_cur_bytes(struct bio *bio)
{
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
new file mode 100644
index 000000000000..308734d3d4a2
--- /dev/null
+++ b/include/linux/blk-iopoll.h
@@ -0,0 +1,48 @@
+#ifndef BLK_IOPOLL_H
+#define BLK_IOPOLL_H
+
+struct blk_iopoll;
+typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
+
+struct blk_iopoll {
+ struct list_head list;
+ unsigned long state;
+ unsigned long data;
+ int weight;
+ int max;
+ blk_iopoll_fn *poll;
+};
+
+enum {
+ IOPOLL_F_SCHED = 0,
+ IOPOLL_F_DISABLE = 1,
+};
+
+/*
+ * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
+ * that we were the first to acquire this iop for scheduling. If this iop
+ * is currently disabled, return "failure".
+ */
+static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
+{
+ if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
+ return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+
+ return 1;
+}
+
+static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
+{
+ return test_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+
+extern void blk_iopoll_sched(struct blk_iopoll *);
+extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
+extern void blk_iopoll_complete(struct blk_iopoll *);
+extern void __blk_iopoll_complete(struct blk_iopoll *);
+extern void blk_iopoll_enable(struct blk_iopoll *);
+extern void blk_iopoll_disable(struct blk_iopoll *);
+
+extern int blk_iopoll_enabled;
+
+#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 69103e053c92..4c0f724511f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -86,13 +86,14 @@ enum {
};
/*
- * request type modified bits. first two bits match BIO_RW* bits, important
+ * request type modified bits. first four bits match BIO_RW* bits, important
*/
enum rq_flag_bits {
__REQ_RW, /* not set, read. set, write */
__REQ_FAILFAST_DEV, /* no driver retries of device errors */
__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
__REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */
+ /* above flags must match BIO_RW_* */
__REQ_DISCARD, /* request to discard sectors */
__REQ_SORTED, /* elevator knows about this request */
__REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
@@ -114,6 +115,7 @@ enum rq_flag_bits {
__REQ_INTEGRITY, /* integrity metadata has been remapped */
__REQ_NOIDLE, /* Don't anticipate more IO after this one */
__REQ_IO_STAT, /* account I/O stat */
+ __REQ_MIXED_MERGE, /* merge of different types, fail separately */
__REQ_NR_BITS, /* stops here */
};
@@ -142,6 +144,10 @@ enum rq_flag_bits {
#define REQ_INTEGRITY (1 << __REQ_INTEGRITY)
#define REQ_NOIDLE (1 << __REQ_NOIDLE)
#define REQ_IO_STAT (1 << __REQ_IO_STAT)
+#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
+
+#define REQ_FAILFAST_MASK (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
+ REQ_FAILFAST_DRIVER)
#define BLK_MAX_CDB 16
@@ -453,6 +459,7 @@ struct request_queue
#define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */
#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */
#define QUEUE_FLAG_IO_STAT 15 /* do IO stats */
+#define QUEUE_FLAG_CQ 16 /* hardware does queuing */
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_CLUSTER) | \
@@ -575,6 +582,7 @@ enum {
#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
+#define blk_queue_queuing(q) test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
@@ -828,11 +836,13 @@ static inline void blk_run_address_space(struct address_space *mapping)
}
/*
- * blk_rq_pos() : the current sector
- * blk_rq_bytes() : bytes left in the entire request
- * blk_rq_cur_bytes() : bytes left in the current segment
- * blk_rq_sectors() : sectors left in the entire request
- * blk_rq_cur_sectors() : sectors left in the current segment
+ * blk_rq_pos() : the current sector
+ * blk_rq_bytes() : bytes left in the entire request
+ * blk_rq_cur_bytes() : bytes left in the current segment
+ * blk_rq_err_bytes() : bytes left till the next error boundary
+ * blk_rq_sectors() : sectors left in the entire request
+ * blk_rq_cur_sectors() : sectors left in the current segment
+ * blk_rq_err_sectors() : sectors left till the next error boundary
*/
static inline sector_t blk_rq_pos(const struct request *rq)
{
@@ -849,6 +859,8 @@ static inline int blk_rq_cur_bytes(const struct request *rq)
return rq->bio ? bio_cur_bytes(rq->bio) : 0;
}
+extern unsigned int blk_rq_err_bytes(const struct request *rq);
+
static inline unsigned int blk_rq_sectors(const struct request *rq)
{
return blk_rq_bytes(rq) >> 9;
@@ -859,6 +871,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> 9;
}
+static inline unsigned int blk_rq_err_sectors(const struct request *rq)
+{
+ return blk_rq_err_bytes(rq) >> 9;
+}
+
/*
* Request issue related functions.
*/
@@ -885,10 +902,12 @@ extern bool blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void blk_end_request_all(struct request *rq, int error);
extern bool blk_end_request_cur(struct request *rq, int error);
+extern bool blk_end_request_err(struct request *rq, int error);
extern bool __blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void __blk_end_request_all(struct request *rq, int error);
extern bool __blk_end_request_cur(struct request *rq, int error);
+extern bool __blk_end_request_err(struct request *rq, int error);
extern void blk_complete_request(struct request *);
extern void __blk_complete_request(struct request *);
@@ -921,7 +940,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset);
-extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern int blk_queue_dma_drain(struct request_queue *q,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67888a9e0655..936017fe7214 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -715,7 +715,7 @@ struct posix_acl;
struct inode {
struct hlist_node i_hash;
- struct list_head i_list;
+ struct list_head i_list; /* backing dev IO list */
struct list_head i_sb_list;
struct list_head i_dentry;
unsigned long i_ino;
@@ -1336,9 +1336,6 @@ struct super_block {
struct xattr_handler **s_xattr;
struct list_head s_inodes; /* all inodes */
- struct list_head s_dirty; /* dirty inodes */
- struct list_head s_io; /* parked for writeback */
- struct list_head s_more_io; /* parked for more writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
@@ -1555,11 +1552,14 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+struct bdi_writeback;
+
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *);
+ struct bdi_writeback *(*inode_get_wb) (struct inode *);
int (*write_inode) (struct inode *, int);
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
@@ -2072,6 +2072,8 @@ extern int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern void generic_sync_sb_inodes(struct super_block *sb,
struct writeback_control *wbc);
+extern void generic_sync_bdi_inodes(struct super_block *sb,
+ struct writeback_control *);
extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
@@ -2186,7 +2188,6 @@ extern int bdev_read_only(struct block_device *);
extern int set_blocksize(struct block_device *, int);
extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);
-extern int sb_has_dirty_inodes(struct super_block *);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6967f6aab33a..b7ba6946fa82 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -342,6 +342,7 @@ enum
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
+ BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3224820c8514..e070b91905d6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,17 +14,6 @@ extern struct list_head inode_in_use;
extern struct list_head inode_unused;
/*
- * Yes, writeback.h requires sched.h
- * No, sched.h is not included from here.
- */
-static inline int task_is_pdflush(struct task_struct *task)
-{
- return task->flags & PF_FLUSHER;
-}
-
-#define current_is_pdflush() task_is_pdflush(current)
-
-/*
* fs/fs-writeback.c
*/
enum writeback_sync_modes {
@@ -79,6 +68,7 @@ struct writeback_control {
void writeback_inodes(struct writeback_control *wbc);
int inode_wait(void *);
void sync_inodes_sb(struct super_block *, int wait);
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
@@ -98,7 +88,7 @@ static inline void inode_sync_wait(struct inode *inode)
/*
* mm/page-writeback.c
*/
-int wakeup_pdflush(long nr_pages);
+void wakeup_flusher_threads(long nr_pages);
void laptop_io_completion(void);
void laptop_sync_completion(void);
void throttle_vm_writeout(gfp_t gfp_mask);
@@ -150,7 +140,6 @@ balance_dirty_pages_ratelimited(struct address_space *mapping)
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int write_cache_pages(struct address_space *mapping,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;
static struct backing_dev_info cgroup_backing_dev_info = {
+ .name = "cgroup",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98e02328c67d..121837ee602a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -91,6 +91,7 @@ extern int sysctl_nr_trim_pages;
#ifdef CONFIG_RCU_TORTURE_TEST
extern int rcutorture_runnable;
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+extern int blk_iopoll_enabled;
/* Constants used for minimum and maximum */
#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -989,7 +990,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "blk_iopoll",
+ .data = &blk_iopoll_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o pdflush.o \
+ maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..036b07ba393b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
+ .name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
+ struct bdi_writeback *wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+ struct inode *inode;
+
+ /*
+ * inode lock is enough here, the bdi->wb_list is protected by
+ * RCU on the reader side
+ */
+ nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&inode_lock);
+ list_for_each_entry(wb, &bdi->wb_list, list) {
+ nr_wb++;
+ list_for_each_entry(inode, &wb->b_dirty, i_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_list)
+ nr_more_io++;
+ }
+ spin_unlock(&inode_lock);
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiReclaimable: %8lu kB\n"
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n",
+ "BackgroundThresh: %8lu kB\n"
+ "WriteBack threads:%8lu\n"
+ "b_dirty: %8lu\n"
+ "b_io: %8lu\n"
+ "b_more_io: %8lu\n"
+ "bdi_list: %8u\n"
+ "state: %8lx\n"
+ "wb_mask: %8lx\n"
+ "wb_list: %8u\n"
+ "wb_cnt: %8u\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh),
- K(dirty_thresh),
- K(background_thresh));
+ K(bdi_thresh), K(dirty_thresh),
+ K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
+ !list_empty(&bdi->wb_list), bdi->wb_cnt);
#undef K
return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
{
int err;
+ sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+ BUG_ON(IS_ERR(sync_supers_tsk));
+
+ init_timer(&sync_supers_timer);
+ setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+ arm_supers_timer();
+
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,390 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
+static int wb_assign_nr(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+ unsigned long mask = BDI_MAX_FLUSHERS - 1;
+ unsigned int nr;
+
+ do {
+ if ((bdi->wb_mask & mask) == mask)
+ return 1;
+
+ nr = find_first_zero_bit(&bdi->wb_mask, BDI_MAX_FLUSHERS);
+ } while (test_and_set_bit(nr, &bdi->wb_mask));
+
+ wb->nr = nr;
+
+ spin_lock(&bdi->wb_lock);
+ bdi->wb_cnt++;
+ spin_unlock(&bdi->wb_lock);
+
+ return 0;
+}
+
+static void bdi_put_wb(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+ /*
+ * If this is the default wb thread exiting, leave the bit set
+ * in the wb mask as we set that before it's created as well. This
+ * is done to make sure that assigned work with no thread has at
+ * least one receipient.
+ */
+ if (wb == &bdi->wb)
+ clear_bit(BDI_wb_alloc, &bdi->state);
+ else {
+ clear_bit(wb->nr, &bdi->wb_mask);
+ kfree(wb);
+ spin_lock(&bdi->wb_lock);
+ bdi->wb_cnt--;
+ spin_unlock(&bdi->wb_lock);
+ }
+}
+
+static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+
+ return wb_assign_nr(bdi, wb);
+}
+
+static struct bdi_writeback *bdi_new_wb(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+
+ /*
+ * Default bdi->wb is already assigned, so just return it
+ */
+ if (!test_and_set_bit(BDI_wb_alloc, &bdi->state))
+ wb = &bdi->wb;
+ else {
+ wb = kmalloc(sizeof(struct bdi_writeback), GFP_KERNEL);
+ if (wb) {
+ if (bdi_wb_init(wb, bdi)) {
+ kfree(wb);
+ wb = NULL;
+ }
+ }
+ }
+
+ return wb;
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct task_struct *tsk = current;
+ int was_empty;
+
+ /*
+ * Add us to the active bdi_list. If we are adding threads beyond
+ * the default embedded bdi_writeback, then we need to start using
+ * proper locking. Check the list for empty first, then set the
+ * BDI_wblist_lock flag if there's > 1 entry on the list now
+ */
+ spin_lock(&bdi->wb_lock);
+
+ was_empty = list_empty(&bdi->wb_list);
+ list_add_tail_rcu(&wb->list, &bdi->wb_list);
+ if (!was_empty)
+ set_bit(BDI_wblist_lock, &bdi->state);
+
+ spin_unlock(&bdi->wb_lock);
+
+ tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+ struct bdi_writeback *wb = ptr;
+ struct backing_dev_info *bdi = wb->bdi;
+ int ret;
+
+ /*
+ * Add us to the active bdi_list
+ */
+ spin_lock(&bdi_lock);
+ list_add(&bdi->bdi_list, &bdi_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_task_init(bdi, wb);
+
+ /*
+ * Clear pending bit and wakeup anybody waiting to tear us down
+ */
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+
+ ret = bdi_writeback_task(wb);
+
+ /*
+ * Remove us from the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&wb->list);
+ spin_unlock(&bdi->wb_lock);
+
+ /*
+ * wait for rcu grace period to end, so we can free wb
+ */
+ synchronize_srcu(&bdi->srcu);
+
+ /*
+ * Flush any work that raced with us exiting. No new work
+ * will be added, since this bdi isn't discoverable anymore.
+ */
+ if (!list_empty(&bdi->work_list))
+ wb_do_writeback(wb, 1);
+
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+ return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+ int ret = 0;
+
+ if (!bdi_wblist_needs_lock(bdi))
+ ret = wb_has_dirty_io(&bdi->wb);
+ else {
+ int idx;
+
+ idx = srcu_read_lock(&bdi->srcu);
+
+ list_for_each_entry_rcu(wb, &bdi->wb_list, list) {
+ ret = wb_has_dirty_io(wb);
+ if (ret)
+ break;
+ }
+
+ srcu_read_unlock(&bdi->srcu, idx);
+ }
+
+ return ret;
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .range_cyclic = 1,
+ .nr_to_write = 1024,
+ };
+
+ generic_sync_bdi_inodes(NULL, &wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+ set_user_nice(current, 0);
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+ /*
+ * Do this periodically, like kupdated() did before.
+ */
+ sync_supers();
+ }
+
+ return 0;
+}
+
+static void arm_supers_timer(void)
+{
+ unsigned long next;
+
+ next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+ mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+ wake_up_process(sync_supers_tsk);
+ arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
+
+ bdi_task_init(me->bdi, me);
+
+ for (;;) {
+ struct backing_dev_info *bdi, *tmp;
+ struct bdi_writeback *wb;
+
+ /*
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
+ */
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+ wb_do_writeback(me, 0);
+
+ spin_lock(&bdi_lock);
+
+ /*
+ * Check if any existing bdi's have dirty data without
+ * a thread registered. If so, set that up.
+ */
+ list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+ if (bdi->wb.task)
+ continue;
+ if (list_empty(&bdi->work_list) &&
+ !bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_add_default_flusher_task(bdi);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (list_empty(&bdi_pending_list)) {
+ unsigned long wait;
+
+ spin_unlock(&bdi_lock);
+ wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout(wait);
+ try_to_freeze();
+ continue;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * This is our real job - check for pending entries in
+ * bdi_pending_list, and create the tasks that got added
+ */
+ bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+ bdi_list);
+ list_del_init(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+
+ wb = bdi_new_wb(bdi);
+ if (!wb)
+ goto readd_flush;
+
+ wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+ dev_name(bdi->dev));
+ /*
+ * If task creation fails, then readd the bdi to
+ * the pending list and force writeout of the bdi
+ * from this forker thread. That will free some memory
+ * and we can try again.
+ */
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+readd_flush:
+ /*
+ * Add this 'bdi' to the back, so we get
+ * a chance to flush other bdi's to free
+ * memory.
+ */
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ bdi_flush_io(bdi);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * bdi_lock held on entry
+ */
+static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
+ int(*func)(struct backing_dev_info *))
+{
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
+ printk("bdi %p/%s is not registered!\n", bdi, bdi->name);
+ return;
+ }
+
+ /*
+ * Check with the helper whether to proceed adding a task. Will only
+ * abort if we two or more simultanous calls to
+ * bdi_add_default_flusher_task() occured, further additions will block
+ * waiting for previous additions to finish.
+ */
+ if (!func(bdi)) {
+ list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+ /*
+ * We are now on the pending list, wake up bdi_forker_task()
+ * to finish the job and add us back to the active bdi_list
+ */
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+}
+
+static int flusher_add_helper_block(struct backing_dev_info *bdi)
+{
+ spin_unlock(&bdi_lock);
+ wait_on_bit_lock(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&bdi_lock);
+ return 0;
+}
+
+static int flusher_add_helper_test(struct backing_dev_info *bdi)
+{
+ return test_and_set_bit(BDI_pending, &bdi->state);
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+ bdi_add_one_flusher_task(bdi, flusher_add_helper_test);
+}
+
+/**
+ * bdi_add_flusher_task - add one more flusher task to this @bdi
+ * @bdi: the bdi
+ *
+ * Add an additional flusher task to this @bdi. Will block waiting on
+ * previous additions, if any.
+ *
+ */
+void bdi_add_flusher_task(struct backing_dev_info *bdi)
+{
+ spin_lock(&bdi_lock);
+ bdi_add_one_flusher_task(bdi, flusher_add_helper_block);
+ spin_unlock(&bdi_lock);
+}
+EXPORT_SYMBOL(bdi_add_flusher_task);
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -211,9 +648,42 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_list);
+ spin_unlock(&bdi_lock);
+
bdi->dev = dev;
- bdi_debug_register(bdi, dev_name(dev));
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi_cap_flush_forker(bdi)) {
+ struct bdi_writeback *wb;
+
+ wb = bdi_new_wb(bdi);
+ if (!wb) {
+ ret = -ENOMEM;
+ goto remove_err;
+ }
+
+ wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ dev_name(dev));
+ if (IS_ERR(wb->task)) {
+ wb->task = NULL;
+ bdi_put_wb(bdi, wb);
+ ret = -ENOMEM;
+remove_err:
+ spin_lock(&bdi_lock);
+ list_del(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+ goto exit;
+ }
+ }
+
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(BDI_registered, &bdi->state);
exit:
return ret;
}
@@ -225,9 +695,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+ struct bdi_writeback *wb;
+
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * If setup is pending, wait for that to complete first
+ */
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Make sure nobody finds us on the bdi_list anymore
+ */
+ spin_lock(&bdi_lock);
+ list_del(&bdi->bdi_list);
+ spin_unlock(&bdi_lock);
+
+ /*
+ * Finally, kill the kernel threads. We don't need to be RCU
+ * safe anymore, since the bdi is gone from visibility.
+ */
+ list_for_each_entry(wb, &bdi->wb_list, list)
+ kthread_stop(wb->task);
+}
+
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ if (!bdi_cap_flush_forker(bdi))
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
bdi->dev = NULL;
@@ -237,14 +740,21 @@ EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
- int i;
- int err;
+ int i, err;
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
+ spin_lock_init(&bdi->wb_lock);
+ bdi->wb_mask = 0;
+ bdi->wb_cnt = 0;
+ INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->wb_list);
+ INIT_LIST_HEAD(&bdi->work_list);
+
+ bdi_wb_init(&bdi->wb, bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -252,10 +762,15 @@ int bdi_init(struct backing_dev_info *bdi)
goto err;
}
+ err = init_srcu_struct(&bdi->srcu);
+ if (err)
+ goto err;
+
bdi->dirty_exceeded = 0;
err = prop_local_init_percpu(&bdi->completions);
if (err) {
+ cleanup_srcu_struct(&bdi->srcu);
err:
while (i--)
percpu_counter_destroy(&bdi->bdi_stat[i]);
@@ -269,8 +784,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
+ WARN_ON(bdi_has_dirty_io(bdi));
+
bdi_unregister(bdi);
+ cleanup_srcu_struct(&bdi->srcu);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..a727af9904e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
#include <linux/pagevec.h>
/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
-/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
-
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
/*
*
*/
-static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- unsigned long flags;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
- unsigned long flags;
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes(&wbc);
+ generic_sync_bdi_inodes(NULL, &wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
@@ -597,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping)
(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS)
> background_thresh)))
- pdflush_operation(background_writeout, 0);
+ bdi_start_writeback(bdi, NULL, 0, WB_SYNC_NONE);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -682,152 +668,53 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
*/
-static void background_writeout(unsigned long _min_pages)
+void wakeup_flusher_threads(long nr_pages)
{
- long min_pages = _min_pages;
struct writeback_control wbc = {
- .bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
.range_cyclic = 1,
};
- for ( ; ; ) {
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
- break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ wbc.nr_to_write = nr_pages;
+ bdi_writeback_all(NULL, &wbc);
}
-static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
- start_jif = jiffies;
- next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
-}
-
-/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, jiffies +
- msecs_to_jiffies(dirty_writeback_interval * 10));
- else
- del_timer(&wb_timer);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
-{
- sys_sync();
+ wakeup_flusher_threads(0);
+ kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_laptop_sync);
+ schedule_work(work);
+ }
}
/*
@@ -910,8 +797,6 @@ void __init page_writeback_init(void)
{
int shift;
- mod_timer(&wb_timer,
- jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * mm/pdflush.c - worker threads for writing back filesystem data
- *
- * Copyright (C) 2002, Linus Torvalds.
- *
- * 09Apr2002 Andrew Morton
- * Initial version
- * 29Feb2004 kaos@sgi.com
- * Move worker thread creation to kthread to avoid chewing
- * up stack space with nested calls to kernel_thread.
- */
-
-#include <linux/sched.h>
-#include <linux/list.h>
-#include <linux/signal.h>
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h> /* Needed by writeback.h */
-#include <linux/writeback.h> /* Prototypes pdflush_operation() */
-#include <linux/kthread.h>
-#include <linux/cpuset.h>
-#include <linux/freezer.h>
-
-
-/*
- * Minimum and maximum number of pdflush instances
- */
-#define MIN_PDFLUSH_THREADS 2
-#define MAX_PDFLUSH_THREADS 8
-
-static void start_one_pdflush_thread(void);
-
-
-/*
- * The pdflush threads are worker threads for writing back dirty data.
- * Ideally, we'd like one thread per active disk spindle. But the disk
- * topology is very hard to divine at this level. Instead, we take
- * care in various places to prevent more than one pdflush thread from
- * performing writeback against a single filesystem. pdflush threads
- * have the PF_FLUSHER flag set in current->flags to aid in this.
- */
-
-/*
- * All the pdflush threads. Protected by pdflush_lock
- */
-static LIST_HEAD(pdflush_list);
-static DEFINE_SPINLOCK(pdflush_lock);
-
-/*
- * The count of currently-running pdflush threads. Protected
- * by pdflush_lock.
- *
- * Readable by sysctl, but not writable. Published to userspace at
- * /proc/sys/vm/nr_pdflush_threads.
- */
-int nr_pdflush_threads = 0;
-
-/*
- * The time at which the pdflush thread pool last went empty
- */
-static unsigned long last_empty_jifs;
-
-/*
- * The pdflush thread.
- *
- * Thread pool management algorithm:
- *
- * - The minimum and maximum number of pdflush instances are bound
- * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
- *
- * - If there have been no idle pdflush instances for 1 second, create
- * a new one.
- *
- * - If the least-recently-went-to-sleep pdflush thread has been asleep
- * for more than one second, terminate a thread.
- */
-
-/*
- * A structure for passing work to a pdflush thread. Also for passing
- * state information between pdflush threads. Protected by pdflush_lock.
- */
-struct pdflush_work {
- struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
- unsigned long arg0; /* An argument to the callback */
- struct list_head list; /* On pdflush_list, when idle */
- unsigned long when_i_went_to_sleep;
-};
-
-static int __pdflush(struct pdflush_work *my_work)
-{
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
- my_work->fn = NULL;
- my_work->who = current;
- INIT_LIST_HEAD(&my_work->list);
-
- spin_lock_irq(&pdflush_lock);
- for ( ; ; ) {
- struct pdflush_work *pdf;
-
- set_current_state(TASK_INTERRUPTIBLE);
- list_move(&my_work->list, &pdflush_list);
- my_work->when_i_went_to_sleep = jiffies;
- spin_unlock_irq(&pdflush_lock);
- schedule();
- try_to_freeze();
- spin_lock_irq(&pdflush_lock);
- if (!list_empty(&my_work->list)) {
- /*
- * Someone woke us up, but without removing our control
- * structure from the global list. swsusp will do this
- * in try_to_freeze()->refrigerator(). Handle it.
- */
- my_work->fn = NULL;
- continue;
- }
- if (my_work->fn == NULL) {
- printk("pdflush: bogus wakeup\n");
- continue;
- }
- spin_unlock_irq(&pdflush_lock);
-
- (*my_work->fn)(my_work->arg0);
-
- spin_lock_irq(&pdflush_lock);
-
- /*
- * Thread creation: For how long have there been zero
- * available threads?
- *
- * To throttle creation, we reset last_empty_jifs.
- */
- if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
- if (list_empty(&pdflush_list)) {
- if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
- last_empty_jifs = jiffies;
- nr_pdflush_threads++;
- spin_unlock_irq(&pdflush_lock);
- start_one_pdflush_thread();
- spin_lock_irq(&pdflush_lock);
- }
- }
- }
-
- my_work->fn = NULL;
-
- /*
- * Thread destruction: For how long has the sleepiest
- * thread slept?
- */
- if (list_empty(&pdflush_list))
- continue;
- if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
- continue;
- pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
- if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
- /* Limit exit rate */
- pdf->when_i_went_to_sleep = jiffies;
- break; /* exeunt */
- }
- }
- nr_pdflush_threads--;
- spin_unlock_irq(&pdflush_lock);
- return 0;
-}
-
-/*
- * Of course, my_work wants to be just a local in __pdflush(). It is
- * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations against the auto variables. Because
- * these are visible to other tasks and CPUs. (No problem has actually
- * been observed. This is just paranoia).
- */
-static int pdflush(void *dummy)
-{
- struct pdflush_work my_work;
- cpumask_var_t cpus_allowed;
-
- /*
- * Since the caller doesn't even check kthread_run() worked, let's not
- * freak out too much if this fails.
- */
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
- return 0;
- }
-
- /*
- * pdflush can spend a lot of time doing encryption via dm-crypt. We
- * don't want to do that at keventd's priority.
- */
- set_user_nice(current, 0);
-
- /*
- * Some configs put our parent kthread in a limited cpuset,
- * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
- * Our needs are more modest - cut back to our cpusets cpus_allowed.
- * This is needed as pdflush's are dynamically created and destroyed.
- * The boottime pdflush's are easily placed w/o these 2 lines.
- */
- cpuset_cpus_allowed(current, cpus_allowed);
- set_cpus_allowed_ptr(current, cpus_allowed);
- free_cpumask_var(cpus_allowed);
-
- return __pdflush(&my_work);
-}
-
-/*
- * Attempt to wake up a pdflush thread, and get it to do some work for you.
- * Returns zero if it indeed managed to find a worker thread, and passed your
- * payload to it.
- */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
-{
- unsigned long flags;
- int ret = 0;
-
- BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
-
- spin_lock_irqsave(&pdflush_lock, flags);
- if (list_empty(&pdflush_list)) {
- ret = -1;
- } else {
- struct pdflush_work *pdf;
-
- pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
- list_del_init(&pdf->list);
- if (list_empty(&pdflush_list))
- last_empty_jifs = jiffies;
- pdf->fn = fn;
- pdf->arg0 = arg0;
- wake_up_process(pdf->who);
- }
- spin_unlock_irqrestore(&pdflush_lock, flags);
-
- return ret;
-}
-
-static void start_one_pdflush_thread(void)
-{
- struct task_struct *k;
-
- k = kthread_run(pdflush, NULL, "pdflush");
- if (unlikely(IS_ERR(k))) {
- spin_lock_irq(&pdflush_lock);
- nr_pdflush_threads--;
- spin_unlock_irq(&pdflush_lock);
- }
-}
-
-static int __init pdflush_init(void)
-{
- int i;
-
- /*
- * Pre-set nr_pdflush_threads... If we fail to create,
- * the count will be decremented.
- */
- nr_pdflush_threads = MIN_PDFLUSH_THREADS;
-
- for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
- start_one_pdflush_thread();
- return 0;
-}
-
-module_init(pdflush_init);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
+ .name = "swap",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
.unplug_io_fn = swap_unplug_io_fn,
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..fe8e986bb6a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1715,7 +1715,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (total_scanned > sc->swap_cluster_max +
sc->swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}