diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2010-10-21 12:35:49 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2010-10-21 12:35:53 +1100 |
commit | 2f7ecf517c6e67745c34187fad8f7bde563247c7 (patch) | |
tree | 771d41aa881117123436993010311f5b72ebedd9 | |
parent | 548b3df9c02434463d23d1597b1c16f14042d23a (diff) | |
parent | cc3fd878cf6f6681e81922fda2fd129a768ca007 (diff) |
Merge remote branch 'block/for-next'
Conflicts:
fs/ext4/mballoc.c
130 files changed, 5686 insertions, 3592 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index a406286f6f3e..d111e3b23db0 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,7 +1,5 @@ 00-INDEX - This file -barrier.txt - - I/O Barriers biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 capability.txt @@ -16,3 +14,5 @@ stat.txt - Block layer statistics in /sys/block/<dev>/stat switching-sched.txt - Switching I/O schedulers at runtime +writeback_cache_control.txt + - Control of volatile write back caches diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt deleted file mode 100644 index 2c2f24f634e4..000000000000 --- a/Documentation/block/barrier.txt +++ /dev/null @@ -1,261 +0,0 @@ -I/O Barriers -============ -Tejun Heo <htejun@gmail.com>, July 22 2005 - -I/O barrier requests are used to guarantee ordering around the barrier -requests. Unless you're crazy enough to use disk drives for -implementing synchronization constructs (wow, sounds interesting...), -the ordering is meaningful only for write requests for things like -journal checkpoints. All requests queued before a barrier request -must be finished (made it to the physical medium) before the barrier -request is started, and all requests queued after the barrier request -must be started only after the barrier request is finished (again, -made it to the physical medium). - -In other words, I/O barrier requests have the following two properties. - -1. Request ordering - -Requests cannot pass the barrier request. Preceding requests are -processed before the barrier and following requests after. - -Depending on what features a drive supports, this can be done in one -of the following three ways. - -i. For devices which have queue depth greater than 1 (TCQ devices) and -support ordered tags, block layer can just issue the barrier as an -ordered request and the lower level driver, controller and drive -itself are responsible for making sure that the ordering constraint is -met. Most modern SCSI controllers/drives should support this. - -NOTE: SCSI ordered tag isn't currently used due to limitation in the - SCSI midlayer, see the following random notes section. - -ii. For devices which have queue depth greater than 1 but don't -support ordered tags, block layer ensures that the requests preceding -a barrier request finishes before issuing the barrier request. Also, -it defers requests following the barrier until the barrier request is -finished. Older SCSI controllers/drives and SATA drives fall in this -category. - -iii. Devices which have queue depth of 1. This is a degenerate case -of ii. Just keeping issue order suffices. Ancient SCSI -controllers/drives and IDE drives are in this category. - -2. Forced flushing to physical medium - -Again, if you're not gonna do synchronization with disk drives (dang, -it sounds even more appealing now!), the reason you use I/O barriers -is mainly to protect filesystem integrity when power failure or some -other events abruptly stop the drive from operating and possibly make -the drive lose data in its cache. So, I/O barriers need to guarantee -that requests actually get written to non-volatile medium in order. - -There are four cases, - -i. No write-back cache. Keeping requests ordered is enough. - -ii. Write-back cache but no flush operation. There's no way to -guarantee physical-medium commit order. This kind of devices can't to -I/O barriers. - -iii. Write-back cache and flush operation but no FUA (forced unit -access). We need two cache flushes - before and after the barrier -request. - -iv. Write-back cache, flush operation and FUA. We still need one -flush to make sure requests preceding a barrier are written to medium, -but post-barrier flush can be avoided by using FUA write on the -barrier itself. - - -How to support barrier requests in drivers ------------------------------------------- - -All barrier handling is done inside block layer proper. All low level -drivers have to are implementing its prepare_flush_fn and using one -the following two functions to indicate what barrier type it supports -and how to prepare flush requests. Note that the term 'ordered' is -used to indicate the whole sequence of performing barrier requests -including draining and flushing. - -typedef void (prepare_flush_fn)(struct request_queue *q, struct request *rq); - -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn); - -@q : the queue in question -@ordered : the ordered mode the driver/device supports -@prepare_flush_fn : this function should prepare @rq such that it - flushes cache to physical medium when executed - -For example, SCSI disk driver's prepare_flush_fn looks like the -following. - -static void sd_prepare_flush(struct request_queue *q, struct request *rq) -{ - memset(rq->cmd, 0, sizeof(rq->cmd)); - rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->timeout = SD_TIMEOUT; - rq->cmd[0] = SYNCHRONIZE_CACHE; - rq->cmd_len = 10; -} - -The following seven ordered modes are supported. The following table -shows which mode should be used depending on what features a -device/driver supports. In the leftmost column of table, -QUEUE_ORDERED_ prefix is omitted from the mode names to save space. - -The table is followed by description of each mode. Note that in the -descriptions of QUEUE_ORDERED_DRAIN*, '=>' is used whereas '->' is -used for QUEUE_ORDERED_TAG* descriptions. '=>' indicates that the -preceding step must be complete before proceeding to the next step. -'->' indicates that the next step can start as soon as the previous -step is issued. - - write-back cache ordered tag flush FUA ------------------------------------------------------------------------ -NONE yes/no N/A no N/A -DRAIN no no N/A N/A -DRAIN_FLUSH yes no yes no -DRAIN_FUA yes no yes yes -TAG no yes N/A N/A -TAG_FLUSH yes yes yes no -TAG_FUA yes yes yes yes - - -QUEUE_ORDERED_NONE - I/O barriers are not needed and/or supported. - - Sequence: N/A - -QUEUE_ORDERED_DRAIN - Requests are ordered by draining the request queue and cache - flushing isn't needed. - - Sequence: drain => barrier - -QUEUE_ORDERED_DRAIN_FLUSH - Requests are ordered by draining the request queue and both - pre-barrier and post-barrier cache flushings are needed. - - Sequence: drain => preflush => barrier => postflush - -QUEUE_ORDERED_DRAIN_FUA - Requests are ordered by draining the request queue and - pre-barrier cache flushing is needed. By using FUA on barrier - request, post-barrier flushing can be skipped. - - Sequence: drain => preflush => barrier - -QUEUE_ORDERED_TAG - Requests are ordered by ordered tag and cache flushing isn't - needed. - - Sequence: barrier - -QUEUE_ORDERED_TAG_FLUSH - Requests are ordered by ordered tag and both pre-barrier and - post-barrier cache flushings are needed. - - Sequence: preflush -> barrier -> postflush - -QUEUE_ORDERED_TAG_FUA - Requests are ordered by ordered tag and pre-barrier cache - flushing is needed. By using FUA on barrier request, - post-barrier flushing can be skipped. - - Sequence: preflush -> barrier - - -Random notes/caveats --------------------- - -* SCSI layer currently can't use TAG ordering even if the drive, -controller and driver support it. The problem is that SCSI midlayer -request dispatch function is not atomic. It releases queue lock and -switch to SCSI host lock during issue and it's possible and likely to -happen in time that requests change their relative positions. Once -this problem is solved, TAG ordering can be enabled. - -* Currently, no matter which ordered mode is used, there can be only -one barrier request in progress. All I/O barriers are held off by -block layer until the previous I/O barrier is complete. This doesn't -make any difference for DRAIN ordered devices, but, for TAG ordered -devices with very high command latency, passing multiple I/O barriers -to low level *might* be helpful if they are very frequent. Well, this -certainly is a non-issue. I'm writing this just to make clear that no -two I/O barrier is ever passed to low-level driver. - -* Completion order. Requests in ordered sequence are issued in order -but not required to finish in order. Barrier implementation can -handle out-of-order completion of ordered sequence. IOW, the requests -MUST be processed in order but the hardware/software completion paths -are allowed to reorder completion notifications - eg. current SCSI -midlayer doesn't preserve completion order during error handling. - -* Requeueing order. Low-level drivers are free to requeue any request -after they removed it from the request queue with -blkdev_dequeue_request(). As barrier sequence should be kept in order -when requeued, generic elevator code takes care of putting requests in -order around barrier. See blk_ordered_req_seq() and -ELEVATOR_INSERT_REQUEUE handling in __elv_add_request() for details. - -Note that block drivers must not requeue preceding requests while -completing latter requests in an ordered sequence. Currently, no -error checking is done against this. - -* Error handling. Currently, block layer will report error to upper -layer if any of requests in an ordered sequence fails. Unfortunately, -this doesn't seem to be enough. Look at the following request flow. -QUEUE_ORDERED_TAG_FLUSH is in use. - - [0] [1] [2] [3] [pre] [barrier] [post] < [4] [5] [6] ... > - still in elevator - -Let's say request [2], [3] are write requests to update file system -metadata (journal or whatever) and [barrier] is used to mark that -those updates are valid. Consider the following sequence. - - i. Requests [0] ~ [post] leaves the request queue and enters - low-level driver. - ii. After a while, unfortunately, something goes wrong and the - drive fails [2]. Note that any of [0], [1] and [3] could have - completed by this time, but [pre] couldn't have been finished - as the drive must process it in order and it failed before - processing that command. - iii. Error handling kicks in and determines that the error is - unrecoverable and fails [2], and resumes operation. - iv. [pre] [barrier] [post] gets processed. - v. *BOOM* power fails - -The problem here is that the barrier request is *supposed* to indicate -that filesystem update requests [2] and [3] made it safely to the -physical medium and, if the machine crashes after the barrier is -written, filesystem recovery code can depend on that. Sadly, that -isn't true in this case anymore. IOW, the success of a I/O barrier -should also be dependent on success of some of the preceding requests, -where only upper layer (filesystem) knows what 'some' is. - -This can be solved by implementing a way to tell the block layer which -requests affect the success of the following barrier request and -making lower lever drivers to resume operation on error only after -block layer tells it to do so. - -As the probability of this happening is very low and the drive should -be faulty, implementing the fix is probably an overkill. But, still, -it's there. - -* In previous drafts of barrier implementation, there was fallback -mechanism such that, if FUA or ordered TAG fails, less fancy ordered -mode can be selected and the failed barrier request is retried -automatically. The rationale for this feature was that as FUA is -pretty new in ATA world and ordered tag was never used widely, there -could be devices which report to support those features but choke when -actually given such requests. - - This was removed for two reasons 1. it's an overkill 2. it's -impossible to implement properly when TAG ordering is used as low -level drivers resume after an error automatically. If it's ever -needed adding it back and modifying low level drivers accordingly -shouldn't be difficult. diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt new file mode 100644 index 000000000000..83407d36630a --- /dev/null +++ b/Documentation/block/writeback_cache_control.txt @@ -0,0 +1,86 @@ + +Explicit volatile write back cache control +===================================== + +Introduction +------------ + +Many storage devices, especially in the consumer market, come with volatile +write back caches. That means the devices signal I/O completion to the +operating system before data actually has hit the non-volatile storage. This +behavior obviously speeds up various workloads, but it means the operating +system needs to force data out to the non-volatile storage when it performs +a data integrity operation like fsync, sync or an unmount. + +The Linux block layer provides two simple mechanisms that let filesystems +control the caching behavior of the storage device. These mechanisms are +a forced cache flush, and the Force Unit Access (FUA) flag for requests. + + +Explicit cache flushes +---------------------- + +The REQ_FLUSH flag can be OR ed into the r/w flags of a bio submitted from +the filesystem and will make sure the volatile cache of the storage device +has been flushed before the actual I/O operation is started. This explicitly +guarantees that previously completed write requests are on non-volatile +storage before the flagged bio starts. In addition the REQ_FLUSH flag can be +set on an otherwise empty bio structure, which causes only an explicit cache +flush without any dependent I/O. It is recommend to use +the blkdev_issue_flush() helper for a pure cache flush. + + +Forced Unit Access +----------------- + +The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the +filesystem and will make sure that I/O completion for this request is only +signaled after the data has been committed to non-volatile storage. + + +Implementation details for filesystems +-------------------------------------- + +Filesystems can simply set the REQ_FLUSH and REQ_FUA bits and do not have to +worry if the underlying devices need any explicit cache flushing and how +the Forced Unit Access is implemented. The REQ_FLUSH and REQ_FUA flags +may both be set on a single bio. + + +Implementation details for make_request_fn based block drivers +-------------------------------------------------------------- + +These drivers will always see the REQ_FLUSH and REQ_FUA bits as they sit +directly below the submit_bio interface. For remapping drivers the REQ_FUA +bits need to be propagated to underlying devices, and a global flush needs +to be implemented for bios with the REQ_FLUSH bit set. For real device +drivers that do not have a volatile cache the REQ_FLUSH and REQ_FUA bits +on non-empty bios can simply be ignored, and REQ_FLUSH requests without +data can be completed successfully without doing any work. Drivers for +devices with volatile caches need to implement the support for these +flags themselves without any help from the block layer. + + +Implementation details for request_fn based block drivers +-------------------------------------------------------------- + +For devices that do not support volatile write caches there is no driver +support required, the block layer completes empty REQ_FLUSH requests before +entering the driver and strips off the REQ_FLUSH and REQ_FUA bits from +requests that have a payload. For devices with volatile write caches the +driver needs to tell the block layer that it supports flushing caches by +doing: + + blk_queue_flush(sdkp->disk->queue, REQ_FLUSH); + +and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that +REQ_FLUSH requests with a payload are automatically turned into a sequence +of an empty REQ_FLUSH request followed by the actual write by the block +layer. For devices that also support the FUA bit the block layer needs +to be told to pass through the REQ_FUA bit using: + + blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA); + +and the driver must handle write requests that have the REQ_FUA bit set +in prep_fn/request_fn. If the FUA bit is not natively supported the block +layer turns it into an empty REQ_FLUSH request after the actual write. diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 6919d62591d9..d6da611f8f63 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -8,12 +8,17 @@ both at leaf nodes as well as at intermediate nodes in a storage hierarchy. Plan is to use the same cgroup based management interface for blkio controller and based on user options switch IO policies in the background. -In the first phase, this patchset implements proportional weight time based -division of disk policy. It is implemented in CFQ. Hence this policy takes -effect only on leaf nodes when CFQ is being used. +Currently two IO control policies are implemented. First one is proportional +weight time based division of disk policy. It is implemented in CFQ. Hence +this policy takes effect only on leaf nodes when CFQ is being used. The second +one is throttling policy which can be used to specify upper IO rate limits +on devices. This policy is implemented in generic block layer and can be +used on leaf nodes as well as higher level logical devices like device mapper. HOWTO ===== +Proportional Weight division of bandwidth +----------------------------------------- You can do a very simple testing of running two dd threads in two different cgroups. Here is what you can do. @@ -55,6 +60,35 @@ cgroups. Here is what you can do. group dispatched to the disk. We provide fairness in terms of disk time, so ideally io.disk_time of cgroups should be in proportion to the weight. +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller + mount -t cgroup -o blkio none /cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is "<major>:<minor> <byes_per_second>". + + echo "8:16 1048576" > /cgroup/blkio/blkio.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16. + +- Run dd to read a file and see if rate is throttled to 1MB/s or not. + + # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + # iflag=direct + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.write_bps_device file. + Various user visible config options =================================== CONFIG_BLK_CGROUP @@ -68,8 +102,13 @@ CONFIG_CFQ_GROUP_IOSCHED - Enables group scheduling in CFQ. Currently only 1 level of group creation is allowed. +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + Details of cgroup files ======================= +Proportional weight policy files +-------------------------------- - blkio.weight - Specifies per cgroup weight. This is default weight of the group on all the devices until and unless overridden by per device rule. @@ -210,6 +249,67 @@ Details of cgroup files and minor number of the device and third field specifies the number of times a group was dequeued from a particular device. +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per deivce. Following is + the format. + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per deivce. Following is + the format. + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per deivce. Following is + the format. + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in io per second. Rules are per deivce. Following is + the format. + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjectd to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) completed to/from the disk by the group (as + seen by throttling policy). These are further divided by the type + of operation - read or write, sync or async. First two fields specify + the major and minor number of the device, third field specifies the + operation type and the fourth field specifies the number of IOs. + + blkio.io_serviced does accounting as seen by CFQ and counts are in + number of requests (struct request). On the other hand, + blkio.throttle.io_serviced counts number of IO in terms of number + of bios as seen by throttling policy. These bios can later be + merged by elevator and total number of requests completed can be + lesser. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + + These numbers should roughly be same as blkio.io_service_bytes as + updated by CFQ. The difference between two is that + blkio.io_service_bytes will not be updated if CFQ is not operating + on request queue. + +Common files among various policies +----------------------------------- - blkio.reset_stats - Writing an int to this file will result in resetting all the stats for that cgroup. diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 8a6a8c6d4980..dc73bc54cc4e 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1640,15 +1640,6 @@ static void blk_request(struct virtqueue *vq) off = out->sector * 512; /* - * The block device implements "barriers", where the Guest indicates - * that it wants all previous writes to occur before this write. We - * don't have a way of asking our kernel to do a barrier, so we just - * synchronize all the data in the file. Pretty poor, no? - */ - if (out->type & VIRTIO_BLK_T_BARRIER) - fdatasync(vblk->fd); - - /* * In general the virtio block driver is allowed to try SCSI commands. * It'd be nice if we supported eject, for example, but we don't. */ @@ -1680,6 +1671,13 @@ static void blk_request(struct virtqueue *vq) /* Die, bad Guest, die. */ errx(1, "Write past end %llu+%u", off, ret); } + + wlen = sizeof(*in); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else if (out->type & VIRTIO_BLK_T_FLUSH) { + /* Flush */ + ret = fdatasync(vblk->fd); + verbose("FLUSH fdatasync: %i\n", ret); wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { @@ -1703,15 +1701,6 @@ static void blk_request(struct virtqueue *vq) } } - /* - * OK, so we noted that it was pretty poor to use an fdatasync as a - * barrier. But Christoph Hellwig points out that we need a sync - * *afterwards* as well: "Barriers specify no reordering to the front - * or the back." And Jens Axboe confirmed it, so here we are: - */ - if (out->type & VIRTIO_BLK_T_BARRIER) - fdatasync(vblk->fd); - /* Finished that request. */ add_used(vq, head, wlen); } @@ -1736,8 +1725,8 @@ static void setup_block_file(const char *filename) vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); - /* We support barriers. */ - add_feature(dev, VIRTIO_BLK_F_BARRIER); + /* We support FLUSH. */ + add_feature(dev, VIRTIO_BLK_F_FLUSH); /* Tell Guest how many sectors this device has. */ conf.capacity = cpu_to_le64(vblk->len / 512); diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee1..6c9213ef15a1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 0bb499a739cd..0fec4b3fab51 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,12 +3,13 @@ # obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ - blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c deleted file mode 100644 index f0faefca032f..000000000000 --- a/block/blk-barrier.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Functions related to barrier IO handling - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/gfp.h> - -#include "blk.h" - -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. - * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered) -{ - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - - return 0; -} -EXPORT_SYMBOL(blk_queue_ordered); - -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (rq->cmd_type != REQ_TYPE_FS) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) -{ - struct request *rq; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return false; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - __blk_end_request_all(rq, q->orderr); - return true; -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, unsigned which) -{ - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_DO_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - - blk_rq_init(q, rq); - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; - rq->rq_disk = q->orig_bar_rq->rq_disk; - rq->end_io = end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static inline bool start_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - unsigned skip = 0; - - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) { - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - /* - * Empty barrier on a write-through device w/ ordered - * tag has no command to issue and without any command - * to issue, ordering by tag can't be used. Drain - * instead. - */ - if ((q->ordered & QUEUE_ORDERED_BY_TAG) && - !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { - q->ordered &= ~QUEUE_ORDERED_BY_TAG; - q->ordered |= QUEUE_ORDERED_BY_DRAIN; - } - } - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = NULL; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. - */ - if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); - rq = &q->post_flush_rq; - } else - skip |= QUEUE_ORDSEQ_POSTFLUSH; - - if (q->ordered & QUEUE_ORDERED_DO_BAR) { - rq = &q->bar_rq; - - /* initialize proxy request and queue it */ - blk_rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_WRITE; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - } else - skip |= QUEUE_ORDSEQ_BAR; - - if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); - rq = &q->pre_flush_rq; - } else - skip |= QUEUE_ORDSEQ_PREFLUSH; - - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) - rq = NULL; - else - skip |= QUEUE_ORDSEQ_DRAIN; - - *rqp = rq; - - /* - * Complete skipped sequences. If whole sequence is complete, - * return false to tell elevator that this request is gone. - */ - return !blk_ordered_complete_seq(q, skip, 0); -} - -bool blk_do_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - const int is_barrier = rq->cmd_type == REQ_TYPE_FS && - (rq->cmd_flags & REQ_HARDBARRIER); - - if (!q->ordseq) { - if (!is_barrier) - return true; - - if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rqp); - else { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - *rqp = NULL; - return false; - } - } - - /* - * Ordered sequence in progress - */ - - /* Special requests are not subject to ordering rules. */ - if (rq->cmd_type != REQ_TYPE_FS && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return true; - - if (q->ordered & QUEUE_ORDERED_BY_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } - - return true; -} - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) - * @error_sector: error sector - * @flags: BLKDEV_IFL_* flags to control behaviour - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. If WAIT flag is not passed then caller may check only what - * request was pushed in some internal queue for later handling. - */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector, unsigned long flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret = 0; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. Ensure there is a request function before issuing - * the barrier. - */ - if (!q->make_request_fn) - return -ENXIO; - - bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_bdev = bdev; - if (test_bit(BLKDEV_WAIT, &flags)) - bio->bi_private = &wait; - - bio_get(bio); - submit_bio(WRITE_BARRIER, bio); - if (test_bit(BLKDEV_WAIT, &flags)) { - wait_for_completion(&wait); - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - } - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2fef1ef931a0..b1febd0f6d2a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, @@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, list_add(&pn->node, &blkcg->policy_list); } +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + /* Must be called with blkcg->lock held */ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { @@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) /* Must be called with blkcg->lock held */ static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { struct blkio_policy_node *pn; list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev) + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) return pn; } @@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); + } +} + +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); + } +} + +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) { unsigned long flags; @@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; spin_unlock_irqrestore(&blkcg->lock, flags); /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); @@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -#define SHOW_FUNCTION(__VAR) \ -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct blkio_cgroup *blkcg; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - return (u64)blkcg->__VAR; \ -} - -SHOW_FUNCTION(weight); -#undef SHOW_FUNCTION - -static int -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) -{ - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn; - - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; - - blkcg = cgroup_to_blkio_cgroup(cgroup); - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev); - - if (pn) - continue; - - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} - static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct cgroup_map_cb *cb) \ -{ \ - struct blkio_cgroup *blkcg; \ - struct blkio_group *blkg; \ - struct hlist_node *n; \ - uint64_t cgroup_total = 0; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - rcu_read_lock(); \ - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) { \ - spin_lock_irq(&blkg->stats_lock); \ - cgroup_total += blkio_get_stat(blkg, cb, \ - blkg->dev, type); \ - spin_unlock_irq(&blkg->stats_lock); \ - } \ - } \ - if (show_total) \ - cb->fill(cb, "Total", cgroup_total); \ - rcu_read_unlock(); \ - cgroup_unlock(); \ - return 0; \ -} - -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); -SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); -#endif -#undef SHOW_FUNCTION_PER_GROUP - static int blkio_check_dev_num(dev_t dev) { int part = 0; @@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev) } static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn) + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; unsigned long major, minor, temp; int i = 0; dev_t dev; + u64 bps, iops; memset(s, 0, sizeof(s)); @@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf, if (s[1] == NULL) return -EINVAL; - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; - newpn->weight = temp; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + ret = strict_strtoull(s[1], 10, &iops); + if (ret) + return -EINVAL; + + if (iops > THROTL_IOPS_MAX) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = (unsigned int)iops; + break; + } + break; + default: + BUG(); + } return 0; } @@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, { struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); if (pn) - return pn->weight; + return pn->val.weight; else return blkcg->weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } + break; + default: + BUG(); + } + + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight, iops; + u64 bps; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->val.weight ? pn->val.weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; + } + break; + default: + BUG(); + } +} + +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. + */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { int ret = 0; char *buf; struct blkio_policy_node *newpn, *pn; struct blkio_cgroup *blkcg; - struct blkio_group *blkg; int keep_newpn = 0; - struct hlist_node *n; - struct blkio_policy_type *blkiop; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); buf = kstrdup(buffer, GFP_KERNEL); if (!buf) @@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto free_buf; } - ret = blkio_policy_parse_and_set(buf, newpn); + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); if (ret) goto free_newpn; @@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, spin_lock_irq(&blkcg->lock); - pn = blkio_policy_search_node(blkcg, newpn->dev); + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); if (!pn) { - if (newpn->weight != 0) { + if (!blkio_delete_rule_command(newpn)) { blkio_policy_insert_node(blkcg, newpn); keep_newpn = 1; } @@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto update_io_group; } - if (newpn->weight == 0) { - /* weight == 0 means deleteing a specific weight */ + if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } spin_unlock_irq(&blkcg->lock); - pn->weight = newpn->weight; + blkio_update_policy_rule(pn, newpn); update_io_group: - /* update weight for each cfqg */ - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (newpn->dev == blkg->dev) { - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - newpn->weight ? - newpn->weight : - blkcg->weight); - } - } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + blkio_update_policy_node_blkg(blkcg, newpn); free_newpn: if (!keep_newpn) @@ -805,23 +979,256 @@ free_buf: return ret; } -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) { - struct blkio_cgroup *blkcg; - struct blkio_policy_node *pn; + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } + break; + default: + BUG(); + } +} - seq_printf(m, "dev\tweight\n"); +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) +{ + struct blkio_policy_node *pn; - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); } spin_unlock_irq(&blkcg->lock); } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, + bool show_total) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, + type); + spin_unlock_irq(&blkg->stats_lock); + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0); +#endif + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } return 0; } @@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct cftype blkio_files[] = { { .name = "weight_device", - .read_seq_string = blkiocg_weight_device_read, - .write_string = blkiocg_weight_device_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, .max_write_len = 256, }, { .name = "weight", - .read_u64 = blkiocg_weight_read, - .write_u64 = blkiocg_weight_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, }, { .name = "time", - .read_map = blkiocg_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, }, { .name = "sectors", - .read_map = blkiocg_sectors_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_bytes", - .read_map = blkiocg_io_service_bytes_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, }, { .name = "io_serviced", - .read_map = blkiocg_io_serviced_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_time", - .read_map = blkiocg_io_service_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_wait_time", - .read_map = blkiocg_io_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_merged", - .read_map = blkiocg_io_merged_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, }, { .name = "io_queued", - .read_map = blkiocg_io_queued_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, }, { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_map = blkiocg_avg_queue_size_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, }, { .name = "group_wait_time", - .read_map = blkiocg_group_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "idle_time", - .read_map = blkiocg_idle_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, }, { .name = "empty_time", - .read_map = blkiocg_empty_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, }, { .name = "dequeue", - .read_map = blkiocg_dequeue_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, }, #endif }; @@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* * This blkio_group is being unlinked as associated cgroup is * going away. Let all the IO controlling policies know about - * this event. Currently this is static call to one io - * controlling policy. Once we have more policies in place, we - * need some dynamic registration of callback function. + * this event. */ spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; blkiop->ops.blkio_unlink_group_fn(key, blkg); + } spin_unlock(&blkio_list_lock); } while (1); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..ea4861bdd549 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,14 @@ #include <linux/cgroup.h> +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ +}; + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP @@ -65,6 +73,35 @@ enum blkg_state_flags { BLKG_empty, }; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -112,6 +149,8 @@ struct blkio_group { char path[128]; /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -121,24 +160,60 @@ struct blkio_group { struct blkio_policy_node { struct list_head node; dev_t dev; - unsigned int weight; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + unsigned int iops; + } val; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; }; struct blkio_policy_type { struct list_head list; struct blkio_policy_ops ops; + enum blkio_policy_id plid; }; /* Blkio controller policy registration */ @@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev); + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -234,7 +310,8 @@ static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 32a1c123dfb3..2e7cef876e71 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, { struct request_queue *q = rq->q; - if (&q->bar_rq != rq) { + if (&q->flush_rq != rq) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) @@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (bio->bi_size == 0) bio_endio(bio, error); } else { - /* - * Okay, this is the barrier request in progress, just - * record the error; + * Okay, this is the sequenced flush request in + * progress, just record the error; */ - if (error && !q->orderr) - q->orderr = error; + if (error && !q->flush_err) + q->flush_err = error; } } @@ -382,6 +381,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -459,6 +459,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -515,11 +517,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); + INIT_LIST_HEAD(&q->pending_flushes); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1037,22 +1045,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_insert_request); -/* - * add-request adds a request to the linked list. - * queue lock is held and interrupts disabled, as we muck with the - * request queue list. - */ -static inline void add_request(struct request_queue *q, struct request *req) -{ - drive_stat_acct(req, 1); - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); -} - static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { @@ -1201,13 +1193,16 @@ static int __make_request(struct request_queue *q, struct bio *bio) const bool sync = !!(bio->bi_rw & REQ_SYNC); const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; + int where = ELEVATOR_INSERT_SORT; int rw_flags; - if ((bio->bi_rw & REQ_HARDBARRIER) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { + /* REQ_HARDBARRIER is no more */ + if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, + "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { bio_endio(bio, -EOPNOTSUPP); return 0; } + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1217,7 +1212,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + where = ELEVATOR_INSERT_FRONT; + goto get_rq; + } + + if (elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1314,7 +1314,10 @@ get_rq: req->cpu = blk_cpu_to_group(smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); - add_request(q, req); + + /* insert the request into the elevator */ + drive_stat_acct(req, 1); + __elv_add_request(q, req, where, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); @@ -1514,6 +1517,19 @@ static inline void __generic_make_request(struct bio *bio) if (bio_check_eod(bio, nr_sectors)) goto end_io; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; + goto end_io; + } + } + if ((bio->bi_rw & REQ_DISCARD) && (!blk_queue_discard(q) || ((bio->bi_rw & REQ_SECURE) && @@ -1522,6 +1538,15 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } + blk_throtl_bio(q, &bio); + + /* + * If bio = NULL, bio has been throttled and will be submitted + * later. + */ + if (!bio) + break; + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -1612,11 +1637,12 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); + bdevname(bio->bi_bdev, b), + count); } } @@ -1768,11 +1794,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) static void blk_account_io_done(struct request *req) { /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->bar_rq) { + if (blk_do_io_stat(req) && req != &req->q->flush_rq) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2497,9 +2523,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); - if (src->cmd_flags & REQ_DISCARD) - dst->cmd_flags |= REQ_DISCARD; + dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2579,6 +2603,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..cf1456a02acd 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, DECLARE_COMPLETION_ONSTACK(wait); char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; + unsigned long hang_check; /* * we need an extra reference to the request, so we can look at @@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion(&wait); if (rq->errors) err = -EIO; diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 000000000000..54b123d6563e --- /dev/null +++ b/block/blk-flush.c @@ -0,0 +1,262 @@ +/* + * Functions to sequence FLUSH and FUA writes. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/gfp.h> + +#include "blk.h" + +/* FLUSH/FUA sequences */ +enum { + QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ + QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ + QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ + QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ + QUEUE_FSEQ_DONE = (1 << 4), +}; + +static struct request *queue_next_fseq(struct request_queue *q); + +unsigned blk_flush_cur_seq(struct request_queue *q) +{ + if (!q->flush_seq) + return 0; + return 1 << ffz(q->flush_seq); +} + +static struct request *blk_flush_complete_seq(struct request_queue *q, + unsigned seq, int error) +{ + struct request *next_rq = NULL; + + if (error && !q->flush_err) + q->flush_err = error; + + BUG_ON(q->flush_seq & seq); + q->flush_seq |= seq; + + if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { + /* not complete yet, queue the next flush sequence */ + next_rq = queue_next_fseq(q); + } else { + /* complete this flush request */ + __blk_end_request_all(q->orig_flush_rq, q->flush_err); + q->orig_flush_rq = NULL; + q->flush_seq = 0; + + /* dispatch the next flush if there's one */ + if (!list_empty(&q->pending_flushes)) { + next_rq = list_entry_rq(q->pending_flushes.next); + list_move(&next_rq->queuelist, &q->queue_head); + } + } + return next_rq; +} + +static void blk_flush_complete_seq_end_io(struct request_queue *q, + unsigned seq, int error) +{ + bool was_empty = elv_queue_empty(q); + struct request *next_rq; + + next_rq = blk_flush_complete_seq(q, seq, error); + + /* + * Moving a request silently to empty queue_head may stall the + * queue. Kick the queue in those cases. + */ + if (was_empty && next_rq) + __blk_run_queue(q); +} + +static void pre_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); +} + +static void flush_data_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); +} + +static void post_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); +} + +static void init_flush_request(struct request *rq, struct gendisk *disk) +{ + rq->cmd_type = REQ_TYPE_FS; + rq->cmd_flags = WRITE_FLUSH; + rq->rq_disk = disk; +} + +static struct request *queue_next_fseq(struct request_queue *q) +{ + struct request *orig_rq = q->orig_flush_rq; + struct request *rq = &q->flush_rq; + + blk_rq_init(q, rq); + + switch (blk_flush_cur_seq(q)) { + case QUEUE_FSEQ_PREFLUSH: + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = pre_flush_end_io; + break; + case QUEUE_FSEQ_DATA: + init_request_from_bio(rq, orig_rq->bio); + /* + * orig_rq->rq_disk may be different from + * bio->bi_bdev->bd_disk if orig_rq got here through + * remapping drivers. Make sure rq->rq_disk points + * to the same one as orig_rq. + */ + rq->rq_disk = orig_rq->rq_disk; + rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); + rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); + rq->end_io = flush_data_end_io; + break; + case QUEUE_FSEQ_POSTFLUSH: + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = post_flush_end_io; + break; + default: + BUG(); + } + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + return rq; +} + +struct request *blk_do_flush(struct request_queue *q, struct request *rq) +{ + unsigned int fflags = q->flush_flags; /* may change, cache it */ + bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; + bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); + bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); + unsigned skip = 0; + + /* + * Special case. If there's data but flush is not necessary, + * the request can be issued directly. + * + * Flush w/o data should be able to be issued directly too but + * currently some drivers assume that rq->bio contains + * non-zero data if it isn't NULL and empty FLUSH requests + * getting here usually have bio's without data. + */ + if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; + return rq; + } + + /* + * Sequenced flushes can't be processed in parallel. If + * another one is already in progress, queue for later + * processing. + */ + if (q->flush_seq) { + list_move_tail(&rq->queuelist, &q->pending_flushes); + return NULL; + } + + /* + * Start a new flush sequence + */ + q->flush_err = 0; + q->flush_seq |= QUEUE_FSEQ_STARTED; + + /* adjust FLUSH/FUA of the original request and stash it away */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; + blk_dequeue_request(rq); + q->orig_flush_rq = rq; + + /* skip unneded sequences and return the first one */ + if (!do_preflush) + skip |= QUEUE_FSEQ_PREFLUSH; + if (!blk_rq_sectors(rq)) + skip |= QUEUE_FSEQ_DATA; + if (!do_postflush) + skip |= QUEUE_FSEQ_POSTFLUSH; + return blk_flush_complete_seq(q, skip, 0); +} + +static void bio_end_flush(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) + * @error_sector: error sector + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. + */ +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret = 0; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + /* + * some block devices may not have their queue correctly set up here + * (e.g. loop device without a backing file) and so issuing a flush + * here will panic. Ensure there is a request function before issuing + * the flush. + */ + if (!q->make_request_fn) + return -ENXIO; + + bio = bio_alloc(gfp_mask, 0); + bio->bi_end_io = bio_end_flush; + bio->bi_bdev = bdev; + bio->bi_private = &wait; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..54bcba6c02a7 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep; /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a - * scatterlist corresponding to the integrity metadata in a request. + * scatterlist corresponding to the integrity metadata in a bio. */ -int blk_rq_count_integrity_sg(struct request *rq) +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + unsigned int segments = 0; + unsigned int seg_size = 0; + unsigned int i = 0; - ivprv = NULL; - segments = 0; + bio_for_each_integrity_vec(iv, bio, i) { - rq_for_each_integrity_segment(iv, rq, iter) { + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (seg_size + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + + seg_size += iv->bv_len; + } else { +new_segment: segments++; + seg_size = iv->bv_len; + } ivprv = iv; } @@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - struct scatterlist *sg; - unsigned int segments; - - ivprv = NULL; - sg = NULL; - segments = 0; + struct bio_vec *iv, *ivprv = NULL; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + unsigned int i = 0; - rq_for_each_integrity_segment(iv, rq, iter) { + bio_for_each_integrity_vec(iv, bio, i) { if (ivprv) { if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (sg->length + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + sg->length += iv->bv_len; } else { new_segment: @@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return -1; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return -1; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_rq); + +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return -1; + + req->nr_integrity_segments += nr_integrity_segs; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_bio); + struct integrity_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_integrity *, char *); @@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-lib.c b/block/blk-lib.c index c392029a104e..1a320d2406b0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = flags & BLKDEV_IFL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; + int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; struct bio *bio; int ret = 0; @@ -62,10 +61,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, max_discard_sectors &= ~(disc_sects - 1); } - if (flags & BLKDEV_IFL_SECURE) { + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; - type |= DISCARD_SECURE; + type |= REQ_SECURE; } while (nr_sects && !ret) { @@ -78,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &wait; + bio->bi_private = &wait; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -93,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_get(bio); submit_bio(type, bio); - if (flags & BLKDEV_IFL_WAIT) - wait_for_completion(&wait); + wait_for_completion(&wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -140,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Generate and issue number of bios with zerofiled pages. @@ -149,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err) */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; @@ -162,12 +158,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.wait = &wait; bb.end_io = NULL; - if (flags & BLKDEV_IFL_BARRIER) { - /* issue async barrier before the data */ - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); - if (ret) - return ret; - } submit: ret = 0; while (nr_sects != 0) { @@ -181,8 +171,7 @@ submit: bio->bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = bio_batch_end_io; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &bb; + bio->bi_private = &bb; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); @@ -199,18 +188,10 @@ submit: issued++; submit_bio(WRITE, bio); } - /* - * When all data bios are in flight. Send final barrier if requeted. - */ - if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, - flags & BLKDEV_IFL_WAIT); - - if (flags & BLKDEV_IFL_WAIT) - /* Wait for bios in-flight */ - while ( issued != atomic_read(&bb.done)) - wait_for_completion(&wait); + /* Wait for bios in-flight */ + while (issued != atomic_read(&bb.done)) + wait_for_completion(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) /* One of bios in the batch was completed with error.*/ diff --git a/block/blk-map.c b/block/blk-map.c index ade0a08c9099..d4a586d8691e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, * direct dma. else, set up kernel bounce buffers */ uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, ubuf, len) && !map_data) + if (blk_rq_aligned(q, uaddr, len) && !map_data) bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); else bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); @@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; int do_copy = 0; struct bio *bio; int ret; @@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else diff --git a/block/blk-merge.c b/block/blk-merge.c index eafc94f68d79..77b7c26df6b5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + goto no_merge; + + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + goto no_merge; /* * This will form the start of a new hw segment. Bump both @@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, */ req->nr_phys_segments += nr_phys_segs; return 1; + +no_merge: + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; + return 0; } int ll_back_merge_fn(struct request_queue *q, struct request *req, @@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -384,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..701859fb9647 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else if (b_pfn < blk_max_low_pfn) dma = 1; @@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; @@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) @@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->seg_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); @@ -794,6 +792,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +/** + * blk_queue_flush - configure queue's cache flush capability + * @q: the request queue for the device + * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * + * Tell block layer cache flush capability of @q. If it supports + * flushing, REQ_FLUSH should be set. If it supports bypassing + * write cache for individual writes, REQ_FUA should be set. + */ +void blk_queue_flush(struct request_queue *q, unsigned int flush) +{ + WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); + + if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) + flush &= ~REQ_FUA; + + q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); +} +EXPORT_SYMBOL_GPL(blk_queue_flush); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0749b89c6885..da8a8a40cd4c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) @@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .show = queue_max_integrity_segments_show, +}; + static struct queue_sysfs_entry queue_max_segment_size_entry = { .attr = {.name = "max_segment_size", .mode = S_IRUGO }, .show = queue_max_segment_size_show, @@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = { &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..56ad4531b412 --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,1123 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/blktrace_api.h> +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* IOPS limits */ + unsigned int iops[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + /* Some throttle limits got updated for the group */ + bool limits_changed; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; + + atomic_t limits_changed; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. + */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); +} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; + + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim && !io_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + if (tg->io_disp[rw] + 1 <= io_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes, tmp; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + bytes_allowed = tmp; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + if (wait) + *wait = jiffy_wait; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + throtl_process_limit_change(td); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. + */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. + */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, + }, + .plid = BLKIO_POLICY_THROTL, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. + */ + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + + /* If there are other groups */ + if (td->nr_undestroyed_grps > 0) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. + */ + throtl_shutdown_timer_wq(q); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); diff --git a/block/blk.h b/block/blk.h index d6b911ac002c..2db8f32838e7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +struct request *blk_do_flush(struct request_queue *q, struct request *rq); + static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; @@ -58,7 +60,11 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (blk_do_ordered(q, &rq)) + if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || + rq == &q->flush_rq) + return rq; + rq = blk_do_flush(q, rq); + if (rq) return rq; } @@ -132,14 +138,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - static inline int blk_cpu_to_group(int cpu) { int group = NR_CPUS; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9eba291eb6fd..28a54b0d5bb0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -221,7 +221,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -977,8 +976,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } @@ -2180,7 +2179,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3177,7 +3175,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3494,17 +3494,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } @@ -4090,6 +4080,7 @@ static struct blkio_policy_type blkio_policy_cfq = { .blkio_unlink_group_fn = cfq_unlink_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, + .plid = BLKIO_POLICY_PROP, }; #else static struct blkio_policy_type blkio_policy_cfq; diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..54a6d90f8e8c 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev); + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); } static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) diff --git a/block/elevator.c b/block/elevator.c index 4e11559aa2b0..282e8308f7e2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q) void elv_insert(struct request_queue *q, struct request *rq, int where) { - struct list_head *pos; - unsigned ordseq; int unplug_it = 1; trace_block_rq_insert(q, rq); @@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) rq->q = q; switch (where) { + case ELEVATOR_INSERT_REQUEUE: + /* + * Most requeues happen because of a busy condition, + * don't force unplug of the queue for that case. + * Clear unplug_it and fall through. + */ + unplug_it = 0; + case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); break; @@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; - case ELEVATOR_INSERT_REQUEUE: - /* - * If ordered flush isn't in progress, we do front - * insertion; otherwise, requests should be requeued - * in ordseq order. - */ - rq->cmd_flags |= REQ_SOFTBARRIER; - - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - */ - unplug_it = 0; - - if (q->ordseq == 0) { - list_add(&rq->queuelist, &q->queue_head); - break; - } - - ordseq = blk_ordered_req_seq(rq); - - list_for_each(pos, &q->queue_head) { - struct request *pos_rq = list_entry_rq(pos); - if (ordseq <= blk_ordered_req_seq(pos_rq)) - break; - } - - list_add_tail(&rq->queuelist, pos); - break; - default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); @@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (q->ordcolor) - rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { - /* - * toggle ordered color - */ - if (rq->cmd_flags & REQ_HARDBARRIER) - q->ordcolor ^= 1; - - /* - * barriers implicitly indicate back insertion - */ - if (where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - /* - * this request is scheduling boundary, update - * end_sector - */ + /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { q->end_sector = rq_end_sector(rq); @@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq) e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } - - /* - * Check if the queue is waiting for fs requests to be - * drained for flush sequence. - */ - if (unlikely(q->ordseq)) { - struct request *next = NULL; - - if (!list_empty(&q->queue_head)) - next = list_entry_rq(q->queue_head.next); - - if (!queue_in_flight(q) && - blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && - (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { - blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - __blk_run_queue(q); - } - } } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..7923e720ddf5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -642,6 +644,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been @@ -660,10 +663,14 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? "" : " ", + uuid[0] = 0; + if (part->info) + part_unpack_uuid(part->info->uuid, uuid); + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + disk_name(disk, part->partno, name_buf), uuid); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) @@ -1004,6 +1011,7 @@ static void disk_release(struct device *dev) kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); kfree(disk); } struct class block_class = { diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..d724ceb1d465 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user /* all seems OK */ part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); return IS_ERR(part) ? PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: @@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev) static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) { - unsigned long flags = BLKDEV_IFL_WAIT; + unsigned long flags = 0; if (start & 511) return -EINVAL; @@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; if (secure) - flags |= BLKDEV_IFL_SECURE; + flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 76f114f0bba3..327ed27dfe1a 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -114,8 +114,6 @@ static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it does module_param(fd_def_df0, ulong, 0); MODULE_LICENSE("GPL"); -static struct request_queue *floppy_queue; - /* * Macros */ @@ -164,6 +162,7 @@ static volatile int selected = -1; /* currently selected drive */ static int writepending; static int writefromint; static char *raw_buf; +static int fdc_queue; static DEFINE_SPINLOCK(amiflop_lock); @@ -1334,6 +1333,42 @@ static int get_track(int drive, int track) return -1; } +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int cnt = FD_MAX_UNITS; + struct request *rq; + + /* Find next queue we can dispatch from */ + fdc_queue = fdc_queue + 1; + if (fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + + for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) { + + if (unit[fdc_queue].type->code == FD_NODRIVE) { + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + continue; + } + + q = unit[fdc_queue].gendisk->queue; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + } + + return rq; +} + static void redo_fd_request(void) { struct request *rq; @@ -1345,7 +1380,7 @@ static void redo_fd_request(void) int err; next_req: - rq = blk_fetch_request(floppy_queue); + rq = set_next_request(); if (!rq) { /* Nothing left to do */ return; @@ -1682,6 +1717,13 @@ static int __init fd_probe_drives(void) continue; } unit[drive].gendisk = disk; + + disk->queue = blk_init_queue(do_fd_request, &amiflop_lock); + if (!disk->queue) { + unit[drive].type->code = FD_NODRIVE; + continue; + } + drives++; if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { printk("no mem for "); @@ -1695,7 +1737,6 @@ static int __init fd_probe_drives(void) disk->fops = &floppy_fops; sprintf(disk->disk_name, "fd%d", drive); disk->private_data = &unit[drive]; - disk->queue = floppy_queue; set_capacity(disk, 880*2); add_disk(disk); } @@ -1743,11 +1784,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) goto out_irq2; } - ret = -ENOMEM; - floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock); - if (!floppy_queue) - goto out_queue; - ret = -ENODEV; if (fd_probe_drives() < 1) /* No usable drives */ goto out_probe; @@ -1791,8 +1827,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) return 0; out_probe: - blk_cleanup_queue(floppy_queue); -out_queue: free_irq(IRQ_AMIGA_CIAA_TB, NULL); out_irq2: free_irq(IRQ_AMIGA_DSKBLK, NULL); @@ -1810,9 +1844,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) for( i = 0; i < FD_MAX_UNITS; i++) { if (unit[i].type->code != FD_NODRIVE) { + struct request_queue *q = unit[i].gendisk->queue; del_gendisk(unit[i].gendisk); put_disk(unit[i].gendisk); kfree(unit[i].trackbuf); + if (q) + blk_cleanup_queue(q); } } blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); @@ -1820,7 +1857,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) free_irq(IRQ_AMIGA_DSKBLK, NULL); custom.dmacon = DMAF_DISK; /* disable DMA */ amiga_chip_free(raw_buf); - blk_cleanup_queue(floppy_queue); unregister_blkdev(FLOPPY_MAJOR, "fd"); } #endif diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index aceb96476524..0f4eec442e5d 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -79,8 +79,8 @@ #undef DEBUG -static struct request_queue *floppy_queue; static struct request *fd_request; +static int fdc_queue; /* Disk types: DD, HD, ED */ static struct atari_disk_type { @@ -1391,6 +1391,29 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + struct request *rq; + + do { + q = unit[fdc_queue].disk->queue; + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + } while (fdc_queue != old_pos); + + return rq; +} + static void redo_fd_request(void) { @@ -1405,7 +1428,7 @@ static void redo_fd_request(void) repeat: if (!fd_request) { - fd_request = blk_fetch_request(floppy_queue); + fd_request = set_next_request(); if (!fd_request) goto the_end; } @@ -1932,10 +1955,6 @@ static int __init atari_floppy_init (void) PhysTrackBuffer = virt_to_phys(TrackBuffer); BufferDrive = BufferSide = BufferTrack = -1; - floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock); - if (!floppy_queue) - goto Enomem; - for (i = 0; i < FD_MAX_UNITS; i++) { unit[i].track = -1; unit[i].flags = 0; @@ -1944,7 +1963,10 @@ static int __init atari_floppy_init (void) sprintf(unit[i].disk->disk_name, "fd%d", i); unit[i].disk->fops = &floppy_fops; unit[i].disk->private_data = &unit[i]; - unit[i].disk->queue = floppy_queue; + unit[i].disk->queue = blk_init_queue(do_fd_request, + &ataflop_lock); + if (!unit[i].disk->queue) + goto Enomem; set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); add_disk(unit[i].disk); } @@ -1959,10 +1981,14 @@ static int __init atari_floppy_init (void) return 0; Enomem: - while (i--) + while (i--) { + struct request_queue *q = unit[i].disk->queue; + put_disk(unit[i].disk); - if (floppy_queue) - blk_cleanup_queue(floppy_queue); + if (q) + blk_cleanup_queue(q); + } + unregister_blkdev(FLOPPY_MAJOR, "fd"); return -ENOMEM; } @@ -2011,12 +2037,14 @@ static void __exit atari_floppy_exit(void) int i; blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); for (i = 0; i < FD_MAX_UNITS; i++) { + struct request_queue *q = unit[i].disk->queue; + del_gendisk(unit[i].disk); put_disk(unit[i].disk); + blk_cleanup_queue(q); } unregister_blkdev(FLOPPY_MAJOR, "fd"); - blk_cleanup_queue(floppy_queue); del_timer_sync(&fd_timer); atari_stram_free( DMABuffer ); } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 1c7f63792ff8..fa33f97722ba 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -482,7 +482,6 @@ static struct brd_device *brd_alloc(int i) if (!brd->brd_queue) goto out_free_dev; blk_queue_make_request(brd->brd_queue, brd_make_request); - blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG); blk_queue_max_hw_sectors(brd->brd_queue, 1024); blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 5e4fadcdece9..39d62eb93bde 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1232,470 +1232,452 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c) c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) (void)check_for_unit_attention(h, c); } -/* - * ioctl - */ -static int cciss_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) + +static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) { - struct gendisk *disk = bdev->bd_disk; - ctlr_info_t *h = get_host(disk); - drive_info_struct *drv = get_drv(disk); - void __user *argp = (void __user *)arg; + cciss_pci_info_struct pciinfo; - dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", - cmd, arg); - switch (cmd) { - case CCISS_GETPCIINFO: - { - cciss_pci_info_struct pciinfo; - - if (!arg) - return -EINVAL; - pciinfo.domain = pci_domain_nr(h->pdev->bus); - pciinfo.bus = h->pdev->bus->number; - pciinfo.dev_fn = h->pdev->devfn; - pciinfo.board_id = h->board_id; - if (copy_to_user - (argp, &pciinfo, sizeof(cciss_pci_info_struct))) - return -EFAULT; - return 0; - } - case CCISS_GETINTINFO: - { - cciss_coalint_struct intinfo; - if (!arg) - return -EINVAL; - intinfo.delay = - readl(&h->cfgtable->HostWrite.CoalIntDelay); - intinfo.count = - readl(&h->cfgtable->HostWrite.CoalIntCount); - if (copy_to_user - (argp, &intinfo, sizeof(cciss_coalint_struct))) - return -EFAULT; - return 0; - } - case CCISS_SETINTINFO: - { - cciss_coalint_struct intinfo; - unsigned long flags; - int i; - - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user - (&intinfo, argp, sizeof(cciss_coalint_struct))) - return -EFAULT; - if ((intinfo.delay == 0) && (intinfo.count == 0)) - return -EINVAL; - spin_lock_irqsave(&h->lock, flags); - /* Update the field, and then ring the doorbell */ - writel(intinfo.delay, - &(h->cfgtable->HostWrite.CoalIntDelay)); - writel(intinfo.count, - &(h->cfgtable->HostWrite.CoalIntCount)); - writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); - - for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { - if (!(readl(h->vaddr + SA5_DOORBELL) - & CFGTBL_ChangeReq)) - break; - /* delay and try again */ - udelay(1000); - } - spin_unlock_irqrestore(&h->lock, flags); - if (i >= MAX_IOCTL_CONFIG_WAIT) - return -EAGAIN; - return 0; - } - case CCISS_GETNODENAME: - { - NodeName_type NodeName; - int i; - - if (!arg) - return -EINVAL; - for (i = 0; i < 16; i++) - NodeName[i] = - readb(&h->cfgtable->ServerName[i]); - if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) - return -EFAULT; - return 0; - } - case CCISS_SETNODENAME: - { - NodeName_type NodeName; - unsigned long flags; - int i; + if (!argp) + return -EINVAL; + pciinfo.domain = pci_domain_nr(h->pdev->bus); + pciinfo.bus = h->pdev->bus->number; + pciinfo.dev_fn = h->pdev->devfn; + pciinfo.board_id = h->board_id; + if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct))) + return -EFAULT; + return 0; +} - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; +static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; - if (copy_from_user - (NodeName, argp, sizeof(NodeName_type))) - return -EFAULT; + if (!argp) + return -EINVAL; + intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); + intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + if (copy_to_user + (argp, &intinfo, sizeof(cciss_coalint_struct))) + return -EFAULT; + return 0; +} - spin_lock_irqsave(&h->lock, flags); +static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; + unsigned long flags; + int i; - /* Update the field, and then ring the doorbell */ - for (i = 0; i < 16; i++) - writeb(NodeName[i], - &h->cfgtable->ServerName[i]); + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&intinfo, argp, sizeof(intinfo))) + return -EFAULT; + if ((intinfo.delay == 0) && (intinfo.count == 0)) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay)); + writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); - writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} - for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { - if (!(readl(h->vaddr + SA5_DOORBELL) - & CFGTBL_ChangeReq)) - break; - /* delay and try again */ - udelay(1000); - } - spin_unlock_irqrestore(&h->lock, flags); - if (i >= MAX_IOCTL_CONFIG_WAIT) - return -EAGAIN; - return 0; - } +static int cciss_getnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + int i; - case CCISS_GETHEARTBEAT: - { - Heartbeat_type heartbeat; - - if (!arg) - return -EINVAL; - heartbeat = readl(&h->cfgtable->HeartBeat); - if (copy_to_user - (argp, &heartbeat, sizeof(Heartbeat_type))) - return -EFAULT; - return 0; - } - case CCISS_GETBUSTYPES: - { - BusTypes_type BusTypes; - - if (!arg) - return -EINVAL; - BusTypes = readl(&h->cfgtable->BusTypes); - if (copy_to_user - (argp, &BusTypes, sizeof(BusTypes_type))) - return -EFAULT; - return 0; - } - case CCISS_GETFIRMVER: - { - FirmwareVer_type firmware; + if (!argp) + return -EINVAL; + for (i = 0; i < 16; i++) + NodeName[i] = readb(&h->cfgtable->ServerName[i]); + if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) + return -EFAULT; + return 0; +} - if (!arg) - return -EINVAL; - memcpy(firmware, h->firm_ver, 4); +static int cciss_setnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + unsigned long flags; + int i; - if (copy_to_user - (argp, firmware, sizeof(FirmwareVer_type))) - return -EFAULT; - return 0; - } - case CCISS_GETDRIVVER: - { - DriverVer_type DriverVer = DRIVER_VERSION; + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(NodeName, argp, sizeof(NodeName_type))) + return -EFAULT; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + for (i = 0; i < 16; i++) + writeb(NodeName[i], &h->cfgtable->ServerName[i]); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} - if (!arg) - return -EINVAL; +static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) +{ + Heartbeat_type heartbeat; - if (copy_to_user - (argp, &DriverVer, sizeof(DriverVer_type))) - return -EFAULT; - return 0; - } + if (!argp) + return -EINVAL; + heartbeat = readl(&h->cfgtable->HeartBeat); + if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) + return -EFAULT; + return 0; +} - case CCISS_DEREGDISK: - case CCISS_REGNEWD: - case CCISS_REVALIDVOLS: - return rebuild_lun_table(h, 0, 1); +static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) +{ + BusTypes_type BusTypes; + + if (!argp) + return -EINVAL; + BusTypes = readl(&h->cfgtable->BusTypes); + if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) + return -EFAULT; + return 0; +} - case CCISS_GETLUNINFO:{ - LogvolInfo_struct luninfo; +static int cciss_getfirmver(ctlr_info_t *h, void __user *argp) +{ + FirmwareVer_type firmware; - memcpy(&luninfo.LunID, drv->LunID, - sizeof(luninfo.LunID)); - luninfo.num_opens = drv->usage_count; - luninfo.num_parts = 0; - if (copy_to_user(argp, &luninfo, - sizeof(LogvolInfo_struct))) - return -EFAULT; - return 0; + if (!argp) + return -EINVAL; + memcpy(firmware, h->firm_ver, 4); + + if (copy_to_user + (argp, firmware, sizeof(FirmwareVer_type))) + return -EFAULT; + return 0; +} + +static int cciss_getdrivver(ctlr_info_t *h, void __user *argp) +{ + DriverVer_type DriverVer = DRIVER_VERSION; + + if (!argp) + return -EINVAL; + if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type))) + return -EFAULT; + return 0; +} + +static int cciss_getluninfo(ctlr_info_t *h, + struct gendisk *disk, void __user *argp) +{ + LogvolInfo_struct luninfo; + drive_info_struct *drv = get_drv(disk); + + if (!argp) + return -EINVAL; + memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID)); + luninfo.num_opens = drv->usage_count; + luninfo.num_parts = 0; + if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct))) + return -EFAULT; + return 0; +} + +static int cciss_passthru(ctlr_info_t *h, void __user *argp) +{ + IOCTL_Command_struct iocommand; + CommandList_struct *c; + char *buff = NULL; + u64bit temp64; + DECLARE_COMPLETION_ONSTACK(wait); + + if (!argp) + return -EINVAL; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (copy_from_user + (&iocommand, argp, sizeof(IOCTL_Command_struct))) + return -EFAULT; + if ((iocommand.buf_size < 1) && + (iocommand.Request.Type.Direction != XFER_NONE)) { + return -EINVAL; + } + if (iocommand.buf_size > 0) { + buff = kmalloc(iocommand.buf_size, GFP_KERNEL); + if (buff == NULL) + return -EFAULT; + } + if (iocommand.Request.Type.Direction == XFER_WRITE) { + /* Copy the data into the buffer we created */ + if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) { + kfree(buff); + return -EFAULT; } - case CCISS_PASSTHRU: - { - IOCTL_Command_struct iocommand; - CommandList_struct *c; - char *buff = NULL; - u64bit temp64; - DECLARE_COMPLETION_ONSTACK(wait); - - if (!arg) - return -EINVAL; - - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - - if (copy_from_user - (&iocommand, argp, sizeof(IOCTL_Command_struct))) - return -EFAULT; - if ((iocommand.buf_size < 1) && - (iocommand.Request.Type.Direction != XFER_NONE)) { - return -EINVAL; - } -#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */ - /* Check kmalloc limits */ - if (iocommand.buf_size > 128000) - return -EINVAL; -#endif - if (iocommand.buf_size > 0) { - buff = kmalloc(iocommand.buf_size, GFP_KERNEL); - if (buff == NULL) - return -EFAULT; - } - if (iocommand.Request.Type.Direction == XFER_WRITE) { - /* Copy the data into the buffer we created */ - if (copy_from_user - (buff, iocommand.buf, iocommand.buf_size)) { - kfree(buff); - return -EFAULT; - } - } else { - memset(buff, 0, iocommand.buf_size); - } - c = cmd_special_alloc(h); - if (!c) { - kfree(buff); - return -ENOMEM; - } - /* Fill in the command type */ - c->cmd_type = CMD_IOCTL_PEND; - /* Fill in Command Header */ - c->Header.ReplyQueue = 0; /* unused in simple mode */ - if (iocommand.buf_size > 0) /* buffer to fill */ - { - c->Header.SGList = 1; - c->Header.SGTotal = 1; - } else /* no buffers to fill */ - { - c->Header.SGList = 0; - c->Header.SGTotal = 0; - } - c->Header.LUN = iocommand.LUN_info; - /* use the kernel address the cmd block for tag */ - c->Header.Tag.lower = c->busaddr; - - /* Fill in Request block */ - c->Request = iocommand.Request; - - /* Fill in the scatter gather information */ - if (iocommand.buf_size > 0) { - temp64.val = pci_map_single(h->pdev, buff, - iocommand.buf_size, - PCI_DMA_BIDIRECTIONAL); - c->SG[0].Addr.lower = temp64.val32.lower; - c->SG[0].Addr.upper = temp64.val32.upper; - c->SG[0].Len = iocommand.buf_size; - c->SG[0].Ext = 0; /* we are not chaining */ - } - c->waiting = &wait; + } else { + memset(buff, 0, iocommand.buf_size); + } + c = cmd_special_alloc(h); + if (!c) { + kfree(buff); + return -ENOMEM; + } + /* Fill in the command type */ + c->cmd_type = CMD_IOCTL_PEND; + /* Fill in Command Header */ + c->Header.ReplyQueue = 0; /* unused in simple mode */ + if (iocommand.buf_size > 0) { /* buffer to fill */ + c->Header.SGList = 1; + c->Header.SGTotal = 1; + } else { /* no buffers to fill */ + c->Header.SGList = 0; + c->Header.SGTotal = 0; + } + c->Header.LUN = iocommand.LUN_info; + /* use the kernel address the cmd block for tag */ + c->Header.Tag.lower = c->busaddr; - enqueue_cmd_and_start_io(h, c); - wait_for_completion(&wait); + /* Fill in Request block */ + c->Request = iocommand.Request; - /* unlock the buffers from DMA */ - temp64.val32.lower = c->SG[0].Addr.lower; - temp64.val32.upper = c->SG[0].Addr.upper; - pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, - iocommand.buf_size, - PCI_DMA_BIDIRECTIONAL); + /* Fill in the scatter gather information */ + if (iocommand.buf_size > 0) { + temp64.val = pci_map_single(h->pdev, buff, + iocommand.buf_size, PCI_DMA_BIDIRECTIONAL); + c->SG[0].Addr.lower = temp64.val32.lower; + c->SG[0].Addr.upper = temp64.val32.upper; + c->SG[0].Len = iocommand.buf_size; + c->SG[0].Ext = 0; /* we are not chaining */ + } + c->waiting = &wait; - check_ioctl_unit_attention(h, c); + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); - /* Copy the error information out */ - iocommand.error_info = *(c->err_info); - if (copy_to_user - (argp, &iocommand, sizeof(IOCTL_Command_struct))) { - kfree(buff); - cmd_special_free(h, c); - return -EFAULT; - } + /* unlock the buffers from DMA */ + temp64.val32.lower = c->SG[0].Addr.lower; + temp64.val32.upper = c->SG[0].Addr.upper; + pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size, + PCI_DMA_BIDIRECTIONAL); + check_ioctl_unit_attention(h, c); + + /* Copy the error information out */ + iocommand.error_info = *(c->err_info); + if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) { + kfree(buff); + cmd_special_free(h, c); + return -EFAULT; + } - if (iocommand.Request.Type.Direction == XFER_READ) { - /* Copy the data out of the buffer we created */ - if (copy_to_user - (iocommand.buf, buff, iocommand.buf_size)) { - kfree(buff); - cmd_special_free(h, c); - return -EFAULT; - } - } + if (iocommand.Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) { kfree(buff); cmd_special_free(h, c); - return 0; + return -EFAULT; } - case CCISS_BIG_PASSTHRU:{ - BIG_IOCTL_Command_struct *ioc; - CommandList_struct *c; - unsigned char **buff = NULL; - int *buff_size = NULL; - u64bit temp64; - BYTE sg_used = 0; - int status = 0; - int i; - DECLARE_COMPLETION_ONSTACK(wait); - __u32 left; - __u32 sz; - BYTE __user *data_ptr; - - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - ioc = (BIG_IOCTL_Command_struct *) - kmalloc(sizeof(*ioc), GFP_KERNEL); - if (!ioc) { - status = -ENOMEM; - goto cleanup1; - } - if (copy_from_user(ioc, argp, sizeof(*ioc))) { + } + kfree(buff); + cmd_special_free(h, c); + return 0; +} + +static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp) +{ + BIG_IOCTL_Command_struct *ioc; + CommandList_struct *c; + unsigned char **buff = NULL; + int *buff_size = NULL; + u64bit temp64; + BYTE sg_used = 0; + int status = 0; + int i; + DECLARE_COMPLETION_ONSTACK(wait); + __u32 left; + __u32 sz; + BYTE __user *data_ptr; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + ioc = (BIG_IOCTL_Command_struct *) + kmalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) { + status = -ENOMEM; + goto cleanup1; + } + if (copy_from_user(ioc, argp, sizeof(*ioc))) { + status = -EFAULT; + goto cleanup1; + } + if ((ioc->buf_size < 1) && + (ioc->Request.Type.Direction != XFER_NONE)) { + status = -EINVAL; + goto cleanup1; + } + /* Check kmalloc limits using all SGs */ + if (ioc->malloc_size > MAX_KMALLOC_SIZE) { + status = -EINVAL; + goto cleanup1; + } + if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { + status = -EINVAL; + goto cleanup1; + } + buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); + if (!buff) { + status = -ENOMEM; + goto cleanup1; + } + buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL); + if (!buff_size) { + status = -ENOMEM; + goto cleanup1; + } + left = ioc->buf_size; + data_ptr = ioc->buf; + while (left) { + sz = (left > ioc->malloc_size) ? ioc->malloc_size : left; + buff_size[sg_used] = sz; + buff[sg_used] = kmalloc(sz, GFP_KERNEL); + if (buff[sg_used] == NULL) { + status = -ENOMEM; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_WRITE) { + if (copy_from_user(buff[sg_used], data_ptr, sz)) { status = -EFAULT; goto cleanup1; } - if ((ioc->buf_size < 1) && - (ioc->Request.Type.Direction != XFER_NONE)) { - status = -EINVAL; - goto cleanup1; - } - /* Check kmalloc limits using all SGs */ - if (ioc->malloc_size > MAX_KMALLOC_SIZE) { - status = -EINVAL; - goto cleanup1; - } - if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { - status = -EINVAL; - goto cleanup1; - } - buff = - kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); - if (!buff) { - status = -ENOMEM; - goto cleanup1; - } - buff_size = kmalloc(MAXSGENTRIES * sizeof(int), - GFP_KERNEL); - if (!buff_size) { - status = -ENOMEM; - goto cleanup1; - } - left = ioc->buf_size; - data_ptr = ioc->buf; - while (left) { - sz = (left > - ioc->malloc_size) ? ioc-> - malloc_size : left; - buff_size[sg_used] = sz; - buff[sg_used] = kmalloc(sz, GFP_KERNEL); - if (buff[sg_used] == NULL) { - status = -ENOMEM; - goto cleanup1; - } - if (ioc->Request.Type.Direction == XFER_WRITE) { - if (copy_from_user - (buff[sg_used], data_ptr, sz)) { - status = -EFAULT; - goto cleanup1; - } - } else { - memset(buff[sg_used], 0, sz); - } - left -= sz; - data_ptr += sz; - sg_used++; - } - c = cmd_special_alloc(h); - if (!c) { - status = -ENOMEM; - goto cleanup1; - } - c->cmd_type = CMD_IOCTL_PEND; - c->Header.ReplyQueue = 0; + } else { + memset(buff[sg_used], 0, sz); + } + left -= sz; + data_ptr += sz; + sg_used++; + } + c = cmd_special_alloc(h); + if (!c) { + status = -ENOMEM; + goto cleanup1; + } + c->cmd_type = CMD_IOCTL_PEND; + c->Header.ReplyQueue = 0; + c->Header.SGList = sg_used; + c->Header.SGTotal = sg_used; + c->Header.LUN = ioc->LUN_info; + c->Header.Tag.lower = c->busaddr; - if (ioc->buf_size > 0) { - c->Header.SGList = sg_used; - c->Header.SGTotal = sg_used; - } else { - c->Header.SGList = 0; - c->Header.SGTotal = 0; - } - c->Header.LUN = ioc->LUN_info; - c->Header.Tag.lower = c->busaddr; - - c->Request = ioc->Request; - if (ioc->buf_size > 0) { - for (i = 0; i < sg_used; i++) { - temp64.val = - pci_map_single(h->pdev, buff[i], - buff_size[i], - PCI_DMA_BIDIRECTIONAL); - c->SG[i].Addr.lower = - temp64.val32.lower; - c->SG[i].Addr.upper = - temp64.val32.upper; - c->SG[i].Len = buff_size[i]; - c->SG[i].Ext = 0; /* we are not chaining */ - } - } - c->waiting = &wait; - enqueue_cmd_and_start_io(h, c); - wait_for_completion(&wait); - /* unlock the buffers from DMA */ - for (i = 0; i < sg_used; i++) { - temp64.val32.lower = c->SG[i].Addr.lower; - temp64.val32.upper = c->SG[i].Addr.upper; - pci_unmap_single(h->pdev, - (dma_addr_t) temp64.val, buff_size[i], - PCI_DMA_BIDIRECTIONAL); - } - check_ioctl_unit_attention(h, c); - /* Copy the error information out */ - ioc->error_info = *(c->err_info); - if (copy_to_user(argp, ioc, sizeof(*ioc))) { + c->Request = ioc->Request; + for (i = 0; i < sg_used; i++) { + temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i], + PCI_DMA_BIDIRECTIONAL); + c->SG[i].Addr.lower = temp64.val32.lower; + c->SG[i].Addr.upper = temp64.val32.upper; + c->SG[i].Len = buff_size[i]; + c->SG[i].Ext = 0; /* we are not chaining */ + } + c->waiting = &wait; + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + /* unlock the buffers from DMA */ + for (i = 0; i < sg_used; i++) { + temp64.val32.lower = c->SG[i].Addr.lower; + temp64.val32.upper = c->SG[i].Addr.upper; + pci_unmap_single(h->pdev, + (dma_addr_t) temp64.val, buff_size[i], + PCI_DMA_BIDIRECTIONAL); + } + check_ioctl_unit_attention(h, c); + /* Copy the error information out */ + ioc->error_info = *(c->err_info); + if (copy_to_user(argp, ioc, sizeof(*ioc))) { + cmd_special_free(h, c); + status = -EFAULT; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + BYTE __user *ptr = ioc->buf; + for (i = 0; i < sg_used; i++) { + if (copy_to_user(ptr, buff[i], buff_size[i])) { cmd_special_free(h, c); status = -EFAULT; goto cleanup1; } - if (ioc->Request.Type.Direction == XFER_READ) { - /* Copy the data out of the buffer we created */ - BYTE __user *ptr = ioc->buf; - for (i = 0; i < sg_used; i++) { - if (copy_to_user - (ptr, buff[i], buff_size[i])) { - cmd_special_free(h, c); - status = -EFAULT; - goto cleanup1; - } - ptr += buff_size[i]; - } - } - cmd_special_free(h, c); - status = 0; - cleanup1: - if (buff) { - for (i = 0; i < sg_used; i++) - kfree(buff[i]); - kfree(buff); - } - kfree(buff_size); - kfree(ioc); - return status; + ptr += buff_size[i]; } + } + cmd_special_free(h, c); + status = 0; +cleanup1: + if (buff) { + for (i = 0; i < sg_used; i++) + kfree(buff[i]); + kfree(buff); + } + kfree(buff_size); + kfree(ioc); + return status; +} + +static int cciss_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct gendisk *disk = bdev->bd_disk; + ctlr_info_t *h = get_host(disk); + void __user *argp = (void __user *)arg; + + dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", + cmd, arg); + switch (cmd) { + case CCISS_GETPCIINFO: + return cciss_getpciinfo(h, argp); + case CCISS_GETINTINFO: + return cciss_getintinfo(h, argp); + case CCISS_SETINTINFO: + return cciss_setintinfo(h, argp); + case CCISS_GETNODENAME: + return cciss_getnodename(h, argp); + case CCISS_SETNODENAME: + return cciss_setnodename(h, argp); + case CCISS_GETHEARTBEAT: + return cciss_getheartbeat(h, argp); + case CCISS_GETBUSTYPES: + return cciss_getbustypes(h, argp); + case CCISS_GETFIRMVER: + return cciss_getfirmver(h, argp); + case CCISS_GETDRIVVER: + return cciss_getdrivver(h, argp); + case CCISS_DEREGDISK: + case CCISS_REGNEWD: + case CCISS_REVALIDVOLS: + return rebuild_lun_table(h, 0, 1); + case CCISS_GETLUNINFO: + return cciss_getluninfo(h, disk, argp); + case CCISS_PASSTHRU: + return cciss_passthru(h, argp); + case CCISS_BIG_PASSTHRU: + return cciss_bigpassthru(h, argp); /* scsi_cmd_ioctl handles these, below, though some are not */ /* very meaningful for cciss. SG_IO is the main one people want. */ diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 9400845d602e..ac04ef97eac2 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, * ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ - spin_lock_irqsave(&mdev->al_lock, flags); count = drbd_bm_clear_bits(mdev, sbnr, ebnr); - if (count) { - /* we need the lock for drbd_try_clear_on_disk_bm */ - if (jiffies - mdev->rs_mark_time > HZ*10) { - /* should be rolling marks, - * but we estimate only anyways. */ - if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && + if (count && get_ldev(mdev)) { + unsigned long now = jiffies; + unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; + int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + unsigned long tw = drbd_bm_total_weight(mdev); + if (mdev->rs_mark_left[mdev->rs_last_mark] != tw && mdev->state.conn != C_PAUSED_SYNC_T && mdev->state.conn != C_PAUSED_SYNC_S) { - mdev->rs_mark_time = jiffies; - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_mark_time[next] = now; + mdev->rs_mark_left[next] = tw; + mdev->rs_last_mark = next; } } - if (get_ldev(mdev)) { - drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); - put_ldev(mdev); - } + spin_lock_irqsave(&mdev->al_lock, flags); + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + spin_unlock_irqrestore(&mdev->al_lock, flags); + /* just wake_up unconditional now, various lc_chaged(), * lc_put() in drbd_try_clear_on_disk_bm(). */ wake_up = 1; + put_ldev(mdev); } - spin_unlock_irqrestore(&mdev->al_lock, flags); if (wake_up) wake_up(&mdev->al_wait); } @@ -1118,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) * @mdev: DRBD device. * @sector: The sector number. * - * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. */ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) { @@ -1129,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) - return 0; + return -EINTR; if (test_bit(BME_LOCKED, &bm_ext->flags)) - return 1; + return 0; for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(mdev->al_wait, @@ -1145,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) wake_up(&mdev->al_wait); } spin_unlock_irq(&mdev->al_lock); - return 0; + return -EINTR; } } - set_bit(BME_LOCKED, &bm_ext->flags); - - return 1; + return 0; } /** diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e3f88d6e1412..fd42832f785b 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) * * maybe bm_set should be atomic_t ? */ -static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; unsigned long s; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 352441b0f92f..9bdcf4393c0a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) * NOTE that the payload starts at a long aligned offset, * regardless of 32 or 64 bit arch! */ -struct p_header { +struct p_header80 { u32 magic; u16 command; u16 length; /* bytes of data after this header */ u8 payload[0]; } __packed; -/* 8 bytes. packet FIXED for the next century! */ + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ + u8 payload[0]; +} __packed; + +union p_header { + struct p_header80 h80; + struct p_header95 h95; +}; /* * short commands, packets without payload, plain p_header: @@ -362,12 +374,16 @@ struct p_header { */ /* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 -#define DP_RW_SYNC 2 +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* equals REQ_UNPLUG */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ struct p_data { - struct p_header head; + union p_header head; u64 sector; /* 64 bits sector number */ u64 block_id; /* to identify the request in protocol B&C */ u32 seq_num; @@ -383,7 +399,7 @@ struct p_data { * P_DATA_REQUEST, P_RS_DATA_REQUEST */ struct p_block_ack { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -392,7 +408,7 @@ struct p_block_ack { struct p_block_req { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -409,7 +425,7 @@ struct p_block_req { */ struct p_handshake { - struct p_header head; /* 8 bytes */ + struct p_header80 head; /* 8 bytes */ u32 protocol_min; u32 feature_flags; u32 protocol_max; @@ -424,19 +440,19 @@ struct p_handshake { /* 80 bytes, FIXED for the next century */ struct p_barrier { - struct p_header head; + struct p_header80 head; u32 barrier; /* barrier number _handle_ only */ u32 pad; /* to multiple of 8 Byte */ } __packed; struct p_barrier_ack { - struct p_header head; + struct p_header80 head; u32 barrier; u32 set_size; } __packed; struct p_rs_param { - struct p_header head; + struct p_header80 head; u32 rate; /* Since protocol version 88 and higher. */ @@ -444,20 +460,31 @@ struct p_rs_param { } __packed; struct p_rs_param_89 { - struct p_header head; + struct p_header80 head; u32 rate; /* protocol version 89: */ char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; } __packed; +struct p_rs_param_95 { + struct p_header80 head; + u32 rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + enum drbd_conn_flags { CF_WANT_LOSE = 1, CF_DRY_RUN = 2, }; struct p_protocol { - struct p_header head; + struct p_header80 head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; @@ -471,17 +498,17 @@ struct p_protocol { } __packed; struct p_uuids { - struct p_header head; + struct p_header80 head; u64 uuid[UI_EXTENDED_SIZE]; } __packed; struct p_rs_uuid { - struct p_header head; + struct p_header80 head; u64 uuid; } __packed; struct p_sizes { - struct p_header head; + struct p_header80 head; u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ @@ -491,18 +518,18 @@ struct p_sizes { } __packed; struct p_state { - struct p_header head; + struct p_header80 head; u32 state; } __packed; struct p_req_state { - struct p_header head; + struct p_header80 head; u32 mask; u32 val; } __packed; struct p_req_state_reply { - struct p_header head; + struct p_header80 head; u32 retcode; } __packed; @@ -517,7 +544,7 @@ struct p_drbd06_param { } __packed; struct p_discard { - struct p_header head; + struct p_header80 head; u64 block_id; u32 seq_num; u32 pad; @@ -533,7 +560,7 @@ enum drbd_bitmap_code { }; struct p_compressed_bm { - struct p_header head; + struct p_header80 head; /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code * (encoding & 0x80): polarity (set/unset) of first runlength * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits @@ -544,10 +571,10 @@ struct p_compressed_bm { u8 code[0]; } __packed; -struct p_delay_probe { - struct p_header head; - u32 seq_num; /* sequence number to match the two probe packets */ - u32 offset; /* usecs the probe got sent after the reference time point */ +struct p_delay_probe93 { + struct p_header80 head; + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ } __packed; /* DCBP: Drbd Compressed Bitmap Packet ... */ @@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) * so we need to use the fixed size 4KiB page size * most architechtures have used for a long time. */ -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) #if (PAGE_SIZE < 4096) @@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) #endif union p_polymorph { - struct p_header header; + union p_header header; struct p_handshake handshake; struct p_data data; struct p_block_ack block_ack; struct p_barrier barrier; struct p_barrier_ack barrier_ack; struct p_rs_param_89 rs_param_89; + struct p_rs_param_95 rs_param_95; struct p_protocol protocol; struct p_sizes sizes; struct p_uuids uuids; @@ -617,6 +645,8 @@ union p_polymorph { struct p_req_state req_state; struct p_req_state_reply req_state_reply; struct p_block_req block_req; + struct p_delay_probe93 delay_probe93; + struct p_rs_uuid rs_uuid; } __packed; /**********************************************************************/ @@ -697,7 +727,7 @@ struct drbd_tl_epoch { struct list_head requests; /* requests before */ struct drbd_tl_epoch *next; /* pointer to the next barrier */ unsigned int br_number; /* the barriers identifier. */ - int n_req; /* number of requests attached before this barrier */ + int n_writes; /* number of requests attached before this barrier */ }; struct drbd_request; @@ -747,7 +777,7 @@ struct digest_info { struct drbd_epoch_entry { struct drbd_work w; struct hlist_node colision; - struct drbd_epoch *epoch; + struct drbd_epoch *epoch; /* for writes */ struct drbd_conf *mdev; struct page *pages; atomic_t pending_bios; @@ -755,7 +785,10 @@ struct drbd_epoch_entry { /* see comments on ee flag bits below */ unsigned long flags; sector_t sector; - u64 block_id; + union { + u64 block_id; + struct digest_info *digest; + }; }; /* ee flag bits. @@ -781,12 +814,16 @@ enum { * if any of those fail, we set this flag atomically * from the endio callback */ __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) /* global flag bits */ enum { @@ -794,7 +831,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - STOP_SYNC_TIMER, /* tell timer to cancel itself */ UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ @@ -816,6 +852,7 @@ enum { BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ + GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -829,6 +866,8 @@ enum { * the peer, if it changed there as well. */ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -838,10 +877,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */ /* THINK maybe we actually want to use the default "event/%s" worker threads * or similar in linux 2.6, which uses per cpu data and threads. - * - * To be general, this might need a spin_lock member. - * For now, please use the mdev->req_lock to protect list_head, - * see drbd_queue_work below. */ struct drbd_work_queue { struct list_head q; @@ -915,6 +950,12 @@ enum write_ordering_e { WO_bio_barrier }; +struct fifo_buffer { + int *values; + unsigned int head_index; + unsigned int size; +}; + struct drbd_conf { /* things that are stored as / read from meta data on disk */ unsigned long flags; @@ -936,9 +977,16 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, + go_diskless, md_sync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; +#ifdef DRBD_DEBUG_MD_SYNC + struct { + unsigned int line; + const char* func; + } last_md_mark_dirty; +#endif /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; @@ -946,6 +994,7 @@ struct drbd_conf { union drbd_state state; wait_queue_head_t misc_wait; wait_queue_head_t state_wait; /* upon each state change. */ + wait_queue_head_t net_cnt_wait; unsigned int send_cnt; unsigned int recv_cnt; unsigned int read_cnt; @@ -974,12 +1023,16 @@ struct drbd_conf { unsigned long rs_start; /* cumulated time in PausedSyncX state [unit jiffies] */ unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ - unsigned long rs_mark_left; + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; /* marks's time [unit jiffies] */ - unsigned long rs_mark_time; - /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ - unsigned long rs_same_csum; + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; /* where does the admin want us to start? (sector) */ sector_t ov_start_sector; @@ -1012,10 +1065,10 @@ struct drbd_conf { spinlock_t epoch_lock; unsigned int epochs; enum write_ordering_e write_ordering; - struct list_head active_ee; /* IO in progress */ - struct list_head sync_ee; /* IO in progress */ + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ struct list_head done_ee; /* send ack */ - struct list_head read_ee; /* IO in progress */ + struct list_head read_ee; /* IO in progress (any read) */ struct list_head net_ee; /* zero-copy network send in progress */ struct hlist_head *ee_hash; /* is proteced by req_lock! */ unsigned int ee_hash_s; @@ -1026,7 +1079,8 @@ struct drbd_conf { int next_barrier_nr; struct hlist_head *app_reads_hash; /* is proteced by req_lock */ struct list_head resync_reads; - atomic_t pp_in_use; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ @@ -1054,6 +1108,15 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ + atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ + atomic_t rs_sect_ev; /* for submitted resync data rate, both */ + int rs_last_sect_ev; /* counter to compare with */ + int rs_last_events; /* counter of read or write "events" (unit sectors) + * on the lower level device when we last looked. */ + int c_sync_rate; /* current resync rate after syncer throttle magic */ + struct fifo_buffer rs_plan_s; /* correction values of resync planer */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + int rs_planed; /* resync sectors already planed */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1138,6 +1201,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev); extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, unsigned int set_size); extern void tl_clear(struct drbd_conf *mdev); +enum drbd_req_event; +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void drbd_free_sock(struct drbd_conf *mdev); extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, @@ -1150,12 +1215,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f extern int _drbd_send_state(struct drbd_conf *mdev); extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); #define USE_DATA_SOCKET 1 #define USE_META_SOCKET 0 extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size); extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, size_t size); @@ -1167,7 +1232,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, struct p_block_req *rp); extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp); + struct p_data *dp, int data_size); extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id); extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, @@ -1201,7 +1266,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +#ifndef DRBD_DEBUG_MD_SYNC extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +#else +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev, + unsigned int line, const char *func); +#endif extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), void (*done)(struct drbd_conf *, int), @@ -1209,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); +extern void drbd_go_diskless(struct drbd_conf *mdev); /* Meta data layout @@ -1264,6 +1336,8 @@ struct bm_extent { * Bit 1 ==> local node thinks this block needs to be synced. */ +#define SLEEP_TIME (HZ/10) + #define BM_BLOCK_SHIFT 12 /* 4k per bit */ #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) /* (9+3) : 512 bytes @ 8 bits; representing 16M storage @@ -1335,11 +1409,13 @@ struct bm_extent { #endif /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. - * With a value of 6 all IO in one 32K block make it to the same slot of the + * With a value of 8 all IO in one 128K block make it to the same slot of the * hash table. */ -#define HT_SHIFT 6 +#define HT_SHIFT 8 #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ + /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 @@ -1369,6 +1445,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_ /* bm_find_next variants for use while you hold drbd_bm_lock() */ extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev); extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); extern int drbd_bm_rs_done(struct drbd_conf *mdev); /* for receive_bitmap */ @@ -1421,7 +1498,8 @@ extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); /* drbd_worker.c */ @@ -1467,10 +1545,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev); extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); @@ -1479,7 +1559,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, sector_t sector, unsigned int data_size, gfp_t gfp_mask) __must_hold(local); -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + int is_net); +#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head); extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, @@ -1487,6 +1570,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); extern void drbd_flush_workqueue(struct drbd_conf *mdev); +extern void drbd_free_tl_hash(struct drbd_conf *mdev); /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ @@ -1600,6 +1684,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev, #define susp_MASK 1 #define user_isp_MASK 1 #define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 #define NS(T, S) \ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ @@ -1856,13 +1942,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, } static inline void -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) -{ - list_add_tail(&w->list, &q->q); - up(&q->s); -} - -static inline void drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; @@ -1899,19 +1978,19 @@ static inline void request_ping(struct drbd_conf *mdev) static inline int drbd_send_short_cmd(struct drbd_conf *mdev, enum drbd_packets cmd) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); } static inline int drbd_send_ping(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); } static inline int drbd_send_ping_ack(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); } @@ -2013,7 +2092,7 @@ static inline void inc_unacked(struct drbd_conf *mdev) static inline void put_net_conf(struct drbd_conf *mdev) { if (atomic_dec_and_test(&mdev->net_cnt)) - wake_up(&mdev->misc_wait); + wake_up(&mdev->net_cnt_wait); } /** @@ -2044,10 +2123,14 @@ static inline int get_net_conf(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev) { + int i = atomic_dec_return(&mdev->local_cnt); __release(local); - if (atomic_dec_and_test(&mdev->local_cnt)) + D_ASSERT(i >= 0); + if (i == 0) { + if (mdev->state.disk == D_FAILED) + drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); - D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); + } } #ifndef __CHECKER__ @@ -2179,11 +2262,16 @@ static inline int drbd_state_is_stable(union drbd_state s) return 1; } +static inline int is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) { int mxb = drbd_get_max_buffers(mdev); - if (mdev->state.susp) + if (is_susp(mdev->state)) return 0; if (test_bit(SUSPEND_IO, &mdev->flags)) return 0; @@ -2321,8 +2409,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) if (test_bit(MD_NO_BARRIER, &mdev->flags)) return; - r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); if (r) { set_bit(MD_NO_BARRIER, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa650dd85b90..8bfedc7164fa 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -77,6 +77,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -199,7 +200,7 @@ static int tl_init(struct drbd_conf *mdev) INIT_LIST_HEAD(&b->w.list); b->next = NULL; b->br_number = 4711; - b->n_req = 0; + b->n_writes = 0; b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ mdev->oldest_tle = b; @@ -240,7 +241,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) INIT_LIST_HEAD(&new->w.list); new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ new->next = NULL; - new->n_req = 0; + new->n_writes = 0; newest_before = mdev->newest_tle; /* never send a barrier number == 0, because that is special-cased @@ -284,9 +285,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, barrier_nr, b->br_number); goto bail; } - if (b->n_req != set_size) { - dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", - barrier_nr, set_size, b->n_req); + if (b->n_writes != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, b->n_writes); goto bail; } @@ -333,6 +334,82 @@ bail: drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); } +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @mdev: DRBD device. + * @what: The action/event to perform with all request objects + * + * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, + * restart_frozen_disk_io. + */ +static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + struct drbd_tl_epoch *b, *tmp, **pn; + struct list_head *le, *tle, carry_reads; + struct drbd_request *req; + int rv, n_writes, n_reads; + + b = mdev->oldest_tle; + pn = &mdev->oldest_tle; + while (b) { + n_writes = 0; + n_reads = 0; + INIT_LIST_HEAD(&carry_reads); + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + rv = _req_mod(req, what); + + n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; + n_reads += (rv & MR_READ) >> MR_READ_SHIFT; + } + tmp = b->next; + + if (n_writes) { + if (what == resend) { + b->n_writes = n_writes; + if (b->w.cb == NULL) { + b->w.cb = w_send_barrier; + inc_ap_pending(mdev); + set_bit(CREATE_BARRIER, &mdev->flags); + } + + drbd_queue_work(&mdev->data.work, &b->w); + } + pn = &b->next; + } else { + if (n_reads) + list_add(&carry_reads, &b->requests); + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. + * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + list_splice(&carry_reads, &b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = net_random(); + b->n_writes = 0; + + *pn = b; + break; + } + *pn = tmp; + kfree(b); + } + b = tmp; + list_splice(&carry_reads, &b->requests); + } +} + /** * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL @@ -344,48 +421,12 @@ bail: */ void tl_clear(struct drbd_conf *mdev) { - struct drbd_tl_epoch *b, *tmp; struct list_head *le, *tle; struct drbd_request *r; - int new_initial_bnr = net_random(); spin_lock_irq(&mdev->req_lock); - b = mdev->oldest_tle; - while (b) { - list_for_each_safe(le, tle, &b->requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(r, connection_lost_while_pending); - } - tmp = b->next; - - /* there could still be requests on that ring list, - * in case local io is still pending */ - list_del(&b->requests); - - /* dec_ap_pending corresponding to queue_barrier. - * the newest barrier may not have been queued yet, - * in which case w.cb is still NULL. */ - if (b->w.cb != NULL) - dec_ap_pending(mdev); - - if (b == mdev->newest_tle) { - /* recycle, but reinit! */ - D_ASSERT(tmp == NULL); - INIT_LIST_HEAD(&b->requests); - INIT_LIST_HEAD(&b->w.list); - b->w.cb = NULL; - b->br_number = new_initial_bnr; - b->n_req = 0; - - mdev->oldest_tle = b; - break; - } - kfree(b); - b = tmp; - } + _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); @@ -401,6 +442,15 @@ void tl_clear(struct drbd_conf *mdev) /* ensure bit indicating barrier is required is clear */ clear_bit(CREATE_BARRIER, &mdev->flags); + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + + spin_unlock_irq(&mdev->req_lock); +} + +void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); spin_unlock_irq(&mdev->req_lock); } @@ -455,7 +505,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); static int is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort); + union drbd_state ns, const char **warn_sync_abort); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -605,7 +655,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) drbd_role_str(ns.peer), drbd_disk_str(ns.disk), drbd_disk_str(ns.pdsk), - ns.susp ? 's' : 'r', + is_susp(ns) ? 's' : 'r', ns.aftr_isp ? 'a' : '-', ns.peer_isp ? 'p' : '-', ns.user_isp ? 'u' : '-' @@ -763,7 +813,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort) + union drbd_state ns, const char **warn_sync_abort) { enum drbd_fencing_p fp; @@ -778,9 +828,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os.conn <= C_DISCONNECTING) ns.conn = os.conn; - /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. + * If you try to go into some Sync* state, that shall fail (elsewhere). */ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) ns.conn = os.conn; /* After C_DISCONNECTING only C_STANDALONE may follow */ @@ -798,14 +849,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) ns.aftr_isp = 0; - if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) - ns.pdsk = D_UNKNOWN; - /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { if (warn_sync_abort) - *warn_sync_abort = 1; + *warn_sync_abort = + os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + "Online-verify" : "Resync"; ns.conn = C_CONNECTED; } @@ -876,7 +926,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (fp == FP_STONITH && (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp = 1; + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { if (ns.conn == C_SYNC_SOURCE) @@ -912,6 +967,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) } } +static void drbd_resume_al(struct drbd_conf *mdev) +{ + if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) + dev_info(DEV, "Resumed AL updates\n"); +} + /** * __drbd_set_state() - Set a new DRBD state * @mdev: DRBD device. @@ -927,7 +988,7 @@ int __drbd_set_state(struct drbd_conf *mdev, { union drbd_state os; int rv = SS_SUCCESS; - int warn_sync_abort = 0; + const char *warn_sync_abort = NULL; struct after_state_chg_work *ascw; os = mdev->state; @@ -946,14 +1007,8 @@ int __drbd_set_state(struct drbd_conf *mdev, /* If the old state was illegal as well, then let this happen...*/ - if (is_valid_state(mdev, os) == rv) { - dev_err(DEV, "Considering state change from bad state. " - "Error would be: '%s'\n", - drbd_set_st_err_str(rv)); - print_st(mdev, "old", os); - print_st(mdev, "new", ns); + if (is_valid_state(mdev, os) == rv) rv = is_valid_state_transition(mdev, ns, os); - } } else rv = is_valid_state_transition(mdev, ns, os); } @@ -965,7 +1020,7 @@ int __drbd_set_state(struct drbd_conf *mdev, } if (warn_sync_abort) - dev_warn(DEV, "Resync aborted.\n"); + dev_warn(DEV, "%s aborted.\n", warn_sync_abort); { char *pbp, pb[300]; @@ -976,7 +1031,10 @@ int __drbd_set_state(struct drbd_conf *mdev, PSC(conn); PSC(disk); PSC(pdsk); - PSC(susp); + if (is_susp(ns) != is_susp(os)) + pbp += sprintf(pbp, "susp( %s -> %s ) ", + drbd_susp_str(is_susp(os)), + drbd_susp_str(is_susp(ns))); PSC(aftr_isp); PSC(peer_isp); PSC(user_isp); @@ -1001,12 +1059,6 @@ int __drbd_set_state(struct drbd_conf *mdev, wake_up(&mdev->misc_wait); wake_up(&mdev->state_wait); - /* post-state-change actions */ - if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { - set_bit(STOP_SYNC_TIMER, &mdev->flags); - mod_timer(&mdev->resync_timer, jiffies); - } - /* aborted verify run. log the last position */ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && ns.conn < C_CONNECTED) { @@ -1019,41 +1071,42 @@ int __drbd_set_state(struct drbd_conf *mdev, if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { dev_info(DEV, "Syncer continues.\n"); - mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; - if (ns.conn == C_SYNC_TARGET) { - if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) - mod_timer(&mdev->resync_timer, jiffies); - /* This if (!test_bit) is only needed for the case - that a device that has ceased to used its timer, - i.e. it is already in drbd_resync_finished() gets - paused and resumed. */ - } + mdev->rs_paused += (long)jiffies + -(long)mdev->rs_mark_time[mdev->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); } if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { dev_info(DEV, "Resync suspended\n"); - mdev->rs_mark_time = jiffies; - if (ns.conn == C_PAUSED_SYNC_T) - set_bit(STOP_SYNC_TIMER, &mdev->flags); + mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; } if (os.conn == C_CONNECTED && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + mdev->ov_position = 0; - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_bits(mdev); + mdev->rs_total = drbd_bm_bits(mdev); if (mdev->agreed_pro_version >= 90) set_ov_position(mdev, ns.conn); else mdev->ov_start_sector = 0; mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(mdev->ov_position); - mdev->rs_start = - mdev->rs_mark_time = jiffies; + mdev->rs_start = now; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; mdev->ov_last_oos_size = 0; mdev->ov_last_oos_start = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = mdev->rs_total; + mdev->rs_mark_time[i] = now; + } + if (ns.conn == C_VERIFY_S) { dev_info(DEV, "Starting Online Verify from sector %llu\n", (unsigned long long)mdev->ov_position); @@ -1106,6 +1159,10 @@ int __drbd_set_state(struct drbd_conf *mdev, ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + drbd_resume_al(mdev); + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1164,6 +1221,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, enum chg_state_flags flags) { enum drbd_fencing_p fp; + enum drbd_req_event what = nothing; + union drbd_state nsm = (union drbd_state){ .i = -1 }; if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { clear_bit(CRASHED_PRIMARY, &mdev->flags); @@ -1187,17 +1246,49 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ - if (fp == FP_STONITH && ns.susp) { - /* case1: The outdate peer handler is successful: - * case2: The connection was established again: */ - if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || - (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { + nsm.i = -1; + if (ns.susp_nod) { + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + if (ns.conn == C_CONNECTED) + what = resend, nsm.susp_nod = 0; + else /* ns.conn > C_CONNECTED */ + dev_err(DEV, "Unexpected Resynd going on!\n"); + } + + if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + what = restart_frozen_disk_io, nsm.susp_nod = 0; + + } + + if (ns.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { tl_clear(mdev); + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_md_sync(mdev); + } spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } + /* case2: The connection was established again: */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + clear_bit(NEW_CUR_UUID, &mdev->flags); + what = resend; + nsm.susp_fen = 0; + } + } + + if (what != nothing) { + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); + nsm.i &= mdev->state.i; + _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); } + /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); @@ -1216,16 +1307,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); + if (is_susp(mdev->state)) { + set_bit(NEW_CUR_UUID, &mdev->flags); + } else { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } } put_ldev(mdev); } } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) @@ -1267,42 +1364,51 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); + /* first half of local IO error */ if (os.disk > D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; + enum drbd_io_error_p eh = EP_PASS_ON; + + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that my disk is broken.\n"); + else + dev_err(DEV, "Sending state for drbd_io_error() failed\n"); + + drbd_rs_cancel_all(mdev); - eh = EP_PASS_ON; if (get_ldev_if_state(mdev, D_FAILED)) { eh = mdev->ldev->dc.on_io_error; put_ldev(mdev); } + if (eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + } - drbd_rs_cancel_all(mdev); - /* since get_ldev() only works as long as disk>=D_INCONSISTENT, - and it is D_DISKLESS here, local_cnt can only go down, it can - not increase... It will reach zero */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + /* second half of local IO error handling, + * after local_cnt references have reached zero: */ + if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { mdev->rs_total = 0; mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - - spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); - spin_unlock_irq(&mdev->req_lock); - - if (eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); } if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (mdev->state.disk != D_DISKLESS) + dev_err(DEV, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(mdev->state.disk)); + + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state + * will inc/dec it frequently. Since we became D_DISKLESS, no + * one has touched the protected members anymore, though, so we + * are safe to free them here. */ + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that I detached my disk.\n"); + else + dev_err(DEV, "Sending state for detach failed\n"); - if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that my disk is broken.\n"); - else - dev_err(DEV, "Sending state in drbd_io_error() failed\n"); - } - - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); lc_destroy(mdev->resync); mdev->resync = NULL; lc_destroy(mdev->act_log); @@ -1311,8 +1417,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_free_bc(mdev->ldev); mdev->ldev = NULL;); - if (mdev->md_io_tmpp) + if (mdev->md_io_tmpp) { __free_page(mdev->md_io_tmpp); + mdev->md_io_tmpp = NULL; + } } /* Disks got bigger while they were detached */ @@ -1328,6 +1436,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, (os.user_isp && !ns.user_isp)) resume_next_sg(mdev); + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(mdev); + + /* free tl_hash if we Got thawed and are C_STANDALONE */ + if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) + drbd_free_tl_hash(mdev); + /* Upon network connection, we need to start the receiver */ if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) drbd_thread_start(&mdev->receiver); @@ -1554,7 +1671,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev) /* the appropriate socket mutex must be held already */ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags) { int sent, ok; @@ -1564,7 +1681,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, h->magic = BE_DRBD_MAGIC; h->command = cpu_to_be16(cmd); - h->length = cpu_to_be16(size-sizeof(struct p_header)); + h->length = cpu_to_be16(size-sizeof(struct p_header80)); sent = drbd_send(mdev, sock, h, size, msg_flags); @@ -1579,7 +1696,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, * when we hold the appropriate socket mutex. */ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, size_t size) + enum drbd_packets cmd, struct p_header80 *h, size_t size) { int ok = 0; struct socket *sock; @@ -1607,7 +1724,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, size_t size) { - struct p_header h; + struct p_header80 h; int ok; h.magic = BE_DRBD_MAGIC; @@ -1629,7 +1746,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) { - struct p_rs_param_89 *p; + struct p_rs_param_95 *p; struct socket *sock; int size, rv; const int apv = mdev->agreed_pro_version; @@ -1637,7 +1754,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) size = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) + strlen(mdev->sync_conf.verify_alg) + 1 - : /* 89 */ sizeof(struct p_rs_param_89); + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); /* used from admin command context and receiver/worker context. * to avoid kmalloc, grab the socket right here, @@ -1648,12 +1766,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) if (likely(sock != NULL)) { enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; - p = &mdev->data.sbuf.rs_param_89; + p = &mdev->data.sbuf.rs_param_95; /* initialize verify_alg and csums_alg */ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); p->rate = cpu_to_be32(sc->rate); + p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(sc->c_delay_target); + p->c_fill_target = cpu_to_be32(sc->c_fill_target); + p->c_max_rate = cpu_to_be32(sc->c_max_rate); if (apv >= 88) strcpy(p->verify_alg, mdev->sync_conf.verify_alg); @@ -1709,7 +1831,7 @@ int drbd_send_protocol(struct drbd_conf *mdev) strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, - (struct p_header *)p, size); + (struct p_header80 *)p, size); kfree(p); return rv; } @@ -1735,7 +1857,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) put_ldev(mdev); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_uuids(struct drbd_conf *mdev) @@ -1756,7 +1878,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) p.uuid = cpu_to_be64(val); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) @@ -1786,7 +1908,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl p.dds_flags = cpu_to_be16(flags); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -1811,7 +1933,7 @@ int drbd_send_state(struct drbd_conf *mdev) if (likely(sock != NULL)) { ok = _drbd_send_cmd(mdev, sock, P_STATE, - (struct p_header *)&p, sizeof(p), 0); + (struct p_header80 *)&p, sizeof(p), 0); } mutex_unlock(&mdev->data.mutex); @@ -1829,7 +1951,7 @@ int drbd_send_state_req(struct drbd_conf *mdev, p.val = cpu_to_be32(val.i); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) @@ -1839,7 +1961,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) p.retcode = cpu_to_be32(retcode); return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int fill_bitmap_rle_bits(struct drbd_conf *mdev, @@ -1938,7 +2060,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, enum { OK, FAILED, DONE } send_bitmap_rle_or_plain(struct drbd_conf *mdev, - struct p_header *h, struct bm_xfer_ctx *c) + struct p_header80 *h, struct bm_xfer_ctx *c) { struct p_compressed_bm *p = (void*)h; unsigned long num_words; @@ -1968,12 +2090,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, if (len) drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, - h, sizeof(struct p_header) + len, 0); + h, sizeof(struct p_header80) + len, 0); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; c->packets[1]++; - c->bytes[1] += sizeof(struct p_header) + len; + c->bytes[1] += sizeof(struct p_header80) + len; if (c->bit_offset > c->bm_bits) c->bit_offset = c->bm_bits; @@ -1989,14 +2111,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, int _drbd_send_bitmap(struct drbd_conf *mdev) { struct bm_xfer_ctx c; - struct p_header *p; + struct p_header80 *p; int ret; ERR_IF(!mdev->bitmap) return FALSE; /* maybe we should use some per thread scratch page, * and allocate that during initial device creation? */ - p = (struct p_header *) __get_free_page(GFP_NOIO); + p = (struct p_header80 *) __get_free_page(GFP_NOIO); if (!p) { dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); return FALSE; @@ -2054,7 +2176,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) if (mdev->state.conn < C_CONNECTED) return FALSE; ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2082,17 +2204,18 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) return FALSE; ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. */ int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp) + struct p_data *dp, int data_size) { - const int header_size = sizeof(struct p_data) - - sizeof(struct p_header); - int data_size = ((struct p_header *)dp)->length - header_size; - + data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), dp->block_id); } @@ -2140,7 +2263,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd, p.blksize = cpu_to_be32(size); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2158,7 +2281,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev, p.head.magic = BE_DRBD_MAGIC; p.head.command = cpu_to_be16(cmd); - p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); mutex_lock(&mdev->data.mutex); @@ -2180,7 +2303,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) p.blksize = cpu_to_be32(size); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2332,6 +2455,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) return 1; } +static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) +{ + if (mdev->agreed_pro_version >= 95) + return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) | + (bi_rw & REQ_FUA ? DP_FUA : 0) | + (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + else + return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0; +} + /* Used to send write requests * R_PRIMARY -> Peer (P_DATA) */ @@ -2349,30 +2484,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(P_DATA); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); + if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(P_DATA); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(P_DATA); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; p.seq_num = cpu_to_be32(req->seq_num = atomic_add_return(1, &mdev->packet_seq)); - dp_flags = 0; - /* NOTE: no need to check if barriers supported here as we would - * not pass the test in make_request_common in that case - */ - if (req->master_bio->bi_rw & REQ_HARDBARRIER) { - dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); - /* dp_flags |= DP_HARDBARRIER; */ - } - if (req->master_bio->bi_rw & REQ_SYNC) - dp_flags |= DP_RW_SYNC; - /* for now handle SYNCIO and UNPLUG - * as if they still were one and the same flag */ - if (req->master_bio->bi_rw & REQ_UNPLUG) - dp_flags |= DP_RW_SYNC; + dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); + if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T) dp_flags |= DP_MAY_SET_IN_SYNC; @@ -2413,10 +2543,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(cmd); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); + if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(cmd); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(cmd); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } p.sector = cpu_to_be64(e->sector); p.block_id = e->block_id; @@ -2429,8 +2566,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, if (!drbd_get_data_sock(mdev)) return 0; - ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, - sizeof(p), dgs ? MSG_MORE : 0); + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); if (ok && dgs) { dgb = mdev->int_dig_out; drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); @@ -2605,7 +2741,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev) /* .verify_alg = */ {}, 0, /* .cpu_mask = */ {}, 0, /* .csums_alg = */ {}, 0, - /* .use_rle = */ 0 + /* .use_rle = */ 0, + /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, + /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, + /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, + /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, + /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, + /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF }; /* Have to use that way, because the layout differs between @@ -2616,7 +2758,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev) .conn = C_STANDALONE, .disk = D_DISKLESS, .pdsk = D_UNKNOWN, - .susp = 0 + .susp = 0, + .susp_nod = 0, + .susp_fen = 0 } }; } @@ -2640,6 +2784,9 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->pp_in_use_by_net, 0); + atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2666,11 +2813,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->meta.work.q); INIT_LIST_HEAD(&mdev->resync_work.list); INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->go_diskless.list); INIT_LIST_HEAD(&mdev->md_sync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; + mdev->go_diskless.cb = w_go_diskless; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; init_timer(&mdev->resync_timer); @@ -2682,6 +2831,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->net_cnt_wait); init_waitqueue_head(&mdev->ee_wait); init_waitqueue_head(&mdev->al_wait); init_waitqueue_head(&mdev->seq_wait); @@ -2697,6 +2847,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) void drbd_mdev_cleanup(struct drbd_conf *mdev) { + int i; if (mdev->receiver.t_state != None) dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", mdev->receiver.t_state); @@ -2713,9 +2864,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) mdev->p_size = mdev->rs_start = mdev->rs_total = - mdev->rs_failed = - mdev->rs_mark_left = - mdev->rs_mark_time = 0; + mdev->rs_failed = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = 0; + mdev->rs_mark_time[i] = 0; + } D_ASSERT(mdev->net_conf == NULL); drbd_set_my_capacity(mdev, 0); @@ -2726,6 +2881,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) } drbd_free_resources(mdev); + clear_bit(AL_SUSPENDED, &mdev->flags); /* * currently we drbd_init_ee only on module load, so @@ -2741,6 +2897,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) D_ASSERT(list_empty(&mdev->meta.work.q)); D_ASSERT(list_empty(&mdev->resync_work.list)); D_ASSERT(list_empty(&mdev->unplug_work.list)); + D_ASSERT(list_empty(&mdev->go_diskless.list)); } @@ -3280,9 +3437,10 @@ void drbd_md_sync(struct drbd_conf *mdev) sector_t sector; int i; + del_timer(&mdev->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return; - del_timer(&mdev->md_sync_timer); /* We use here D_FAILED and not D_ATTACHING because we try to write * metadata even if we detach due to a disk failure! */ @@ -3310,12 +3468,9 @@ void drbd_md_sync(struct drbd_conf *mdev) D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; - if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - clear_bit(MD_DIRTY, &mdev->flags); - } else { + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - drbd_chk_io_error(mdev, 1, TRUE); } @@ -3402,6 +3557,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) return rv; } +static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) +{ + static char *uuid_str[UI_EXTENDED_SIZE] = { + [UI_CURRENT] = "CURRENT", + [UI_BITMAP] = "BITMAP", + [UI_HISTORY_START] = "HISTORY_START", + [UI_HISTORY_END] = "HISTORY_END", + [UI_SIZE] = "SIZE", + [UI_FLAGS] = "FLAGS", + }; + + if (index >= UI_EXTENDED_SIZE) { + dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); + return; + } + + dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n", + uuid_str[index], + (unsigned long long)mdev->ldev->md.uuid[index]); +} + + /** * drbd_md_mark_dirty() - Mark meta data super block as dirty * @mdev: DRBD device. @@ -3410,19 +3587,31 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) * the meta-data super block. This function sets MD_DIRTY, and starts a * timer that ensures that within five seconds you have to call drbd_md_sync(). */ +#ifdef DEBUG +void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) { + mod_timer(&mdev->md_sync_timer, jiffies + HZ); + mdev->last_md_mark_dirty.line = line; + mdev->last_md_mark_dirty.func = func; + } +} +#else void drbd_md_mark_dirty(struct drbd_conf *mdev) { - set_bit(MD_DIRTY, &mdev->flags); - mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); } - +#endif static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { int i; - for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; + debug_drbd_uuid(mdev, i+1); + } } void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) @@ -3437,6 +3626,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) } mdev->ldev->md.uuid[idx] = val; + debug_drbd_uuid(mdev, idx); drbd_md_mark_dirty(mdev); } @@ -3446,6 +3636,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) if (mdev->ldev->md.uuid[idx]) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; + debug_drbd_uuid(mdev, UI_HISTORY_START); } _drbd_uuid_set(mdev, idx, val); } @@ -3464,6 +3655,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) dev_info(DEV, "Creating new current UUID\n"); D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; + debug_drbd_uuid(mdev, UI_BITMAP); get_random_bytes(&val, sizeof(u64)); _drbd_uuid_set(mdev, UI_CURRENT, val); @@ -3478,6 +3670,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; mdev->ldev->md.uuid[UI_BITMAP] = 0; + debug_drbd_uuid(mdev, UI_HISTORY_START); + debug_drbd_uuid(mdev, UI_BITMAP); } else { if (mdev->ldev->md.uuid[UI_BITMAP]) dev_warn(DEV, "bm UUID already set"); @@ -3485,6 +3679,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) mdev->ldev->md.uuid[UI_BITMAP] = val; mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); + debug_drbd_uuid(mdev, UI_BITMAP); } drbd_md_mark_dirty(mdev); } @@ -3527,6 +3722,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) { int rv = -EIO; + drbd_resume_al(mdev); if (get_ldev_if_state(mdev, D_ATTACHING)) { drbd_bm_clear_all(mdev); rv = drbd_bm_write(mdev); @@ -3559,6 +3755,32 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + D_ASSERT(mdev->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so in the after_state_ch work + * it will be safe to free them. */ + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + /* We need to wait for return of references checked out while we still + * have been D_FAILED, though (drbd_md_sync, bitmap io). */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + clear_bit(GO_DISKLESS, &mdev->flags); + return 1; +} + +void drbd_go_diskless(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->state.disk == D_FAILED); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->go_diskless); + /* don't drbd_queue_work_front, + * we need to serialize with the after_state_ch work + * of the -> D_FAILED transition. */ +} + /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @mdev: DRBD device. @@ -3655,8 +3877,11 @@ static void md_sync_timer_fn(unsigned long data) static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) { dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); +#ifdef DEBUG + dev_warn(DEV, "last md_mark_dirty: %s:%u\n", + mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); +#endif drbd_md_sync(mdev); - return 1; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 73131c5ae339..87925e97e613 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -33,10 +33,13 @@ #include <linux/blkpg.h> #include <linux/cpumask.h> #include "drbd_int.h" +#include "drbd_req.h" #include "drbd_wrappers.h" #include <asm/unaligned.h> #include <linux/drbd_tag_magic.h> #include <linux/drbd_limits.h> +#include <linux/compiler.h> +#include <linux/kthread.h> static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); @@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) put_net_conf(mdev); } + /* The helper may take some time. + * write out any unsynced meta data changes now */ + drbd_md_sync(mdev); + dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); drbd_bcast_ev_helper(mdev, cmd); @@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) put_ldev(mdev); } else { dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); - return mdev->state.pdsk; + nps = mdev->state.pdsk; + goto out; } - if (fp == FP_STONITH) - _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); - r = drbd_khelper(mdev, "fence-peer"); switch ((r>>8) & 0xff) { @@ -252,9 +257,36 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) dev_info(DEV, "fence-peer helper returned %d (%s)\n", (r>>8) & 0xff, ex_to_string); + +out: + if (mdev->state.susp_fen && nps >= D_UNKNOWN) { + /* The handler was not successful... unfreeze here, the + state engine can not unfreeze... */ + _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); + } + return nps; } +static int _try_outdate_peer_async(void *data) +{ + struct drbd_conf *mdev = (struct drbd_conf *)data; + enum drbd_disk_state nps; + + nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev, NS(pdsk, nps)); + + return 0; +} + +void drbd_try_outdate_peer_async(struct drbd_conf *mdev) +{ + struct task_struct *opa; + + opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); + if (IS_ERR(opa)) + dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); +} int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) { @@ -394,6 +426,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) return r; } +static struct drbd_conf *ensure_mdev(int minor, int create) +{ + struct drbd_conf *mdev; + + if (minor >= minor_count) + return NULL; + + mdev = minor_to_mdev(minor); + + if (!mdev && create) { + struct gendisk *disk = NULL; + mdev = drbd_new_device(minor); + + spin_lock_irq(&drbd_pp_lock); + if (minor_table[minor] == NULL) { + minor_table[minor] = mdev; + disk = mdev->vdisk; + mdev = NULL; + } /* else: we lost the race */ + spin_unlock_irq(&drbd_pp_lock); + + if (disk) /* we won the race above */ + /* in case we ever add a drbd_delete_device(), + * don't forget the del_gendisk! */ + add_disk(disk); + else /* we lost the race above */ + drbd_free_mdev(mdev); + + mdev = minor_to_mdev(minor); + } + + return mdev; +} static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) @@ -494,6 +559,8 @@ char *ppsize(char *buf, unsigned long long size) void drbd_suspend_io(struct drbd_conf *mdev) { set_bit(SUSPEND_IO, &mdev->flags); + if (is_susp(mdev->state)) + return; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); } @@ -713,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu blk_queue_segment_boundary(q, PAGE_SIZE-1); blk_stack_limits(&q->limits, &b->limits, 0); - if (b->merge_bvec_fn) - dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", - b->merge_bvec_fn); dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { @@ -729,14 +793,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu /* serialize deconfig (worker exiting, doing cleanup) * and reconfig (drbdsetup disk, drbdsetup net) * - * wait for a potentially exiting worker, then restart it, - * or start a new one. + * Wait for a potentially exiting worker, then restart it, + * or start a new one. Flush any pending work, there may still be an + * after_state_change queued. */ static void drbd_reconfig_start(struct drbd_conf *mdev) { wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); drbd_thread_start(&mdev->worker); + drbd_flush_workqueue(mdev); } /* if still unconfigured, stops worker again. @@ -756,6 +822,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev) wake_up(&mdev->state_wait); } +/* Make sure IO is suspended before calling this function(). */ +static void drbd_suspend_al(struct drbd_conf *mdev) +{ + int s = 0; + + if (lc_try_lock(mdev->act_log)) { + drbd_al_shrink(mdev); + lc_unlock(mdev->act_log); + } else { + dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); + return; + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); + + spin_unlock_irq(&mdev->req_lock); + + if (s) + dev_info(DEV, "Suspended AL updates\n"); +} + /* does always return 0; * interesting return code is in reply->ret_code */ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -769,6 +858,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp struct inode *inode, *inode2; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; + unsigned int max_seg_s; int rv; int cp_discovered = 0; int logical_block_size; @@ -803,6 +893,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp goto fail; } + if (get_net_conf(mdev)) { + int prot = mdev->net_conf->wire_protocol; + put_net_conf(mdev); + if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); if (IS_ERR(nbc->lo_file)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, @@ -924,7 +1023,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_suspend_io(mdev); /* also wait for the last barrier ack. */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); /* and for any other previously queued work */ drbd_flush_workqueue(mdev); @@ -1021,7 +1120,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp else clear_bit(CRASHED_PRIMARY, &mdev->flags); - if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && + !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { set_bit(CRASHED_PRIMARY, &mdev->flags); cp_discovered = 1; } @@ -1031,7 +1131,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->read_cnt = 0; mdev->writ_cnt = 0; - drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + max_seg_s = DRBD_MAX_SEGMENT_SIZE; + if (mdev->state.conn == C_CONNECTED) { + /* We are Primary, Connected, and now attach a new local + * backing store. We must not increase the user visible maximum + * bio size on this device to something the peer may not be + * able to handle. */ + if (mdev->agreed_pro_version < 94) + max_seg_s = queue_max_segment_size(mdev->rq_queue); + else if (mdev->agreed_pro_version == 94) + max_seg_s = DRBD_MAX_SIZE_H80_PACKET; + /* else: drbd 8.3.9 and later, stay with default */ + } + + drbd_setup_queue_param(mdev, max_seg_s); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, @@ -1079,6 +1192,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_al_to_on_disk_bm(mdev); } + if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) + drbd_suspend_al(mdev); /* IO is still suspended here... */ + spin_lock_irq(&mdev->req_lock); os = mdev->state; ns.i = os.i; @@ -1235,7 +1351,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, && (new_conf->wire_protocol != DRBD_PROT_C)) { retcode = ERR_NOT_PROTO_C; goto fail; - }; + } + + if (get_ldev(mdev)) { + enum drbd_fencing_p fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { retcode = ERR_DISCARD; @@ -1350,6 +1475,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } } + drbd_flush_workqueue(mdev); spin_lock_irq(&mdev->req_lock); if (mdev->net_conf != NULL) { retcode = ERR_NET_CONFIGURED; @@ -1388,10 +1514,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, mdev->int_dig_out=int_dig_out; mdev->int_dig_in=int_dig_in; mdev->int_dig_vv=int_dig_vv; + retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); - retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); - kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); reply->ret_code = retcode; drbd_reconfig_done(mdev); @@ -1546,6 +1671,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n struct crypto_hash *csums_tfm = NULL; struct syncer_conf sc; cpumask_var_t new_cpu_mask; + int *rs_plan_s = NULL; + int fifo_size; if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { retcode = ERR_NOMEM; @@ -1557,6 +1684,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.rate = DRBD_RATE_DEF; sc.after = DRBD_AFTER_DEF; sc.al_extents = DRBD_AL_EXTENTS_DEF; + sc.on_no_data = DRBD_ON_NO_DATA_DEF; + sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; + sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; + sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; + sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_min_rate = DRBD_C_MIN_RATE_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); @@ -1634,6 +1767,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n } #undef AL_MAX + /* to avoid spurious errors when configuring minors before configuring + * the minors they depend on: if necessary, first create the minor we + * depend on */ + if (sc.after >= 0) + ensure_mdev(sc.after, 1); + /* most sanity checks done, try to assign the new sync-after * dependency. need to hold the global lock in there, * to avoid a race in the dependency loop check. */ @@ -1641,6 +1780,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n if (retcode != NO_ERROR) goto fail; + fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail; + } + } + /* ok, assign the rest of it as well. * lock against receive_SyncParam() */ spin_lock(&mdev->peer_seq_lock); @@ -1657,6 +1806,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n mdev->verify_tfm = verify_tfm; verify_tfm = NULL; } + + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + rs_plan_s = NULL; + } + spin_unlock(&mdev->peer_seq_lock); if (get_ldev(mdev)) { @@ -1688,6 +1846,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); fail: + kfree(rs_plan_s); free_cpumask_var(new_cpu_mask); crypto_free_hash(csums_tfm); crypto_free_hash(verify_tfm); @@ -1721,12 +1880,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl return 0; } +static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) +{ + int rv; + + rv = drbd_bmio_set_n_write(mdev); + drbd_suspend_al(mdev); + return rv; +} + static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + int retcode; - reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); + + if (retcode < SS_SUCCESS) { + if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ + retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + /* open coded drbd_bitmap_io() */ + if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer")) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + } + reply->ret_code = retcode; return 0; } @@ -1765,7 +1950,21 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { - reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_md_sync(mdev); + } + drbd_suspend_io(mdev); + reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (reply->ret_code == SS_SUCCESS) { + if (mdev->state.conn < C_CONNECTED) + tl_clear(mdev); + if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) + tl_restart(mdev, fail_frozen_disk_io); + } + drbd_resume_io(mdev); + return 0; } @@ -1941,40 +2140,6 @@ out: return 0; } -static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) -{ - struct drbd_conf *mdev; - - if (nlp->drbd_minor >= minor_count) - return NULL; - - mdev = minor_to_mdev(nlp->drbd_minor); - - if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { - struct gendisk *disk = NULL; - mdev = drbd_new_device(nlp->drbd_minor); - - spin_lock_irq(&drbd_pp_lock); - if (minor_table[nlp->drbd_minor] == NULL) { - minor_table[nlp->drbd_minor] = mdev; - disk = mdev->vdisk; - mdev = NULL; - } /* else: we lost the race */ - spin_unlock_irq(&drbd_pp_lock); - - if (disk) /* we won the race above */ - /* in case we ever add a drbd_delete_device(), - * don't forget the del_gendisk! */ - add_disk(disk); - else /* we lost the race above */ - drbd_free_mdev(mdev); - - mdev = minor_to_mdev(nlp->drbd_minor); - } - - return mdev; -} - struct cn_handler_struct { int (*function)(struct drbd_conf *, struct drbd_nl_cfg_req *, @@ -2035,7 +2200,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms goto fail; } - mdev = ensure_mdev(nlp); + mdev = ensure_mdev(nlp->drbd_minor, + (nlp->flags & DRBD_NL_CREATE_DEVICE)); if (!mdev) { retcode = ERR_MINOR_INVALID; goto fail; diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index be3374b68460..ad325c5d0ce1 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) unsigned long db, dt, dbdt, rt, rs_left; unsigned int res; int i, x, y; + int stalled = 0; drbd_get_syncer_progress(mdev, &rs_left, &res); @@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) * db: blocks written from mark until now * rt: remaining time */ - dt = (jiffies - mdev->rs_mark_time) / HZ; - - if (dt > 20) { - /* if we made no update to rs_mark_time for too long, - * we are stalled. show that. */ - seq_printf(seq, "stalled\n"); - return; - } + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - mdev->rs_mark_time[i]) / HZ; + if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + stalled = 1; if (!dt) dt++; - db = mdev->rs_mark_left - rs_left; + db = mdev->rs_mark_left[i] - rs_left; rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ seq_printf(seq, "finish: %lu:%02lu:%02lu", @@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) /* mean speed since syncer started * we do account for PausedSync periods */ dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; - if (dt <= 0) + if (dt == 0) dt = 1; db = mdev->rs_total - rs_left; dbdt = Bit2KB(db/dt); @@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) else seq_printf(seq, " (%ld)", dbdt); - seq_printf(seq, " K/sec\n"); + if (mdev->state.conn == C_SYNC_TARGET) { + if (mdev->c_sync_rate > 1000) + seq_printf(seq, " want: %d,%03d", + mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); + else + seq_printf(seq, " want: %d", mdev->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); } static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) @@ -196,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { seq_printf(seq, - "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", i, sn, @@ -206,11 +213,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v) drbd_disk_str(mdev->state.pdsk), (mdev->net_conf == NULL ? ' ' : (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), - mdev->state.susp ? 's' : 'r', + is_susp(mdev->state) ? 's' : 'r', mdev->state.aftr_isp ? 'a' : '-', mdev->state.peer_isp ? 'p' : '-', mdev->state.user_isp ? 'u' : '-', mdev->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-', mdev->send_cnt/2, mdev->recv_cnt/2, mdev->writ_cnt/2, diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 081522d3c742..efd6169acf2f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + drbd_free_net_ee(mdev, e); } /** @@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool * Is also used from inside an other spin_lock_irq(&mdev->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) { + atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { @@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) drbd_pp_vacant += i; spin_unlock(&drbd_pp_lock); } - atomic_sub(i, &mdev->pp_in_use); - i = atomic_read(&mdev->pp_in_use); + i = atomic_sub_return(i, a); if (i < 0) - dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); + dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? "pp_in_use_by_net" : "pp_in_use", i); wake_up(&drbd_pp_wait); } @@ -365,7 +367,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, e->size = data_size; e->flags = 0; e->sector = sector; - e->sector = sector; e->block_id = id; return e; @@ -375,9 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } -void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) { - drbd_pp_free(mdev, e->pages); + if (e->flags & EE_HAS_DIGEST) + kfree(e->digest); + drbd_pp_free(mdev, e->pages, is_net); D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(hlist_unhashed(&e->colision)); mempool_free(e, drbd_ee_mempool); @@ -388,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) LIST_HEAD(work_list); struct drbd_epoch_entry *e, *t; int count = 0; + int is_net = list == &mdev->net_ee; spin_lock_irq(&mdev->req_lock); list_splice_init(list, &work_list); spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &work_list, w.list) { - drbd_free_ee(mdev, e); + drbd_free_some_ee(mdev, e, is_net); count++; } return count; @@ -423,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + drbd_free_net_ee(mdev, e); /* possible callbacks here: * e_end_block, and e_end_resync_block, e_send_discard_ack. @@ -719,14 +723,14 @@ out: static int drbd_send_fp(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd) { - struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + struct p_header80 *h = &mdev->data.sbuf.header.h80; return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); } static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) { - struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + struct p_header80 *h = &mdev->data.rbuf.header.h80; int rr; rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); @@ -776,9 +780,6 @@ static int drbd_connect(struct drbd_conf *mdev) D_ASSERT(!mdev->data.socket); - if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) - dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); - if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) return -2; @@ -927,6 +928,11 @@ retry: drbd_thread_start(&mdev->asender); + if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) { + drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET); + put_ldev(mdev); + } + if (!drbd_send_protocol(mdev)) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); @@ -946,22 +952,28 @@ out_release_sockets: return -1; } -static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) +static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) { + union p_header *h = &mdev->data.rbuf.header; int r; r = drbd_recv(mdev, h, sizeof(*h)); - if (unlikely(r != sizeof(*h))) { dev_err(DEV, "short read expecting header on sock: r=%d\n", r); return FALSE; - }; - h->command = be16_to_cpu(h->command); - h->length = be16_to_cpu(h->length); - if (unlikely(h->magic != BE_DRBD_MAGIC)) { - dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + } + + if (likely(h->h80.magic == BE_DRBD_MAGIC)) { + *cmd = be16_to_cpu(h->h80.command); + *packet_size = be16_to_cpu(h->h80.length); + } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { + *cmd = be16_to_cpu(h->h95.command); + *packet_size = be32_to_cpu(h->h95.length); + } else { + dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->h80.magic), + be16_to_cpu(h->h80.command), + be16_to_cpu(h->h80.length)); return FALSE; } mdev->last_received = jiffies; @@ -975,7 +987,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, - NULL, BLKDEV_IFL_WAIT); + NULL); if (rv) { dev_err(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. @@ -1268,17 +1280,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea return 1; } -static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) +static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { int rv, issue_flush; - struct p_barrier *p = (struct p_barrier *)h; + struct p_barrier *p = &mdev->data.rbuf.barrier; struct drbd_epoch *epoch; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - - rv = drbd_recv(mdev, h->payload, h->length); - ERR_IF(rv != h->length) return FALSE; - inc_unacked(mdev); if (mdev->net_conf->wire_protocol != DRBD_PROT_C) @@ -1457,7 +1464,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) data_size -= rr; } kunmap(page); - drbd_pp_free(mdev, page); + drbd_pp_free(mdev, page, 0); return rv; } @@ -1562,30 +1569,29 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); + atomic_add(data_size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); + drbd_free_ee(mdev, e); fail: put_ldev(mdev); return FALSE; } -static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) +static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct drbd_request *req; sector_t sector; - unsigned int header_size, data_size; int ok; - struct p_data *p = (struct p_data *)h; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); @@ -1611,20 +1617,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) return ok; } -static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) +static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { sector_t sector; - unsigned int header_size, data_size; int ok; - struct p_data *p = (struct p_data *)h; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); D_ASSERT(p->block_id == ID_SYNCER); @@ -1640,9 +1637,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) ok = drbd_drain_block(mdev, data_size); - drbd_send_ack_dp(mdev, P_NEG_ACK, p); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); } + atomic_add(data_size >> 9, &mdev->rs_sect_in); + return ok; } @@ -1765,24 +1764,27 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) return ret; } +static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf) +{ + if (mdev->agreed_pro_version >= 95) + return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_FUA : 0) | + (dpf & DP_DISCARD ? REQ_DISCARD : 0); + else + return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0; +} + /* mirrored write */ -static int receive_Data(struct drbd_conf *mdev, struct p_header *h) +static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { sector_t sector; struct drbd_epoch_entry *e; - struct p_data *p = (struct p_data *)h; - int header_size, data_size; + struct p_data *p = &mdev->data.rbuf.data; int rw = WRITE; u32 dp_flags; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; - if (!get_ldev(mdev)) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not write mirrored data block " @@ -1792,7 +1794,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) mdev->peer_seq++; spin_unlock(&mdev->peer_seq_lock); - drbd_send_ack_dp(mdev, P_NEG_ACK, p); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); atomic_inc(&mdev->current_epoch->epoch_size); return drbd_drain_block(mdev, data_size); } @@ -1839,12 +1841,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) spin_unlock(&mdev->epoch_lock); dp_flags = be32_to_cpu(p->dp_flags); - if (dp_flags & DP_HARDBARRIER) { - dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); - /* rw |= REQ_HARDBARRIER; */ - } - if (dp_flags & DP_RW_SYNC) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= write_flags_to_bio(mdev, dp_flags); + if (dp_flags & DP_MAY_SET_IN_SYNC) e->flags |= EE_MAY_SET_IN_SYNC; @@ -2007,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + if (e->flags & EE_CALL_AL_COMPLETE_IO) + drbd_al_complete_io(mdev, e->sector); + out_interrupted: /* yes, the epoch_size now is imbalanced. * but we drop the connection anyways, so we don't have a chance to @@ -2016,20 +2024,64 @@ out_interrupted: return FALSE; } -static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +int drbd_rs_should_slow_down(struct drbd_conf *mdev) +{ + struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + int curr_events; + int throttle = 0; + + /* feature disabled? */ + if (mdev->sync_conf.c_min_rate == 0) + return 0; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&mdev->rs_sect_ev); + if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) { + unsigned long rs_left; + int i; + + mdev->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. */ + i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS; + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + + dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = mdev->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > mdev->sync_conf.c_min_rate) + throttle = 1; + } + return throttle; +} + + +static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) { sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct digest_info *di = NULL; - int size, digest_size; + int size, verb; unsigned int fault_type; - struct p_block_req *p = - (struct p_block_req *)h; - const int brps = sizeof(*p)-sizeof(*h); - - if (drbd_recv(mdev, h->payload, brps) != brps) - return FALSE; + struct p_block_req *p = &mdev->data.rbuf.block_req; sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); @@ -2046,12 +2098,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { - if (__ratelimit(&drbd_ratelimit_state)) + verb = 1; + switch (cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(mdev); + drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(cmd)); + } + if (verb && __ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not satisfy peer's read request, " "no local data.\n"); - drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : - P_NEG_RS_DREPLY , p); - return drbd_drain_block(mdev, h->length - brps); + + /* drain possibly payload */ + return drbd_drain_block(mdev, digest_size); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD @@ -2063,31 +2134,21 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - switch (h->command) { + switch (cmd) { case P_DATA_REQUEST: e->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; - break; + /* application IO, don't drbd_rs_begin_io */ + goto submit; + case P_RS_DATA_REQUEST: e->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronously. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } break; case P_OV_REPLY: case P_CSUM_RS_REQUEST: fault_type = DRBD_FAULT_RS_RD; - digest_size = h->length - brps ; di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); if (!di) goto out_free_e; @@ -2095,31 +2156,25 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) di->digest_size = digest_size; di->digest = (((char *)di)+sizeof(struct digest_info)); + e->digest = di; + e->flags |= EE_HAS_DIGEST; + if (drbd_recv(mdev, di->digest, digest_size) != digest_size) goto out_free_e; - e->block_id = (u64)(unsigned long)di; - if (h->command == P_CSUM_RS_REQUEST) { + if (cmd == P_CSUM_RS_REQUEST) { D_ASSERT(mdev->agreed_pro_version >= 89); e->w.cb = w_e_end_csum_rs_req; - } else if (h->command == P_OV_REPLY) { + } else if (cmd == P_OV_REPLY) { e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); - break; - } - - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; } break; case P_OV_REQUEST: - if (mdev->state.conn >= C_CONNECTED && - mdev->state.conn != C_VERIFY_T) - dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", - drbd_conn_str(mdev->state.conn)); if (mdev->ov_start_sector == ~(sector_t)0 && mdev->agreed_pro_version >= 90) { mdev->ov_start_sector = sector; @@ -2130,37 +2185,63 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } e->w.cb = w_e_end_ov_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronous. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } break; - default: dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", - cmdname(h->command)); + cmdname(cmd)); fault_type = DRBD_FAULT_MAX; + goto out_free_e; } - spin_lock_irq(&mdev->req_lock); - list_add(&e->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->req_lock); + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. + */ + if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev)) + msleep(100); + if (drbd_rs_begin_io(mdev, e->sector)) + goto out_free_e; +submit_for_resync: + atomic_add(size >> 9, &mdev->rs_sect_ev); + +submit: inc_unacked(mdev); + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ + out_free_e: - kfree(di); put_ldev(mdev); drbd_free_ee(mdev, e); return FALSE; @@ -2699,20 +2780,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) return 1; } -static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) +static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_protocol *p = (struct p_protocol *)h; - int header_size, data_size; + struct p_protocol *p = &mdev->data.rbuf.protocol; int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; int p_want_lose, p_two_primaries, cf; char p_integrity_alg[SHARED_SECRET_MAX] = ""; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; - p_proto = be32_to_cpu(p->protocol); p_after_sb_0p = be32_to_cpu(p->after_sb_0p); p_after_sb_1p = be32_to_cpu(p->after_sb_1p); @@ -2805,39 +2879,46 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, return tfm; } -static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) +static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) { int ok = TRUE; - struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; + struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; unsigned int header_size, data_size, exp_max_sz; struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; const int apv = mdev->agreed_pro_version; + int *rs_plan_s = NULL; + int fifo_size = 0; exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) + SHARED_SECRET_MAX - : /* 89 */ sizeof(struct p_rs_param_89); + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); - if (h->length > exp_max_sz) { + if (packet_size > exp_max_sz) { dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", - h->length, exp_max_sz); + packet_size, exp_max_sz); return FALSE; } if (apv <= 88) { - header_size = sizeof(struct p_rs_param) - sizeof(*h); - data_size = h->length - header_size; - } else /* apv >= 89 */ { - header_size = sizeof(struct p_rs_param_89) - sizeof(*h); - data_size = h->length - header_size; + header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); + data_size = packet_size - header_size; + } else if (apv <= 94) { + header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); + data_size = packet_size - header_size; + D_ASSERT(data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); + data_size = packet_size - header_size; D_ASSERT(data_size == 0); } /* initialize verify_alg and csums_alg */ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); - if (drbd_recv(mdev, h->payload, header_size) != header_size) + if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) return FALSE; mdev->sync_conf.rate = be32_to_cpu(p->rate); @@ -2896,6 +2977,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) } } + if (apv > 94) { + mdev->sync_conf.rate = be32_to_cpu(p->rate); + mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); + mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); + mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + goto disconnect; + } + } + } spin_lock(&mdev->peer_seq_lock); /* lock against drbd_nl_syncer_conf() */ @@ -2913,6 +3010,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) mdev->csums_tfm = csums_tfm; dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); } + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + } spin_unlock(&mdev->peer_seq_lock); } @@ -2946,19 +3049,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, (unsigned long long)a, (unsigned long long)b); } -static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) +static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_sizes *p = (struct p_sizes *)h; + struct p_sizes *p = &mdev->data.rbuf.sizes; enum determine_dev_size dd = unchanged; unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); @@ -2972,7 +3071,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) * we still need to figure out whether we accept that. */ mdev->p_size = p_size; -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) if (get_ldev(mdev)) { warn_if_differ_considerably(mdev, "lower level device sizes", p_size, drbd_get_max_capacity(mdev->ldev)); @@ -3029,6 +3127,8 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) if (mdev->agreed_pro_version < 94) max_seg_s = be32_to_cpu(p->max_segment_size); + else if (mdev->agreed_pro_version == 94) + max_seg_s = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ max_seg_s = DRBD_MAX_SEGMENT_SIZE; @@ -3062,16 +3162,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) +static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_uuids *p = (struct p_uuids *)h; + struct p_uuids *p = &mdev->data.rbuf.uuids; u64 *p_uuid; int i; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) @@ -3107,6 +3203,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) drbd_md_sync(mdev); } put_ldev(mdev); + } else if (mdev->state.disk < D_INCONSISTENT && + mdev->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); } /* Before we test for the disk state, we should wait until an eventually @@ -3150,16 +3251,12 @@ static union drbd_state convert_state(union drbd_state ps) return ms; } -static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) +static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_req_state *p = (struct p_req_state *)h; + struct p_req_state *p = &mdev->data.rbuf.req_state; union drbd_state mask, val; int rv; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - mask.i = be32_to_cpu(p->mask); val.i = be32_to_cpu(p->val); @@ -3180,20 +3277,14 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_state(struct drbd_conf *mdev, struct p_header *h) +static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_state *p = (struct p_state *)h; - enum drbd_conns nconn, oconn; - union drbd_state ns, peer_state; + struct p_state *p = &mdev->data.rbuf.state; + union drbd_state os, ns, peer_state; enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; int rv; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) - return FALSE; - - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - peer_state.i = be32_to_cpu(p->state); real_peer_disk = peer_state.disk; @@ -3204,38 +3295,72 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) spin_lock_irq(&mdev->req_lock); retry: - oconn = nconn = mdev->state.conn; + os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - if (nconn == C_WF_REPORT_PARAMS) - nconn = C_CONNECTED; + /* peer says his disk is uptodate, while we think it is inconsistent, + * and this happens while we think we have a sync going on. */ + if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? */ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) + drbd_resync_finished(mdev); + return TRUE; + } + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. */ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(mdev, D_NEGOTIATING)) { int cr; /* consider resync */ /* if we established a new connection */ - cr = (oconn < C_CONNECTED); + cr = (os.conn < C_CONNECTED); /* if we had an established connection * and one of the nodes newly attaches a disk */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.disk == D_NEGOTIATING || - mdev->state.disk == D_NEGOTIATING)); + os.disk == D_NEGOTIATING)); /* if we have both been inconsistent, and the peer has been * forced to be UpToDate with --overwrite-data */ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); /* if we had been plain connected, and the admin requested to * start a sync by "invalidate" or "invalidate-remote" */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.conn >= C_STARTING_SYNC_S && peer_state.conn <= C_WF_BITMAP_T)); if (cr) - nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); + ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); put_ldev(mdev); - if (nconn == C_MASK) { - nconn = C_CONNECTED; + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; if (mdev->state.disk == D_NEGOTIATING) { drbd_force_state(mdev, NS(disk, D_DISKLESS)); } else if (peer_state.disk == D_NEGOTIATING) { @@ -3245,7 +3370,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) } else { if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) return FALSE; - D_ASSERT(oconn == C_WF_REPORT_PARAMS); + D_ASSERT(os.conn == C_WF_REPORT_PARAMS); drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); return FALSE; } @@ -3253,18 +3378,28 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) } spin_lock_irq(&mdev->req_lock); - if (mdev->state.conn != oconn) + if (mdev->state.i != os.i) goto retry; clear_bit(CONSIDER_RESYNC, &mdev->flags); - ns.i = mdev->state.i; - ns.conn = nconn; ns.peer = peer_state.role; ns.pdsk = real_peer_disk; ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); - if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; - - rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &mdev->flags)) { + /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this + for temporal network outages! */ + spin_unlock_irq(&mdev->req_lock); + dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(mdev); + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); + return FALSE; + } + rv = _drbd_set_state(mdev, ns, cs_flags, NULL); ns = mdev->state; spin_unlock_irq(&mdev->req_lock); @@ -3273,8 +3408,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - if (oconn > C_WF_REPORT_PARAMS) { - if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && peer_state.disk != D_NEGOTIATING ) { /* we want resync, peer has not yet decided to sync... */ /* Nowadays only used when forcing a node into primary role and @@ -3291,9 +3426,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) +static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_rs_uuid *p = (struct p_rs_uuid *)h; + struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; wait_event(mdev->misc_wait, mdev->state.conn == C_WF_SYNC_UUID || @@ -3302,10 +3437,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - /* Here the _drbd_uuid_ functions are right, current should _not_ be rotated into the history */ if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -3324,14 +3455,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) enum receive_bitmap_ret { OK, DONE, FAILED }; static enum receive_bitmap_ret -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, - unsigned long *buffer, struct bm_xfer_ctx *c) +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, + unsigned long *buffer, struct bm_xfer_ctx *c) { unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); unsigned want = num_words * sizeof(long); - if (want != h->length) { - dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); + if (want != data_size) { + dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); return FAILED; } if (want == 0) @@ -3360,7 +3491,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev, u64 tmp; unsigned long s = c->bit_offset; unsigned long e; - int len = p->head.length - (sizeof(*p) - sizeof(p->head)); + int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); int toggle = DCBP_get_start(p); int have; int bits; @@ -3429,7 +3560,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, const char *direction, struct bm_xfer_ctx *c) { /* what would it take to transfer it "plaintext" */ - unsigned plain = sizeof(struct p_header) * + unsigned plain = sizeof(struct p_header80) * ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) + c->bm_words * sizeof(long); unsigned total = c->bytes[0] + c->bytes[1]; @@ -3467,12 +3598,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, in order to be agnostic to the 32 vs 64 bits issue. returns 0 on failure, 1 if we successfully received it. */ -static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) +static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct bm_xfer_ctx c; void *buffer; enum receive_bitmap_ret ret; int ok = FALSE; + struct p_header80 *h = &mdev->data.rbuf.header.h80; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); @@ -3492,39 +3624,39 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) }; do { - if (h->command == P_BITMAP) { - ret = receive_bitmap_plain(mdev, h, buffer, &c); - } else if (h->command == P_COMPRESSED_BITMAP) { + if (cmd == P_BITMAP) { + ret = receive_bitmap_plain(mdev, data_size, buffer, &c); + } else if (cmd == P_COMPRESSED_BITMAP) { /* MAYBE: sanity check that we speak proto >= 90, * and the feature is enabled! */ struct p_compressed_bm *p; - if (h->length > BM_PACKET_PAYLOAD_BYTES) { + if (data_size > BM_PACKET_PAYLOAD_BYTES) { dev_err(DEV, "ReportCBitmap packet too large\n"); goto out; } /* use the page buff */ p = buffer; memcpy(p, h, sizeof(*h)); - if (drbd_recv(mdev, p->head.payload, h->length) != h->length) + if (drbd_recv(mdev, p->head.payload, data_size) != data_size) goto out; - if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { - dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); + if (data_size <= (sizeof(*p) - sizeof(p->head))) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); return FAILED; } ret = decode_bitmap_c(mdev, p, &c); } else { - dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); + dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); goto out; } - c.packets[h->command == P_BITMAP]++; - c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; + c.packets[cmd == P_BITMAP]++; + c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; if (ret != OK) break; - if (!drbd_recv_header(mdev, h)) + if (!drbd_recv_header(mdev, &cmd, &data_size)) goto out; } while (ret == OK); if (ret == FAILED) @@ -3555,17 +3687,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) return ok; } -static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) +static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { /* TODO zero copy sink :) */ static char sink[128]; int size, want, r; - if (!silent) - dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", - h->command, h->length); + dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", + cmd, data_size); - size = h->length; + size = data_size; while (size > 0) { want = min_t(int, size, sizeof(sink)); r = drbd_recv(mdev, sink, want); @@ -3575,17 +3706,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) return size == 0; } -static int receive_skip(struct drbd_conf *mdev, struct p_header *h) -{ - return receive_skip_(mdev, h, 0); -} - -static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h) -{ - return receive_skip_(mdev, h, 1); -} - -static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) +static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { if (mdev->state.disk >= D_INCONSISTENT) drbd_kick_lo(mdev); @@ -3597,108 +3718,94 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); - -static drbd_cmd_handler_f drbd_default_handler[] = { - [P_DATA] = receive_Data, - [P_DATA_REPLY] = receive_DataReply, - [P_RS_DATA_REPLY] = receive_RSDataReply, - [P_BARRIER] = receive_Barrier, - [P_BITMAP] = receive_bitmap, - [P_COMPRESSED_BITMAP] = receive_bitmap, - [P_UNPLUG_REMOTE] = receive_UnplugRemote, - [P_DATA_REQUEST] = receive_DataRequest, - [P_RS_DATA_REQUEST] = receive_DataRequest, - [P_SYNC_PARAM] = receive_SyncParam, - [P_SYNC_PARAM89] = receive_SyncParam, - [P_PROTOCOL] = receive_protocol, - [P_UUIDS] = receive_uuids, - [P_SIZES] = receive_sizes, - [P_STATE] = receive_state, - [P_STATE_CHG_REQ] = receive_req_state, - [P_SYNC_UUID] = receive_sync_uuid, - [P_OV_REQUEST] = receive_DataRequest, - [P_OV_REPLY] = receive_DataRequest, - [P_CSUM_RS_REQUEST] = receive_DataRequest, - [P_DELAY_PROBE] = receive_skip_silent, +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); + +struct data_cmd { + int expect_payload; + size_t pkt_size; + drbd_cmd_handler_f function; +}; + +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, /* anything missing from this table is in * the asender_tbl, see get_asender_cmd */ - [P_MAX_CMD] = NULL, + [P_MAX_CMD] = { 0, 0, NULL }, }; -static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; -static drbd_cmd_handler_f *drbd_opt_cmd_handler; +/* All handler functions that expect a sub-header get that sub-heder in + mdev->data.rbuf.header.head.payload. + + Usually in mdev->data.rbuf.header.head the callback can find the usual + p_header, but they may not rely on that. Since there is also p_header95 ! + */ static void drbdd(struct drbd_conf *mdev) { - drbd_cmd_handler_f handler; - struct p_header *header = &mdev->data.rbuf.header; + union p_header *header = &mdev->data.rbuf.header; + unsigned int packet_size; + enum drbd_packets cmd; + size_t shs; /* sub header size */ + int rv; while (get_t_state(&mdev->receiver) == Running) { drbd_thread_current_set_cpu(mdev); - if (!drbd_recv_header(mdev, header)) { - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; - } + if (!drbd_recv_header(mdev, &cmd, &packet_size)) + goto err_out; - if (header->command < P_MAX_CMD) - handler = drbd_cmd_handler[header->command]; - else if (P_MAY_IGNORE < header->command - && header->command < P_MAX_OPT_CMD) - handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; - else if (header->command > P_MAX_OPT_CMD) - handler = receive_skip; - else - handler = NULL; + if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { + dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); + goto err_out; + } - if (unlikely(!handler)) { - dev_err(DEV, "unknown packet type %d, l: %d!\n", - header->command, header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); + rv = drbd_recv(mdev, &header->h80.payload, shs); + if (unlikely(rv != shs)) { + dev_err(DEV, "short read while reading sub header: rv=%d\n", rv); + goto err_out; } - if (unlikely(!handler(mdev, header))) { - dev_err(DEV, "error receiving %s, l: %d!\n", - cmdname(header->command), header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + + if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { + dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); + goto err_out; } - } -} -static void drbd_fail_pending_reads(struct drbd_conf *mdev) -{ - struct hlist_head *slot; - struct hlist_node *pos; - struct hlist_node *tmp; - struct drbd_request *req; - int i; + rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); - /* - * Application READ requests - */ - spin_lock_irq(&mdev->req_lock); - for (i = 0; i < APP_R_HSIZE; i++) { - slot = mdev->app_reads_hash+i; - hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { - /* it may (but should not any longer!) - * be on the work queue; if that assert triggers, - * we need to also grab the - * spin_lock_irq(&mdev->data.work.q_lock); - * and list_del_init here. */ - D_ASSERT(list_empty(&req->w.list)); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(req, connection_lost_while_pending); + if (unlikely(!rv)) { + dev_err(DEV, "error receiving %s, l: %d!\n", + cmdname(cmd), packet_size); + goto err_out; } } - for (i = 0; i < APP_R_HSIZE; i++) - if (!hlist_empty(mdev->app_reads_hash+i)) - dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " - "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); - memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); + if (0) { + err_out: + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + } + /* If we leave here, we probably want to update at least the + * "Connected" indicator on stable storage. Do so explicitly here. */ + drbd_md_sync(mdev); } void drbd_flush_workqueue(struct drbd_conf *mdev) @@ -3711,6 +3818,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev) wait_for_completion(&barr.done); } +void drbd_free_tl_hash(struct drbd_conf *mdev) +{ + struct hlist_head *h; + + spin_lock_irq(&mdev->req_lock); + + if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { + spin_unlock_irq(&mdev->req_lock); + return; + } + /* paranoia code */ + for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", + (int)(h - mdev->ee_hash), h->first); + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + + /* paranoia code */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", + (int)(h - mdev->tl_hash), h->first); + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + spin_unlock_irq(&mdev->req_lock); +} + static void drbd_disconnect(struct drbd_conf *mdev) { enum drbd_fencing_p fp; @@ -3728,6 +3865,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); + /* wait for current activity to cease. */ spin_lock_irq(&mdev->req_lock); _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); @@ -3752,7 +3890,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); - set_bit(STOP_SYNC_TIMER, &mdev->flags); resync_timer_fn((unsigned long)mdev); /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, @@ -3767,11 +3904,9 @@ static void drbd_disconnect(struct drbd_conf *mdev) kfree(mdev->p_uuid); mdev->p_uuid = NULL; - if (!mdev->state.susp) + if (!is_susp(mdev->state)) tl_clear(mdev); - drbd_fail_pending_reads(mdev); - dev_info(DEV, "Connection closed\n"); drbd_md_sync(mdev); @@ -3782,12 +3917,8 @@ static void drbd_disconnect(struct drbd_conf *mdev) put_ldev(mdev); } - if (mdev->state.role == R_PRIMARY) { - if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { - enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); - drbd_request_state(mdev, NS(pdsk, nps)); - } - } + if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) + drbd_try_outdate_peer_async(mdev); spin_lock_irq(&mdev->req_lock); os = mdev->state; @@ -3800,32 +3931,14 @@ static void drbd_disconnect(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); if (os.conn == C_DISCONNECTING) { - struct hlist_head *h; - wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); + wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); - /* we must not free the tl_hash - * while application io is still on the fly */ - wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); - - spin_lock_irq(&mdev->req_lock); - /* paranoia code */ - for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", - (int)(h - mdev->ee_hash), h->first); - kfree(mdev->ee_hash); - mdev->ee_hash = NULL; - mdev->ee_hash_s = 0; - - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); - kfree(mdev->tl_hash); - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; - spin_unlock_irq(&mdev->req_lock); + if (!is_susp(mdev->state)) { + /* we must not free the tl_hash + * while application io is still on the fly */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + drbd_free_tl_hash(mdev); + } crypto_free_hash(mdev->cram_hmac_tfm); mdev->cram_hmac_tfm = NULL; @@ -3845,6 +3958,9 @@ static void drbd_disconnect(struct drbd_conf *mdev) i = drbd_release_ee(mdev, &mdev->net_ee); if (i) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&mdev->pp_in_use_by_net); + if (i) + dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i); i = atomic_read(&mdev->pp_in_use); if (i) dev_info(DEV, "pp_in_use = %d, expected 0\n", i); @@ -3888,7 +4004,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev) p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, - (struct p_header *)p, sizeof(*p), 0 ); + (struct p_header80 *)p, sizeof(*p), 0 ); mutex_unlock(&mdev->data.mutex); return ok; } @@ -3904,27 +4020,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev) { /* ASSERT current == mdev->receiver ... */ struct p_handshake *p = &mdev->data.rbuf.handshake; - const int expect = sizeof(struct p_handshake) - -sizeof(struct p_header); + const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); + unsigned int length; + enum drbd_packets cmd; int rv; rv = drbd_send_handshake(mdev); if (!rv) return 0; - rv = drbd_recv_header(mdev, &p->head); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) return 0; - if (p->head.command != P_HAND_SHAKE) { + if (cmd != P_HAND_SHAKE) { dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", - cmdname(p->head.command), p->head.command); + cmdname(cmd), cmd); return -1; } - if (p->head.length != expect) { + if (length != expect) { dev_err(DEV, "expected HandShake length: %u, received: %u\n", - expect, p->head.length); + expect, length); return -1; } @@ -3982,10 +4099,11 @@ static int drbd_do_auth(struct drbd_conf *mdev) char *response = NULL; char *right_response = NULL; char *peers_ch = NULL; - struct p_header p; unsigned int key_len = strlen(mdev->net_conf->shared_secret); unsigned int resp_size; struct hash_desc desc; + enum drbd_packets cmd; + unsigned int length; int rv; desc.tfm = mdev->cram_hmac_tfm; @@ -4005,33 +4123,33 @@ static int drbd_do_auth(struct drbd_conf *mdev) if (!rv) goto fail; - rv = drbd_recv_header(mdev, &p); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) goto fail; - if (p.command != P_AUTH_CHALLENGE) { + if (cmd != P_AUTH_CHALLENGE) { dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + cmdname(cmd), cmd); rv = 0; goto fail; } - if (p.length > CHALLENGE_LEN*2) { + if (length > CHALLENGE_LEN * 2) { dev_err(DEV, "expected AuthChallenge payload too big.\n"); rv = -1; goto fail; } - peers_ch = kmalloc(p.length, GFP_NOIO); + peers_ch = kmalloc(length, GFP_NOIO); if (peers_ch == NULL) { dev_err(DEV, "kmalloc of peers_ch failed\n"); rv = -1; goto fail; } - rv = drbd_recv(mdev, peers_ch, p.length); + rv = drbd_recv(mdev, peers_ch, length); - if (rv != p.length) { + if (rv != length) { dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); rv = 0; goto fail; @@ -4046,7 +4164,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) } sg_init_table(&sg, 1); - sg_set_buf(&sg, peers_ch, p.length); + sg_set_buf(&sg, peers_ch, length); rv = crypto_hash_digest(&desc, &sg, sg.length, response); if (rv) { @@ -4059,18 +4177,18 @@ static int drbd_do_auth(struct drbd_conf *mdev) if (!rv) goto fail; - rv = drbd_recv_header(mdev, &p); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) goto fail; - if (p.command != P_AUTH_RESPONSE) { + if (cmd != P_AUTH_RESPONSE) { dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + cmdname(cmd), cmd); rv = 0; goto fail; } - if (p.length != resp_size) { + if (length != resp_size) { dev_err(DEV, "expected AuthResponse payload of wrong size\n"); rv = 0; goto fail; @@ -4155,7 +4273,7 @@ int drbdd_init(struct drbd_thread *thi) /* ********* acknowledge sender ******** */ -static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) +static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) { struct p_req_state_reply *p = (struct p_req_state_reply *)h; @@ -4173,13 +4291,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_Ping(struct drbd_conf *mdev, struct p_header *h) +static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) { return drbd_send_ping_ack(mdev); } -static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) +static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) { /* restore idle timeout */ mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; @@ -4189,7 +4307,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) +static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4199,11 +4317,15 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - drbd_rs_complete_io(mdev, sector); - drbd_set_in_sync(mdev, sector, blksize); - /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ - mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, sector); + drbd_set_in_sync(mdev, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(mdev); + } dec_rs_pending(mdev); + atomic_add(blksize >> 9, &mdev->rs_sect_in); return TRUE; } @@ -4259,7 +4381,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev, return TRUE; } -static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) +static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4299,7 +4421,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) _ack_id_to_req, __func__ , what); } -static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) +static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4319,7 +4441,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) _ack_id_to_req, __func__ , neg_acked); } -static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) +static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4332,7 +4454,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) _ar_id_to_req, __func__ , neg_acked); } -static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) { sector_t sector; int size; @@ -4354,7 +4476,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_barrier_ack *p = (struct p_barrier_ack *)h; @@ -4363,7 +4485,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) +static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; struct drbd_work *w; @@ -4380,6 +4502,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) else ov_oos_print(mdev); + if (!get_ldev(mdev)) + return TRUE; + drbd_rs_complete_io(mdev, sector); dec_rs_pending(mdev); @@ -4394,18 +4519,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) drbd_resync_finished(mdev); } } + put_ldev(mdev); return TRUE; } -static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h) +static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) { - /* IGNORE */ return TRUE; } struct asender_cmd { size_t pkt_size; - int (*process)(struct drbd_conf *mdev, struct p_header *h); + int (*process)(struct drbd_conf *mdev, struct p_header80 *h); }; static struct asender_cmd *get_asender_cmd(int cmd) @@ -4414,8 +4539,8 @@ static struct asender_cmd *get_asender_cmd(int cmd) /* anything missing from this table is in * the drbd_cmd_handler (drbd_default_handler) table, * see the beginning of drbdd() */ - [P_PING] = { sizeof(struct p_header), got_Ping }, - [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, + [P_PING] = { sizeof(struct p_header80), got_Ping }, + [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -4427,7 +4552,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, - [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, [P_MAX_CMD] = { 0, NULL }, }; if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) @@ -4438,13 +4563,13 @@ static struct asender_cmd *get_asender_cmd(int cmd) int drbd_asender(struct drbd_thread *thi) { struct drbd_conf *mdev = thi->mdev; - struct p_header *h = &mdev->meta.rbuf.header; + struct p_header80 *h = &mdev->meta.rbuf.header.h80; struct asender_cmd *cmd = NULL; int rv, len; void *buf = h; int received = 0; - int expect = sizeof(struct p_header); + int expect = sizeof(struct p_header80); int empty; sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); @@ -4468,10 +4593,8 @@ int drbd_asender(struct drbd_thread *thi) while (1) { clear_bit(SIGNAL_ASENDER, &mdev->flags); flush_signals(current); - if (!drbd_process_done_ee(mdev)) { - dev_err(DEV, "process_done_ee() = NOT_OK\n"); + if (!drbd_process_done_ee(mdev)) goto reconnect; - } /* to avoid race with newly queued ACKs */ set_bit(SIGNAL_ASENDER, &mdev->flags); spin_lock_irq(&mdev->req_lock); @@ -4530,21 +4653,23 @@ int drbd_asender(struct drbd_thread *thi) if (received == expect && cmd == NULL) { if (unlikely(h->magic != BE_DRBD_MAGIC)) { - dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto reconnect; } cmd = get_asender_cmd(be16_to_cpu(h->command)); len = be16_to_cpu(h->length); if (unlikely(cmd == NULL)) { - dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto disconnect; } expect = cmd->pkt_size; - ERR_IF(len != expect-sizeof(struct p_header)) + ERR_IF(len != expect-sizeof(struct p_header80)) goto reconnect; } if (received == expect) { @@ -4554,7 +4679,7 @@ int drbd_asender(struct drbd_thread *thi) buf = h; received = 0; - expect = sizeof(struct p_header); + expect = sizeof(struct p_header80); cmd = NULL; } } @@ -4562,10 +4687,12 @@ int drbd_asender(struct drbd_thread *thi) if (0) { reconnect: drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + drbd_md_sync(mdev); } if (0) { disconnect: drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + drbd_md_sync(mdev); } clear_bit(SIGNAL_ASENDER, &mdev->flags); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f761d98a4e90..9e91a2545fc8 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) { const unsigned long s = req->rq_state; + + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialized in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* if it was a write, we may have to set the corresponding * bit(s) out-of-sync first. If it had a local part, we need to * release the reference to the activity log. */ if (rw == WRITE) { - /* remove it from the transfer log. - * well, only if it had been there in the first - * place... if it had not (local only or conflicting - * and never sent), it should still be "empty" as - * initialized in drbd_req_new(), so we can list_del() it - * here unconditionally */ - list_del(&req->tl_requests); /* Set out-of-sync unless both OK flags are set * (local only or remote failed). * Other places where we set out-of-sync: @@ -92,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const */ if (s & RQ_LOCAL_MASK) { if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_al_complete_io(mdev, req->sector); + if (s & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, req->sector); put_ldev(mdev); } else if (__ratelimit(&drbd_ratelimit_state)) { dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " @@ -280,6 +283,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) * protocol A or B, barrier ack still pending... */ } +static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) +{ + struct drbd_conf *mdev = req->mdev; + + if (!is_susp(mdev->state)) + _req_may_be_done(req, m); +} + /* * checks whether there was an overlapping request * or ee already registered. @@ -380,10 +391,11 @@ out_conflict: * and it enforces that we have to think in a very structured manner * about the "events" that may happen to a request during its life time ... */ -void __req_mod(struct drbd_request *req, enum drbd_req_event what, +int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m) { struct drbd_conf *mdev = req->mdev; + int rv = 0; m->bio = NULL; switch (what) { @@ -420,7 +432,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -429,7 +441,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; __drbd_chk_io_error(mdev, FALSE); - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -437,7 +449,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, /* it is legal to fail READA */ req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -455,7 +467,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, /* no point in retrying if there is no good remote data, * or we have no connection. */ if (mdev->state.pdsk != D_UP_TO_DATE) { - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; } @@ -517,11 +529,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); req->epoch = mdev->newest_tle->br_number; - list_add_tail(&req->tl_requests, - &mdev->newest_tle->requests); /* increment size of current epoch */ - mdev->newest_tle->n_req++; + mdev->newest_tle->n_writes++; /* queue work item to send data */ D_ASSERT(req->rq_state & RQ_NET_PENDING); @@ -530,7 +540,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); /* close the epoch, in case it outgrew the limit */ - if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) + if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) queue_barrier(mdev); break; @@ -543,7 +553,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_NET_QUEUED; /* if we did it right, tl_clear should be scheduled only after * this, so this should not be necessary! */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case handed_over_to_network: @@ -568,7 +578,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, * "completed_ok" events came in, once we return from * _drbd_send_zc_bio (drbd_send_dblock), we have to check * whether it is done already, and end it. */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case read_retry_remote_canceled: @@ -584,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, /* if it is still queued, we may not complete it here. * it will be canceled soon. */ if (!(req->rq_state & RQ_NET_QUEUED)) - _req_may_be_done(req, m); + _req_may_be_done(req, m); /* Allowed while state.susp */ break; case write_acked_by_peer_and_sis: @@ -619,7 +629,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(req->rq_state & RQ_NET_PENDING); dec_ap_pending(mdev); req->rq_state &= ~RQ_NET_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case neg_acked: @@ -629,11 +639,50 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); /* else: done by handed_over_to_network */ break; + case fail_frozen_disk_io: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + _req_may_be_done(req, m); /* Allowed while state.susp */ + break; + + case restart_frozen_disk_io: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + req->rq_state &= ~RQ_LOCAL_COMPLETED; + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(mdev); + req->w.cb = w_restart_disk_io; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + case resend: + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss (B&C only); only P_BARRIER_ACK was missing. + Trowing them out of the TL here by pretending we got a BARRIER_ACK + We ensure that the peer was not rebooted */ + if (!(req->rq_state & RQ_NET_OK)) { + if (req->w.cb) { + drbd_queue_work(&mdev->data.work, &req->w); + rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; + } + break; + } + /* else, fall through to barrier_acked */ + case barrier_acked: + if (!(req->rq_state & RQ_WRITE)) + break; + if (req->rq_state & RQ_NET_PENDING) { /* barrier came in before all requests have been acked. * this is bad, because if the connection is lost now, @@ -643,7 +692,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, } D_ASSERT(req->rq_state & RQ_NET_SENT); req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); + _req_may_be_done(req, m); /* Allowed while state.susp */ break; case data_received: @@ -651,9 +700,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, dec_ap_pending(mdev); req->rq_state &= ~RQ_NET_PENDING; req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; }; + + return rv; } /* we may do a local read if: @@ -752,14 +803,16 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because * of transactional on-disk meta data updates. */ - if (rw == WRITE && local) + if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + req->rq_state |= RQ_IN_ACT_LOG; drbd_al_begin_io(mdev, sector); + } remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || (mdev->state.pdsk == D_INCONSISTENT && mdev->state.conn >= C_CONNECTED)); - if (!(local || remote) && !mdev->state.susp) { + if (!(local || remote) && !is_susp(mdev->state)) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } @@ -785,7 +838,7 @@ allocate_barrier: /* GOOD, everything prepared, grab the spin_lock */ spin_lock_irq(&mdev->req_lock); - if (mdev->state.susp) { + if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of generic_make_request() to restart processing of this bio. In the next call to drbd_make_request_26 @@ -867,30 +920,10 @@ allocate_barrier: /* check this request on the collision detection hash tables. * if we have a conflict, just complete it here. * THINK do we want to check reads, too? (I don't think so...) */ - if (rw == WRITE && _req_conflicts(req)) { - /* this is a conflicting request. - * even though it may have been only _partially_ - * overlapping with one of the currently pending requests, - * without even submitting or sending it, we will - * pretend that it was successfully served right now. - */ - if (local) { - bio_put(req->private_bio); - req->private_bio = NULL; - drbd_al_complete_io(mdev, req->sector); - put_ldev(mdev); - local = 0; - } - if (remote) - dec_ap_pending(mdev); - _drbd_end_io_acct(mdev, req); - /* THINK: do we want to fail it (-EIO), or pretend success? */ - bio_endio(req->master_bio, 0); - req->master_bio = NULL; - dec_ap_bio(mdev); - drbd_req_free(req); - remote = 0; - } + if (rw == WRITE && _req_conflicts(req)) + goto fail_conflicting; + + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ @@ -923,6 +956,21 @@ allocate_barrier: return 0; +fail_conflicting: + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + _drbd_end_io_acct(mdev, req); + spin_unlock_irq(&mdev->req_lock); + if (remote) + dec_ap_pending(mdev); + /* THINK: do we want to fail it (-EIO), or pretend success? + * this pretends success. */ + err = 0; + fail_free_complete: if (rw == WRITE && local) drbd_al_complete_io(mdev, sector); @@ -961,21 +1009,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) return 1; } - /* - * Paranoia: we might have been primary, but sync target, or - * even diskless, then lost the connection. - * This should have been handled (panic? suspend?) somewhere - * else. But maybe it was not, so check again here. - * Caution: as long as we do not have a read/write lock on mdev, - * to serialize state changes, this is racy, since we may lose - * the connection *after* we test for the cstate. - */ - if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { - if (__ratelimit(&drbd_ratelimit_state)) - dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); - return 1; - } - return 0; } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 02d575d24518..181ea0364822 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -104,6 +104,9 @@ enum drbd_req_event { read_ahead_completed_with_error, write_completed_with_error, completed_ok, + resend, + fail_frozen_disk_io, + restart_frozen_disk_io, nothing, /* for tracing only */ }; @@ -183,6 +186,12 @@ enum drbd_req_state_bits { /* keep this last, its for the RQ_NET_MASK */ __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... */ + __RQ_IN_ACT_LOG, }; #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) @@ -201,6 +210,16 @@ enum drbd_req_state_bits { /* 0x1f8 */ #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) +#define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) + +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE_SHIFT 0 +#define MR_WRITE (1 << MR_WRITE_SHIFT) +#define MR_READ_SHIFT 1 +#define MR_READ (1 << MR_READ_SHIFT) + /* epoch entries */ static inline struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) @@ -244,30 +263,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, return NULL; } +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_endio_pri; + bio->bi_next = NULL; +} + static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, struct bio *bio_src) { - struct bio *bio; struct drbd_request *req = mempool_alloc(drbd_request_mempool, GFP_NOIO); if (likely(req)) { - bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + drbd_req_make_private_bio(req, bio_src); - req->rq_state = 0; + req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; req->mdev = mdev; req->master_bio = bio_src; - req->private_bio = bio; req->epoch = 0; - req->sector = bio->bi_sector; - req->size = bio->bi_size; + req->sector = bio_src->bi_sector; + req->size = bio_src->bi_size; req->start_time = jiffies; INIT_HLIST_NODE(&req->colision); INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->w.list); - - bio->bi_private = req; - bio->bi_end_io = drbd_endio_pri; - bio->bi_next = NULL; } return req; } @@ -292,36 +317,43 @@ struct bio_and_error { extern void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m); -extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ -static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) { struct drbd_conf *mdev = req->mdev; struct bio_and_error m; + int rv; /* __req_mod possibly frees req, do not touch req after that! */ - __req_mod(req, what, &m); + rv = __req_mod(req, what, &m); if (m.bio) complete_master_bio(mdev, &m); + + return rv; } /* completion of master bio is outside of spinlock. * If you need it irqsave, do it your self! */ -static inline void req_mod(struct drbd_request *req, +static inline int req_mod(struct drbd_request *req, enum drbd_req_event what) { struct drbd_conf *mdev = req->mdev; struct bio_and_error m; + int rv; + spin_lock_irq(&mdev->req_lock); - __req_mod(req, what, &m); + rv = __req_mod(req, what, &m); spin_unlock_irq(&mdev->req_lock); if (m.bio) complete_master_bio(mdev, &m); + + return rv; } #endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ca4a16cea2d8..108d58015cd1 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -39,8 +39,6 @@ #include "drbd_int.h" #include "drbd_req.h" -#define SLEEP_TIME (HZ/10) - static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); @@ -217,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error) */ void drbd_endio_pri(struct bio *bio, int error) { - unsigned long flags; struct drbd_request *req = bio->bi_private; struct drbd_conf *mdev = req->mdev; - struct bio_and_error m; enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); @@ -246,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error) bio_put(req->private_bio); req->private_bio = ERR_PTR(error); - spin_lock_irqsave(&mdev->req_lock, flags); - __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (m.bio) - complete_master_bio(mdev, &m); + req_mod(req, what); } int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) @@ -376,54 +367,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) struct drbd_epoch_entry *e; if (!get_ldev(mdev)) - return 0; + return -EIO; + + if (drbd_rs_should_slow_down(mdev)) + goto defer; /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); if (!e) - goto fail; + goto defer; + e->w.cb = w_e_send_csum; spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); - e->w.cb = w_e_send_csum; + atomic_add(size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) - return 1; + return 0; + + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); drbd_free_ee(mdev, e); -fail: +defer: put_ldev(mdev); - return 2; + return -EAGAIN; } void resync_timer_fn(unsigned long data) { - unsigned long flags; struct drbd_conf *mdev = (struct drbd_conf *) data; int queue; - spin_lock_irqsave(&mdev->req_lock, flags); - - if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { - queue = 1; - if (mdev->state.conn == C_VERIFY_S) - mdev->resync_work.cb = w_make_ov_request; - else - mdev->resync_work.cb = w_make_resync_request; - } else { + queue = 1; + switch (mdev->state.conn) { + case C_VERIFY_S: + mdev->resync_work.cb = w_make_ov_request; + break; + case C_SYNC_TARGET: + mdev->resync_work.cb = w_make_resync_request; + break; + default: queue = 0; mdev->resync_work.cb = w_resync_inactive; } - spin_unlock_irqrestore(&mdev->req_lock, flags); - /* harmless race: list_empty outside data.work.q_lock */ if (list_empty(&mdev->resync_work.list) && queue) drbd_queue_work(&mdev->data.work, &mdev->resync_work); } +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] = value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +int drbd_rs_controller(struct drbd_conf *mdev) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + unsigned int want; /* The number of sectors we want in the proxy */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in the proxy*/ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan ahead */ + int curr_corr; + int max_sect; + + sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ + mdev->rs_in_flight -= sect_in; + + spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ + + steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target : + sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - mdev->rs_in_flight - mdev->rs_planed; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(&mdev->rs_plan_s, cps); + mdev->rs_planed += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); + mdev->rs_planed -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, mdev->rs_in_flight, want, correction, + steps, cps, mdev->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + int w_make_resync_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { @@ -431,8 +513,9 @@ int w_make_resync_request(struct drbd_conf *mdev, sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); int max_segment_size; - int number, i, size, pe, mx; + int number, rollback_i, size, pe, mx; int align, queued, sndbuf; + int i = 0; if (unlikely(cancel)) return 1; @@ -446,6 +529,12 @@ int w_make_resync_request(struct drbd_conf *mdev, dev_err(DEV, "%s in w_make_resync_request\n", drbd_conn_str(mdev->state.conn)); + if (mdev->rs_total == 0) { + /* empty resync? */ + drbd_resync_finished(mdev); + return 1; + } + if (!get_ldev(mdev)) { /* Since we only need to access mdev->rsync a get_ldev_if_state(mdev,D_FAILED) would be sufficient, but @@ -458,11 +547,25 @@ int w_make_resync_request(struct drbd_conf *mdev, /* starting with drbd 8.3.8, we can handle multi-bio EEs, * if it should be necessary */ - max_segment_size = mdev->agreed_pro_version < 94 ? - queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; + max_segment_size = + mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) : + mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE; - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ); - pe = atomic_read(&mdev->rs_pending_cnt); + if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ + number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); + mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + mdev->c_sync_rate = mdev->sync_conf.rate; + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } + + /* Throttle resync on lower level disk activity, which may also be + * caused by application IO on Primary/SyncTarget. + * Keep this after the call to drbd_rs_controller, as that assumes + * to be called as precisely as possible every SLEEP_TIME, + * and would be confused otherwise. */ + if (drbd_rs_should_slow_down(mdev)) + goto requeue; mutex_lock(&mdev->data.mutex); if (mdev->data.socket) @@ -476,6 +579,7 @@ int w_make_resync_request(struct drbd_conf *mdev, mx = number; /* Limit the number of pending RS requests to no more than the peer's receive buffer */ + pe = atomic_read(&mdev->rs_pending_cnt); if ((pe + number) > mx) { number = mx - pe; } @@ -526,6 +630,7 @@ next_sector: * be prepared for all stripe sizes of software RAIDs. */ align = 1; + rollback_i = i; for (;;) { if (size + BM_BLOCK_SIZE > max_segment_size) break; @@ -561,14 +666,19 @@ next_sector: size = (capacity-sector)<<9; if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { switch (read_for_csum(mdev, sector, size)) { - case 0: /* Disk failure*/ + case -EIO: /* Disk failure */ put_ldev(mdev); return 0; - case 2: /* Allocation failed */ + case -EAGAIN: /* allocation failed, or ldev busy */ drbd_rs_complete_io(mdev, sector); mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; goto requeue; - /* case 1: everything ok */ + case 0: + /* everything ok */ + break; + default: + BUG(); } } else { inc_rs_pending(mdev); @@ -595,6 +705,7 @@ next_sector: } requeue: + mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); put_ldev(mdev); return 1; @@ -670,6 +781,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca return 1; } +static void ping_peer(struct drbd_conf *mdev) +{ + clear_bit(GOT_PING_ACK, &mdev->flags); + request_ping(mdev); + wait_event(mdev->misc_wait, + test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); +} + int drbd_resync_finished(struct drbd_conf *mdev) { unsigned long db, dt, dbdt; @@ -709,6 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) if (!get_ldev(mdev)) goto out; + ping_peer(mdev); + spin_lock_irq(&mdev->req_lock); os = mdev->state; @@ -801,6 +922,8 @@ out: mdev->rs_paused = 0; mdev->ov_start_sector = 0; + drbd_md_sync(mdev); + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); @@ -817,9 +940,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent { if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ + int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; + atomic_add(i, &mdev->pp_in_use_by_net); + atomic_sub(i, &mdev->pp_in_use); spin_lock_irq(&mdev->req_lock); list_add_tail(&e->w.list, &mdev->net_ee); spin_unlock_irq(&mdev->req_lock); + wake_up(&drbd_pp_wait); } else drbd_free_ee(mdev, e); } @@ -926,9 +1053,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; } - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } - di = (struct digest_info *)(unsigned long)e->block_id; + di = e->digest; if (likely((e->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. @@ -952,7 +1082,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); } else { inc_rs_pending(mdev); - e->block_id = ID_SYNCER; + e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ + kfree(di); ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); } } else { @@ -962,9 +1094,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } dec_unacked(mdev); - - kfree(di); - move_to_net_ee_or_free(mdev, e); if (unlikely(!ok)) @@ -1034,9 +1163,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all * the resync lru has been cleaned up already */ - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } - di = (struct digest_info *)(unsigned long)e->block_id; + di = e->digest; if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->verify_tfm); @@ -1055,9 +1187,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } dec_unacked(mdev); - - kfree(di); - if (!eq) drbd_ov_oos_found(mdev, e->sector, e->size); else @@ -1108,7 +1237,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * dec_ap_pending will be done in got_BarrierAck * or (on connection loss) in w_clear_epoch. */ ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, - (struct p_header *)p, sizeof(*p), 0); + (struct p_header80 *)p, sizeof(*p), 0); drbd_put_data_sock(mdev); return ok; @@ -1173,6 +1302,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return ok; } +int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) + drbd_al_begin_io(mdev, req->sector); + /* Calling drbd_al_begin_io() out of the worker might deadlocks + theoretically. Practically it can not deadlock, since this is + only used when unfreezing IOs. All the extents of the requests + that made it into the TL are already active */ + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 1; +} + static int _drbd_may_sync_now(struct drbd_conf *mdev) { struct drbd_conf *odev = mdev; @@ -1298,14 +1445,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na) return retcode; } -static void ping_peer(struct drbd_conf *mdev) -{ - clear_bit(GOT_PING_ACK, &mdev->flags); - request_ping(mdev); - wait_event(mdev->misc_wait, - test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); -} - /** * drbd_start_resync() - Start the resync process * @mdev: DRBD device. @@ -1379,13 +1518,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) r = SS_UNKNOWN_ERROR; if (r == SS_SUCCESS) { - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + unsigned long tw = drbd_bm_total_weight(mdev); + unsigned long now = jiffies; + int i; + mdev->rs_failed = 0; mdev->rs_paused = 0; - mdev->rs_start = - mdev->rs_mark_time = jiffies; mdev->rs_same_csum = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + mdev->rs_total = tw; + mdev->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = tw; + mdev->rs_mark_time[i] = now; + } _drbd_pause_after(mdev); } write_unlock_irq(&global_state_lock); @@ -1397,12 +1544,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), (unsigned long) mdev->rs_total); - if (mdev->rs_total == 0) { - /* Peer still reachable? Beware of failing before-resync-target handlers! */ - ping_peer(mdev); + if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) + schedule_timeout_interruptible( + mdev->net_conf->ping_int * HZ + + mdev->net_conf->ping_timeo*HZ/9); drbd_resync_finished(mdev); } + atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); + mdev->rs_in_flight = 0; + mdev->rs_planed = 0; + spin_lock(&mdev->peer_seq_lock); + fifo_set(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); /* ns.conn may already be != mdev->state.conn, * we may have been paused in between, or become paused until * the timer triggers. diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cf04c1b234ed..aa42e7766c6a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -258,8 +258,8 @@ static int irqdma_allocated; #include <linux/completion.h> static struct request *current_req; -static struct request_queue *floppy_queue; static void do_fd_request(struct request_queue *q); +static int set_next_request(void); #ifndef fd_get_dma_residue #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) @@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE]; static struct block_device *opened_bdev[N_DRIVE]; static DEFINE_MUTEX(open_lock); static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; +static int fdc_queue; /* * This struct defines the different floppy types. @@ -890,8 +891,8 @@ static void unlock_fdc(void) del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || blk_peek_request(floppy_queue)) - do_fd_request(floppy_queue); + if (current_req || set_next_request()) + do_fd_request(current_req->q); spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error) * logical buffer */ static void request_done(int uptodate) { - struct request_queue *q = floppy_queue; struct request *req = current_req; + struct request_queue *q; unsigned long flags; int block; char msg[sizeof("request done ") + sizeof(int) * 3]; @@ -2258,6 +2259,8 @@ static void request_done(int uptodate) return; } + q = req->q; + if (uptodate) { /* maintain values for invalidation on geometry * change */ @@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void) return 2; } +/* + * Round-robin between our available drives, doing one request from each + */ +static int set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + + do { + q = disks[fdc_queue]->queue; + if (++fdc_queue == N_DRIVE) + fdc_queue = 0; + if (q) { + current_req = blk_fetch_request(q); + if (current_req) + break; + } + } while (fdc_queue != old_pos); + + return current_req != NULL; +} + static void redo_fd_request(void) { int drive; @@ -2822,17 +2847,17 @@ static void redo_fd_request(void) do_request: if (!current_req) { - struct request *req; + int pending; + + spin_lock_irq(&floppy_lock); + pending = set_next_request(); + spin_unlock_irq(&floppy_lock); - spin_lock_irq(floppy_queue->queue_lock); - req = blk_fetch_request(floppy_queue); - spin_unlock_irq(floppy_queue->queue_lock); - if (!req) { + if (!pending) { do_floppy = NULL; unlock_fdc(); return; } - current_req = req; } drive = (long)current_req->rq_disk->private_data; set_fdc(drive); @@ -4165,6 +4190,13 @@ static int __init floppy_init(void) goto out_put_disk; } + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); + if (!disks[dr]->queue) { + err = -ENOMEM; + goto out_put_disk; + } + + blk_queue_max_hw_sectors(disks[dr]->queue, 64); disks[dr]->major = FLOPPY_MAJOR; disks[dr]->first_minor = TOMINOR(dr); disks[dr]->fops = &floppy_fops; @@ -4183,13 +4215,6 @@ static int __init floppy_init(void) if (err) goto out_unreg_blkdev; - floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); - if (!floppy_queue) { - err = -ENOMEM; - goto out_unreg_driver; - } - blk_queue_max_hw_sectors(floppy_queue, 64); - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, floppy_find, NULL, NULL); @@ -4317,7 +4342,6 @@ static int __init floppy_init(void) /* to be cleaned up... */ disks[drive]->private_data = (void *)(long)drive; - disks[drive]->queue = floppy_queue; disks[drive]->flags |= GENHD_FL_REMOVABLE; disks[drive]->driverfs_dev = &floppy_device[drive].dev; add_disk(disks[drive]); @@ -4333,8 +4357,6 @@ out_flush_work: floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); - blk_cleanup_queue(floppy_queue); -out_unreg_driver: platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); @@ -4342,6 +4364,8 @@ out_put_disk: while (dr--) { del_timer(&motor_off_timer[dr]); put_disk(disks[dr]); + if (disks[dr]->queue) + blk_cleanup_queue(disks[dr]->queue); } return err; } @@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void) platform_device_unregister(&floppy_device[drive]); } put_disk(disks[drive]); + blk_cleanup_queue(disks[drive]->queue); } del_timer_sync(&fd_timeout); del_timer_sync(&fd_timer); - blk_cleanup_queue(floppy_queue); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 91797bbbe702..5a3985925c14 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/splice.h> +#include <linux/sysfs.h> #include <asm/uaccess.h> @@ -477,17 +478,17 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; if (bio_rw(bio) == WRITE) { - bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER); struct file *file = lo->lo_backing_file; - if (barrier) { - if (unlikely(!file->f_op->fsync)) { - ret = -EOPNOTSUPP; - goto out; - } + /* REQ_HARDBARRIER is deprecated */ + if (bio->bi_rw & REQ_HARDBARRIER) { + ret = -EOPNOTSUPP; + goto out; + } + if (bio->bi_rw & REQ_FLUSH) { ret = vfs_fsync(file, 0); - if (unlikely(ret)) { + if (unlikely(ret && ret != -EINVAL)) { ret = -EIO; goto out; } @@ -495,9 +496,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) ret = lo_send(lo, bio, pos); - if (barrier && !ret) { + if ((bio->bi_rw & REQ_FUA) && !ret) { ret = vfs_fsync(file, 0); - if (unlikely(ret)) + if (unlikely(ret && ret != -EINVAL)) ret = -EIO; } } else @@ -737,6 +738,103 @@ static inline int is_loop_device(struct file *file) return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; } +/* loop sysfs attributes */ + +static ssize_t loop_attr_show(struct device *dev, char *page, + ssize_t (*callback)(struct loop_device *, char *)) +{ + struct loop_device *l, *lo = NULL; + + mutex_lock(&loop_devices_mutex); + list_for_each_entry(l, &loop_devices, lo_list) + if (disk_to_dev(l->lo_disk) == dev) { + lo = l; + break; + } + mutex_unlock(&loop_devices_mutex); + + return lo ? callback(lo, page) : -EIO; +} + +#define LOOP_ATTR_RO(_name) \ +static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ +static ssize_t loop_attr_do_show_##_name(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + return loop_attr_show(d, b, loop_attr_##_name##_show); \ +} \ +static struct device_attribute loop_attr_##_name = \ + __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL); + +static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) +{ + ssize_t ret; + char *p = NULL; + + mutex_lock(&lo->lo_ctl_mutex); + if (lo->lo_backing_file) + p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + mutex_unlock(&lo->lo_ctl_mutex); + + if (IS_ERR_OR_NULL(p)) + ret = PTR_ERR(p); + else { + ret = strlen(p); + memmove(buf, p, ret); + buf[ret++] = '\n'; + buf[ret] = 0; + } + + return ret; +} + +static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); +} + +static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); +} + +static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) +{ + int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); + + return sprintf(buf, "%s\n", autoclear ? "1" : "0"); +} + +LOOP_ATTR_RO(backing_file); +LOOP_ATTR_RO(offset); +LOOP_ATTR_RO(sizelimit); +LOOP_ATTR_RO(autoclear); + +static struct attribute *loop_attrs[] = { + &loop_attr_backing_file.attr, + &loop_attr_offset.attr, + &loop_attr_sizelimit.attr, + &loop_attr_autoclear.attr, + NULL, +}; + +static struct attribute_group loop_attribute_group = { + .name = "loop", + .attrs= loop_attrs, +}; + +static int loop_sysfs_init(struct loop_device *lo) +{ + return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + +static void loop_sysfs_exit(struct loop_device *lo) +{ + sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -832,10 +930,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->lo_queue->unplug_fn = loop_unplug; if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN); + blk_queue_flush(lo->lo_queue, REQ_FLUSH); set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); + loop_sysfs_init(lo); /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); @@ -854,6 +953,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, return 0; out_clr: + loop_sysfs_exit(lo); lo->lo_thread = NULL; lo->lo_device = NULL; lo->lo_backing_file = NULL; @@ -950,6 +1050,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) set_capacity(lo->lo_disk, 0); if (bdev) { bd_set_size(bdev, 0); + loop_sysfs_exit(lo); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 2284b4f05c62..87311ebac0db 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q) break; /* filter out block requests we don't understand */ - if (rq->cmd_type != REQ_TYPE_FS && - !(rq->cmd_flags & REQ_HARDBARRIER)) { + if (rq->cmd_type != REQ_TYPE_FS) { blk_end_request_all(rq, 0); continue; } @@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); blk_queue_prep_rq(q, blk_queue_start_tag); - blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); + blk_queue_flush(q, REQ_FLUSH); disk->queue = q; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 37a2bb595076..1b5cfcccd654 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq->timeout = 60*HZ; rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->cmd_flags |= REQ_HARDBARRIER; if (cgc->quiet) rq->cmd_flags |= REQ_QUIET; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 03688c2da319..8e1ce2e2916a 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_dma_alignment(queue, dev->blk_size-1); blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH); + blk_queue_flush(queue, REQ_FLUSH); blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8320490226b7..6ecf89cdf006 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -127,9 +127,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - if (vbr->req->cmd_flags & REQ_HARDBARRIER) - vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; - sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); /* @@ -379,31 +376,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) vblk->disk->driverfs_dev = &vdev->dev; index++; - if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) { - /* - * If the FLUSH feature is supported we do have support for - * flushing a volatile write cache on the host. Use that - * to implement write barrier support. - */ - blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); - } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) { - /* - * If the BARRIER feature is supported the host expects us - * to order request by tags. This implies there is not - * volatile write cache on the host, and that the host - * never re-orders outstanding I/O. This feature is not - * useful for real life scenarious and deprecated. - */ - blk_queue_ordered(q, QUEUE_ORDERED_TAG); - } else { - /* - * If the FLUSH feature is not supported we must assume that - * the host does not perform any kind of volatile write - * caching. We still need to drain the queue to provider - * proper barrier semantics. - */ - blk_queue_ordered(q, QUEUE_ORDERED_DRAIN); - } + /* configure queue flush support */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) + blk_queue_flush(q, REQ_FLUSH); /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) @@ -522,9 +497,9 @@ static const struct virtio_device_id id_table[] = { }; static unsigned int features[] = { - VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, - VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, - VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, + VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY }; /* diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ab735a605cf3..f2ffc46644df 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -95,7 +95,7 @@ struct blkfront_info struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; - int feature_barrier; + unsigned int feature_flush; int is_ready; }; @@ -418,26 +418,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) } -static int xlvbd_barrier(struct blkfront_info *info) +static void xlvbd_flush(struct blkfront_info *info) { - int err; - const char *barrier; - - switch (info->feature_barrier) { - case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break; - case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break; - case QUEUE_ORDERED_NONE: barrier = "disabled"; break; - default: return -EINVAL; - } - - err = blk_queue_ordered(info->rq, info->feature_barrier); - - if (err) - return err; - + blk_queue_flush(info->rq, info->feature_flush); printk(KERN_INFO "blkfront: %s: barriers %s\n", - info->gd->disk_name, barrier); - return 0; + info->gd->disk_name, + info->feature_flush ? "enabled" : "disabled"); } @@ -516,7 +502,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->rq = gd->queue; info->gd = gd; - xlvbd_barrier(info); + xlvbd_flush(info); if (vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); @@ -662,8 +648,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", info->gd->disk_name); error = -EOPNOTSUPP; - info->feature_barrier = QUEUE_ORDERED_NONE; - xlvbd_barrier(info); + info->feature_flush = 0; + xlvbd_flush(info); } /* fall through */ case BLKIF_OP_READ: @@ -1076,20 +1062,13 @@ static void blkfront_connect(struct blkfront_info *info) /* * If there's no "feature-barrier" defined, then it means * we're dealing with a very old backend which writes - * synchronously; draining will do what needs to get done. + * synchronously; nothing to do. * - * If there are barriers, then we can do full queued writes - * with tagged barriers. - * - * If barriers are not supported, then there's no much we can - * do, so just set ordering to NONE. + * If there are barriers, then we use flush. */ - if (err) - info->feature_barrier = QUEUE_ORDERED_DRAIN; - else if (barrier) - info->feature_barrier = QUEUE_ORDERED_TAG; - else - info->feature_barrier = QUEUE_ORDERED_NONE; + info->feature_flush = 0; + if (!err && barrier) + info->feature_flush = REQ_FLUSH; err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 7433e07de30e..7c5b01ce51d2 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -516,10 +516,10 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) return ide_no_data_taskfile(drive, &cmd); } -static void update_ordered(ide_drive_t *drive) +static void update_flush(ide_drive_t *drive) { u16 *id = drive->id; - unsigned ordered = QUEUE_ORDERED_NONE; + unsigned flush = 0; if (drive->dev_flags & IDE_DFLAG_WCACHE) { unsigned long long capacity; @@ -543,13 +543,12 @@ static void update_ordered(ide_drive_t *drive) drive->name, barrier ? "" : "not "); if (barrier) { - ordered = QUEUE_ORDERED_DRAIN_FLUSH; + flush = REQ_FLUSH; blk_queue_prep_rq(drive->queue, idedisk_prep_fn); } - } else - ordered = QUEUE_ORDERED_DRAIN; + } - blk_queue_ordered(drive->queue, ordered); + blk_queue_flush(drive->queue, flush); } ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); @@ -572,7 +571,7 @@ static int set_wcache(ide_drive_t *drive, int arg) } } - update_ordered(drive); + update_flush(drive); return err; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index a381be814070..999dac054bcc 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -441,19 +441,6 @@ void do_ide_request(struct request_queue *q) struct request *rq = NULL; ide_startstop_t startstop; - /* - * drive is doing pre-flush, ordered write, post-flush sequence. even - * though that is 3 requests, it must be seen as a single transaction. - * we must not preempt this drive until that is complete - */ - if (blk_queue_flushing(q)) - /* - * small race where queue could get replugged during - * the 3-request flush cycle, just yank the plug since - * we want it to finish asap - */ - blk_remove_plug(q); - spin_unlock_irq(q->queue_lock); /* HLD do_request() callback might sleep, make sure it's okay */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 368e8e98f705..d5b0e4c0e702 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1278,7 +1278,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0590c75b0ab6..136d4f71a116 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -31,7 +31,6 @@ struct dm_io_client { */ struct io { unsigned long error_bits; - unsigned long eopnotsupp_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, *---------------------------------------------------------------*/ static void dec_count(struct io *io, unsigned int region, int error) { - if (error) { + if (error) set_bit(region, &io->error_bits); - if (error == -EOPNOTSUPP) - set_bit(region, &io->eopnotsupp_bits); - } if (atomic_dec_and_test(&io->count)) { if (io->sleeper) @@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where->count; /* - * where->count may be zero if rw holds a write barrier and we - * need to send a zero-sized barrier. + * where->count may be zero if rw holds a flush and we need to + * send a zero-sized flush. */ do { /* @@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (rw & REQ_HARDBARRIER)) + if (where[i].count || (rw & REQ_FLUSH)) do_region(rw, i, where + i, dp, io); } @@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } -retry: io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = current; io->client = client; @@ -412,11 +406,6 @@ retry: } set_current_state(TASK_RUNNING); - if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) { - rw &= ~REQ_HARDBARRIER; - goto retry; - } - if (error_bits) *error_bits = io->error_bits; @@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 5a08be0222db..33420e68d153 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc) .count = 0, }; - lc->io_req.bi_rw = WRITE_BARRIER; + lc->io_req.bi_rw = WRITE_FLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 7c081bcbc3cf..19a59b041c27 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_region io[ms->nr_mirrors]; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE_BARRIER, + .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.bvec = NULL, .client = ms->io_client, @@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), .mem.type = DM_IO_BVEC, .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, .notify.fn = write_callback, @@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio_list_add(&sync, bio); continue; } @@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (likely(!bio_empty_barrier(bio))) + if (!(bio->bi_rw & REQ_FLUSH)) dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index bd5c58b28868..dad011aed0c9 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -81,9 +81,9 @@ struct dm_region_hash { struct list_head failed_recovered_regions; /* - * If there was a barrier failure no regions can be marked clean. + * If there was a flush failure no regions can be marked clean. */ - int barrier_failure; + int flush_failure; void *context; sector_t target_begin; @@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create( INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); INIT_LIST_HEAD(&rh->failed_recovered_regions); - rh->barrier_failure = 0; + rh->flush_failure = 0; rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, sizeof(struct dm_region)); @@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio_empty_barrier(bio)) { - rh->barrier_failure = 1; + if (bio->bi_rw & REQ_FLUSH) { + rh->flush_failure = 1; return; } @@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio_empty_barrier(bio)) + if (bio->bi_rw & REQ_FLUSH) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } @@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) */ /* do nothing for DM_RH_NOSYNC */ - if (unlikely(rh->barrier_failure)) { + if (unlikely(rh->flush_failure)) { /* - * If a write barrier failed some time ago, we + * If a write flush failed some time ago, we * don't know whether or not this write made it * to the disk, so we must resync the device. */ diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index cc2bdb83f9ad..0b61792a2780 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, WRITE_BARRIER)) + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) ps->valid = 0; /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5974d3094d97..53cf79d8bcbc 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -706,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) return 0; } -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) - /* * Return a minimum chunk size of all snapshots that have the specified origin. * Return zero if the origin has no snapshots. @@ -1587,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1691,7 +1689,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, int r = DM_MAPIO_REMAPPED; chunk_t chunk; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { if (!map_context->target_request_nr) bio->bi_bdev = s->origin->bdev; else @@ -2135,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, struct dm_dev *dev = ti->private; bio->bi_bdev = dev->bdev; - if (unlikely(bio_empty_barrier(bio))) + if (bio->bi_rw & REQ_FLUSH) return DM_MAPIO_REMAPPED; /* Only tell snapshots if this is a write */ diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c297f6da91ea..f0371b4c4fbf 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -271,7 +271,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, uint32_t stripe; unsigned target_request_nr; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { target_request_nr = map_context->target_request_nr; BUG_ON(target_request_nr >= sc->stripes); bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9fc07d7a4b9..90267f8d64ee 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -486,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ac384b2a6a33..f934e9878436 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_QUEUE_IO_TO_THREAD 6 /* * Work processed by per-device workqueue. @@ -144,24 +143,9 @@ struct mapped_device { spinlock_t deferred_lock; /* - * An error from the barrier request currently being processed. - */ - int barrier_error; - - /* - * Protect barrier_error from concurrent endio processing - * in request-based dm. - */ - spinlock_t barrier_error_lock; - - /* - * Processing queue (flush/barriers) + * Processing queue (flush) */ struct workqueue_struct *wq; - struct work_struct barrier_work; - - /* A pointer to the currently processing pre/post flush request */ - struct request *flush_request; /* * The current mapping. @@ -200,8 +184,8 @@ struct mapped_device { /* sysfs handle */ struct kobject kobj; - /* zero-length barrier that will be cloned and submitted to targets */ - struct bio barrier_bio; + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; }; /* @@ -512,7 +496,7 @@ static void end_io_acct(struct dm_io *io) /* * After this is decremented the bio must not be touched if it is - * a barrier. + * a flush. */ dm_disk(md)->part0.in_flight[rw] = pending = atomic_dec_return(&md->pending[rw]); @@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io) */ static void queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->io_lock); + unsigned long flags; - spin_lock_irq(&md->deferred_lock); + spin_lock_irqsave(&md->deferred_lock, flags); bio_list_add(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - - if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) - queue_work(md->wq, &md->work); - - up_write(&md->io_lock); + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); } /* @@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error) * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) { - if (!(io->bio->bi_rw & REQ_HARDBARRIER)) - bio_list_add_head(&md->deferred, - io->bio); - } else + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); + else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); @@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error) io_error = io->error; bio = io->bio; + end_io_acct(io); + free_io(md, io); + + if (io_error == DM_ENDIO_REQUEUE) + return; - if (bio->bi_rw & REQ_HARDBARRIER) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { /* - * There can be just one barrier request so we use - * a per-device variable for error reporting. - * Note that you can't touch the bio after end_io_acct - * - * We ignore -EOPNOTSUPP for empty flush reported by - * underlying devices. We assume that if the device - * doesn't support empty barriers, it doesn't need - * cache flushing commands. + * Preflush done for flush with data, reissue + * without REQ_FLUSH. */ - if (!md->barrier_error && - !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) - md->barrier_error = io_error; - end_io_acct(io); - free_io(md, io); + bio->bi_rw &= ~REQ_FLUSH; + queue_io(md, bio); } else { - end_io_acct(io); - free_io(md, io); - - if (io_error != DM_ENDIO_REQUEUE) { - trace_block_bio_complete(md->queue, bio); - - bio_endio(bio, io_error); - } + /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio); + bio_endio(bio, io_error); } } } @@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } -static void store_barrier_error(struct mapped_device *md, int error) -{ - unsigned long flags; - - spin_lock_irqsave(&md->barrier_error_lock, flags); - /* - * Basically, the first error is taken, but: - * -EOPNOTSUPP supersedes any I/O error. - * Requeue request supersedes any I/O error but -EOPNOTSUPP. - */ - if (!md->barrier_error || error == -EOPNOTSUPP || - (md->barrier_error != -EOPNOTSUPP && - error == DM_ENDIO_REQUEUE)) - md->barrier_error = error; - spin_unlock_irqrestore(&md->barrier_error_lock, flags); -} - /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone) static void dm_end_request(struct request *clone, int error) { int rw = rq_data_dir(clone); - int run_queue = 1; - bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { rq->errors = clone->errors; rq->resid_len = clone->resid_len; @@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error) } free_rq_clone(clone); - - if (unlikely(is_barrier)) { - if (unlikely(error)) - store_barrier_error(md, error); - run_queue = 0; - } else - blk_end_request_all(rq, error); - - rq_completed(md, rw, run_queue); + blk_end_request_all(rq, error); + rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) @@ -862,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone) struct request_queue *q = rq->q; unsigned long flags; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - dm_end_request(clone, DM_ENDIO_REQUEUE); - return; - } - dm_unprep_request(rq); spin_lock_irqsave(q->queue_lock, flags); @@ -961,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. So can't use - * softirq_done with the original. - * Pass the clone to dm_done() directly in this special case. - * It is safe (even if clone->q->queue_lock is held here) - * because there is no I/O dispatching during the completion - * of barrier clone. - */ - dm_done(clone, error, true); - return; - } - tio->error = error; rq->completion_data = clone; blk_complete_request(rq); @@ -990,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - BUG_ON(error > 0); - dm_end_request(clone, error); - return; - } - rq->cmd_flags |= REQ_FAILED; dm_complete_request(clone, error); } @@ -1119,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio) } /* - * Creates a little bio that is just does part of a bvec. + * Creates a little bio that just does part of a bvec. */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, @@ -1134,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; + clone->bi_rw = bio->bi_rw; clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; @@ -1161,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_rw &= ~REQ_HARDBARRIER; clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -1225,16 +1133,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, __issue_target_request(ci, ti, request_nr, len); } -static int __clone_and_map_empty_barrier(struct clone_info *ci) +static int __clone_and_map_empty_flush(struct clone_info *ci) { unsigned target_nr = 0; struct dm_target *ti; + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __issue_target_requests(ci, ti, ti->num_flush_requests, 0); - ci->sector_count = 0; - return 0; } @@ -1289,9 +1196,6 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; - if (unlikely(bio_empty_barrier(bio))) - return __clone_and_map_empty_barrier(ci); - if (unlikely(bio->bi_rw & REQ_DISCARD)) return __clone_and_map_discard(ci); @@ -1383,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_live_table(md); if (unlikely(!ci.map)) { - if (!(bio->bi_rw & REQ_HARDBARRIER)) - bio_io_error(bio); - else - if (!md->barrier_error) - md->barrier_error = -EIO; + bio_io_error(bio); return; } ci.md = md; - ci.bio = bio; ci.io = alloc_io(md); ci.io->error = 0; atomic_set(&ci.io->io_count, 1); @@ -1400,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_sector; - ci.sector_count = bio_sectors(bio); - if (unlikely(bio_empty_barrier(bio))) - ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); - while (ci.sector_count && !error) - error = __clone_and_map(&ci); + if (bio->bi_rw & REQ_FLUSH) { + ci.bio = &ci.md->flush_bio; + ci.sector_count = 0; + error = __clone_and_map_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + while (ci.sector_count && !error) + error = __clone_and_map(&ci); + } /* drop the extra reference count */ dec_pending(ci.io, error); @@ -1491,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); part_stat_unlock(); - /* - * If we're suspended or the thread is processing barriers - * we have to queue this io for later. - */ - if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || - unlikely(bio->bi_rw & REQ_HARDBARRIER)) { + /* if we're suspended, we have to queue this io for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { up_read(&md->io_lock); - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && - bio_rw(bio) == READA) { + if (bio_rw(bio) != READA) + queue_io(md, bio); + else bio_io_error(bio); - return 0; - } - - queue_io(md, bio); - return 0; } @@ -1537,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) return _dm_request(q, bio); } -static bool dm_rq_is_flush_request(struct request *rq) -{ - if (rq->cmd_flags & REQ_FLUSH) - return true; - else - return false; -} - void dm_dispatch_request(struct request *rq) { int r; @@ -1592,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - if (dm_rq_is_flush_request(rq)) { - blk_rq_init(NULL, clone); - clone->cmd_type = REQ_TYPE_FS; - clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); - } else { - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, - dm_rq_bio_constructor, tio); - if (r) - return r; - - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; - clone->buffer = rq->buffer; - } + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -1648,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) struct mapped_device *md = q->queuedata; struct request *clone; - if (unlikely(dm_rq_is_flush_request(rq))) - return BLKPREP_OK; - if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; @@ -1727,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q) struct dm_table *map = dm_get_live_table(md); struct dm_target *ti; struct request *rq, *clone; + sector_t pos; /* * For suspend, check blk_queue_stopped() and increment @@ -1739,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q) if (!rq) goto plug_and_out; - if (unlikely(dm_rq_is_flush_request(rq))) { - BUG_ON(md->flush_request); - md->flush_request = rq; - blk_start_request(rq); - queue_work(md->wq, &md->barrier_work); - goto out; - } + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + BUG_ON(!dm_target_is_valid(ti)); - ti = dm_table_find_target(map, blk_rq_pos(rq)); if (ti->type->busy && ti->type->busy(ti)) goto plug_and_out; @@ -1918,7 +1797,6 @@ out: static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); -static void dm_rq_barrier_work(struct work_struct *work); static void dm_init_md_queue(struct mapped_device *md) { @@ -1940,6 +1818,7 @@ static void dm_init_md_queue(struct mapped_device *md) blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; blk_queue_merge_bvec(md->queue, dm_merge_bvec); + blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); } /* @@ -1972,7 +1851,6 @@ static struct mapped_device *alloc_dev(int minor) mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); - spin_lock_init(&md->barrier_error_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1995,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); - INIT_WORK(&md->barrier_work, dm_rq_barrier_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -2015,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad_bdev; + bio_init(&md->flush_bio); + md->flush_bio.bi_bdev = md->bdev; + md->flush_bio.bi_rw = WRITE_FLUSH; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -2245,7 +2126,6 @@ static int dm_init_request_based_queue(struct mapped_device *md) blk_queue_softirq_done(md->queue, dm_softirq_done); blk_queue_prep_rq(md->queue, dm_prep_fn); blk_queue_lld_busy(md->queue, dm_lld_busy); - blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); elv_register_queue(md->queue); @@ -2406,43 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } -static void dm_flush(struct mapped_device *md) -{ - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - - bio_init(&md->barrier_bio); - md->barrier_bio.bi_bdev = md->bdev; - md->barrier_bio.bi_rw = WRITE_BARRIER; - __split_and_process_bio(md, &md->barrier_bio); - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); -} - -static void process_barrier(struct mapped_device *md, struct bio *bio) -{ - md->barrier_error = 0; - - dm_flush(md); - - if (!bio_empty_barrier(bio)) { - __split_and_process_bio(md, bio); - /* - * If the request isn't supported, don't waste time with - * the second flush. - */ - if (md->barrier_error != -EOPNOTSUPP) - dm_flush(md); - } - - if (md->barrier_error != DM_ENDIO_REQUEUE) - bio_endio(bio, md->barrier_error); - else { - spin_lock_irq(&md->deferred_lock); - bio_list_add_head(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - } -} - /* * Process the deferred bios */ @@ -2452,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work) work); struct bio *c; - down_write(&md->io_lock); + down_read(&md->io_lock); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); c = bio_list_pop(&md->deferred); spin_unlock_irq(&md->deferred_lock); - if (!c) { - clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); + if (!c) break; - } - up_write(&md->io_lock); + up_read(&md->io_lock); if (dm_request_based(md)) generic_make_request(c); - else { - if (c->bi_rw & REQ_HARDBARRIER) - process_barrier(md, c); - else - __split_and_process_bio(md, c); - } + else + __split_and_process_bio(md, c); - down_write(&md->io_lock); + down_read(&md->io_lock); } - up_write(&md->io_lock); + up_read(&md->io_lock); } static void dm_queue_flush(struct mapped_device *md) @@ -2488,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md) queue_work(md->wq, &md->work); } -static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - - tio->info.target_request_nr = request_nr; -} - -/* Issue barrier requests to targets and wait for their completion. */ -static int dm_rq_barrier(struct mapped_device *md) -{ - int i, j; - struct dm_table *map = dm_get_live_table(md); - unsigned num_targets = dm_table_get_num_targets(map); - struct dm_target *ti; - struct request *clone; - - md->barrier_error = 0; - - for (i = 0; i < num_targets; i++) { - ti = dm_table_get_target(map, i); - for (j = 0; j < ti->num_flush_requests; j++) { - clone = clone_rq(md->flush_request, md, GFP_NOIO); - dm_rq_set_target_request_nr(clone, j); - atomic_inc(&md->pending[rq_data_dir(clone)]); - map_request(ti, clone, md); - } - } - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - dm_table_put(map); - - return md->barrier_error; -} - -static void dm_rq_barrier_work(struct work_struct *work) -{ - int error; - struct mapped_device *md = container_of(work, struct mapped_device, - barrier_work); - struct request_queue *q = md->queue; - struct request *rq; - unsigned long flags; - - /* - * Hold the md reference here and leave it at the last part so that - * the md can't be deleted by device opener when the barrier request - * completes. - */ - dm_get(md); - - error = dm_rq_barrier(md); - - rq = md->flush_request; - md->flush_request = NULL; - - if (error == DM_ENDIO_REQUEUE) { - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); - } else - blk_end_request_all(rq, error); - - blk_run_queue(q); - - dm_put(md); -} - /* * Swap in a new table, returning the old one for the caller to destroy. */ @@ -2677,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * * To get all processes out of __split_and_process_bio in dm_request, * we take the write lock. To prevent any process from reentering - * __split_and_process_bio from dm_request, we set - * DMF_QUEUE_IO_TO_THREAD. - * - * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND - * and call flush_workqueue(md->wq). flush_workqueue will wait until - * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any - * further calls to __split_and_process_bio from dm_wq_work. + * __split_and_process_bio from dm_request and quiesce the thread + * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). */ down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); /* - * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which - * can be kicked until md->queue is stopped. So stop md->queue before - * flushing md->wq. + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) stop_queue(md->queue); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index ba19060bcf3f..8a2f767f26d8 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) dev_info_t *tmp_dev; sector_t start_sector; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index f20d13e717d5..ed075d19db37 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return 0; } rcu_read_lock(); - if (mddev->suspended || mddev->barrier) { + if (mddev->suspended) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended && !mddev->barrier) + if (!mddev->suspended) break; rcu_read_unlock(); schedule(); @@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(mddev_t *mddev, int bits) { - if (mddev->barrier) - return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); /* - * Generic barrier handling for md + * Generic flush handling for md */ -#define POST_REQUEST_BARRIER ((void*)1) - -static void md_end_barrier(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err) { mdk_rdev_t *rdev = bio->bi_private; mddev_t *mddev = rdev->mddev; - if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) - set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { - if (mddev->barrier == POST_REQUEST_BARRIER) { - /* This was a post-request barrier */ - mddev->barrier = NULL; - wake_up(&mddev->sb_wait); - } else - /* The pre-request barrier has finished */ - schedule_work(&mddev->barrier_work); + /* The pre-request flush has finished */ + schedule_work(&mddev->flush_work); } bio_put(bio); } -static void submit_barriers(mddev_t *mddev) +static void submit_flushes(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc(GFP_KERNEL, 0); - bi->bi_end_io = md_end_barrier; + bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_BARRIER, bi); + submit_bio(WRITE_FLUSH, bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); } -static void md_submit_barrier(struct work_struct *ws) +static void md_submit_flush_data(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, barrier_work); - struct bio *bio = mddev->barrier; + mddev_t *mddev = container_of(ws, mddev_t, flush_work); + struct bio *bio = mddev->flush_bio; atomic_set(&mddev->flush_pending, 1); - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - bio_endio(bio, -EOPNOTSUPP); - else if (bio->bi_size == 0) + if (bio->bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { - bio->bi_rw &= ~REQ_HARDBARRIER; + bio->bi_rw &= ~REQ_FLUSH; if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); - mddev->barrier = POST_REQUEST_BARRIER; - submit_barriers(mddev); } if (atomic_dec_and_test(&mddev->flush_pending)) { - mddev->barrier = NULL; + mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); } } -void md_barrier_request(mddev_t *mddev, struct bio *bio) +void md_flush_request(mddev_t *mddev, struct bio *bio) { spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, - !mddev->barrier, + !mddev->flush_bio, mddev->write_lock, /*nothing*/); - mddev->barrier = bio; + mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); atomic_set(&mddev->flush_pending, 1); - INIT_WORK(&mddev->barrier_work, md_submit_barrier); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); - submit_barriers(mddev); + submit_flushes(mddev); if (atomic_dec_and_test(&mddev->flush_pending)) - schedule_work(&mddev->barrier_work); + schedule_work(&mddev->flush_work); } -EXPORT_SYMBOL(md_barrier_request); +EXPORT_SYMBOL(md_flush_request); /* Support for plugging. * This mirrors the plugging support in request_queue, but does not @@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page) { @@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if REQ_HARDBARRIER - * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= REQ_HARDBARRIER; - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, + bio); } void md_super_wait(mddev_t *mddev) { - /* wait for all superblock writes that were scheduled to complete. - * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); @@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; @@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; @@ -4504,7 +4439,6 @@ int md_run(mddev_t *mddev) /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; - mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && mddev->ro == 0) @@ -4683,7 +4617,6 @@ static void md_clean(mddev_t *mddev) mddev->recovery = 0; mddev->in_sync = 0; mddev->degraded = 0; - mddev->barriers_work = 0; mddev->safemode = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 3931299788dc..112a2c32db0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -87,7 +87,6 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */ #define AllReserved 6 /* If whole device is reserved for * one array */ #define AutoDetected 7 /* added by auto-detect */ @@ -273,13 +272,6 @@ struct mddev_s int degraded; /* whether md should consider * adding a spare */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because REQ_HARDBARRIER is not supported - */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -339,16 +331,13 @@ struct mddev_s struct attribute_group *to_remove; struct plug_handle *plug; /* if used by personality */ - /* Generic barrier handling. - * If there is a pending barrier request, all other - * writes are blocked while the devices are flushed. - * The last to finish a flush schedules a worker to - * submit the barrier request (without the barrier flag), - * then submit more flush requests. + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_FLUSH flag). */ - struct bio *barrier; + struct bio *flush_bio; atomic_t flush_pending; - struct work_struct barrier_work; + struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ }; @@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); extern int mddev_congested(mddev_t *mddev, int bits); -extern void md_barrier_request(mddev_t *mddev, struct bio *bio); +extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 0307d217e7a4..6d7ddf32ef2e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f7af46d623c..a39f4c355e55 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) struct strip_zone *zone; mdk_rdev_t *tmp_dev; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0b830bbe1d8b..378a25894c57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error) if (r1_bio->bios[mirror] == bio) break; - if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { - set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_BarrierRetry, &r1_bio->state); - r1_bio->mddev->barriers_work = 0; - /* Don't rdev_dec_pending in this branch - keep it for the retry */ - } else { + /* + * 'one mirror IO has finished' event handler: + */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + if (!uptodate) { + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* - * this branch is our 'one mirror IO has finished' event handler: + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. */ - r1_bio->bios[mirror] = NULL; - to_put = bio; - if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* In behind mode, we ACK the master bio once the I/O has safely - * reached all non-writemostly disks. Setting the Returned bit - * ensures that this gets done only once -- we don't ever want to - * return -EIO here, instead we'll wait */ - - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); - } + set_bit(R1BIO_Uptodate, &r1_bio->state); + + update_head_pos(mirror, r1_bio); + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, 0); } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + /* - * * Let's see if all mirrored write operations have finished * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } if (to_put) @@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - unsigned long do_barriers; + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. - * We test barriers_work *after* md_write_start as md_write_start - * may cause the first superblock write, and that will check out - * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ @@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) } finish_wait(&conf->wait_barrier, &w); } - if (unlikely(!mddev->barriers_work && - (bio->bi_rw & REQ_HARDBARRIER))) { - if (rw == WRITE) - md_write_end(mddev); - bio_endio(bio, -EOPNOTSUPP); - return 0; - } wait_barrier(conf); @@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio->bi_rw & REQ_HARDBARRIER; - if (do_barriers) - set_bit(R1BIO_Barrier, &r1_bio->state); - bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; @@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev) if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; - } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were REQ_HARDBARRIER - * requests which failed with -EOPNOTSUPP. Hohumm.. - * Better resubmit without the barrier. - * We know which devices to resubmit for, because - * all others have had their bios[] entry cleared. - * We already have a nr_pending reference on these rdevs. - */ - int i; - const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); - clear_bit(R1BIO_BarrierRetry, &r1_bio->state); - clear_bit(R1BIO_Barrier, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) - atomic_inc(&r1_bio->remaining); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) { - struct bio_vec *bvec; - int j; - - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - /* copy pages from the failed bio, as - * this might be a write-behind device */ - __bio_for_each_segment(bvec, bio, j, 0) - bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; - bio_put(r1_bio->bios[i]); - bio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | do_sync; - bio->bi_private = r1_bio; - r1_bio->bios[i] = bio; - generic_make_request(bio); - } } else { int disk; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5f2d443ae28a..adf8cfd73313 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -117,8 +117,6 @@ struct r1bio_s { #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 #define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84718383124d..f0d082f749be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_fua = (bio->bi_rw & REQ_FUA); struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; - if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } @@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync; + mbio->bi_rw = WRITE | do_sync | do_fua; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 69b0a169e43d..31140d1259dc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int rw; struct bio *bi; mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + rw = WRITE_FUA; + else + rw = WRITE; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else continue; @@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + if (wbi->bi_rw & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; + bool fua = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = disks; i--; ) + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == pd_idx || i == qd_idx) + if (dev->written || i == pd_idx || i == qd_idx) { set_bit(R5_UPTODATE, &dev->flags); + if (fua) + set_bit(R5_WantFUA, &dev->flags); + } } if (sh->reconstruct_state == reconstruct_state_drain_run) @@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. */ atomic_dec(&conf->preread_active_stripes); @@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. */ atomic_dec(&conf->preread_active_stripes); @@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi) const int rw = bio_data_dir(bi); int remaining; - if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { - /* Drain all pending writes. We only really need - * to ensure they have been submitted, but this is - * easier. - */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - md_barrier_request(mddev, bi); + if (unlikely(bi->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bi); return 0; } @@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if (mddev->barrier && + if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe(sh); @@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi) bio_endio(bi, 0); } - if (mddev->barrier) { - /* We need to wait for the stripes to all be handled. - * So: wait for preread_active_stripes to drop to 0. - */ - wait_event(mddev->thread->wqueue, - atomic_read(&conf->preread_active_stripes) == 0); - } return 0; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 36eaed5dfd6e..2ace0582b409 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -275,6 +275,7 @@ struct r6_state { * filling */ #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +#define R5_WantFUA 14 /* Write should be FUA */ /* * Write method */ diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index e876678176be..9c0b42bfe089 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock mq->req = NULL; blk_queue_prep_rq(mq->queue, mmc_prep_request); - blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); if (mmc_can_erase(card)) { queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8373ca0de8e0..9b106d83b0cd 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2197,7 +2197,6 @@ static void dasd_setup_queue(struct dasd_block *block) */ blk_queue_max_segment_size(block->request_queue, PAGE_SIZE); blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1); - blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN); } /* diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index ae10883a5b28..50286d8707f3 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -634,6 +634,7 @@ void zfcp_scsi_set_prot(struct zfcp_adapter *adapter) adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) { mask |= SHOST_DIX_TYPE1_PROTECTION; scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + shost->sg_prot_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2; } diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c index 93984c9dfe14..aee73fafccc8 100644 --- a/drivers/scsi/aic7xxx_old.c +++ b/drivers/scsi/aic7xxx_old.c @@ -2850,12 +2850,6 @@ aic7xxx_done(struct aic7xxx_host *p, struct aic7xxx_scb *scb) aic_dev->r_total++; ptr = aic_dev->r_bins; } - if(cmd->device->simple_tags && cmd->request->cmd_flags & REQ_HARDBARRIER) - { - aic_dev->barrier_total++; - if(scb->tag_action == MSG_ORDERED_Q_TAG) - aic_dev->ordered_total++; - } x = scb->sg_length; x >>= 10; for(i=0; i<6; i++) @@ -10125,7 +10119,6 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd, struct aic_dev_data *aic_dev = cmd->device->hostdata; struct scsi_device *sdptr = cmd->device; unsigned char tindex = TARGET_INDEX(cmd); - struct request *req = cmd->request; int use_sg; mask = (0x01 << tindex); @@ -10144,19 +10137,8 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd, /* We always force TEST_UNIT_READY to untagged */ if (cmd->cmnd[0] != TEST_UNIT_READY && sdptr->simple_tags) { - if (req->cmd_flags & REQ_HARDBARRIER) - { - if(sdptr->ordered_tags) - { - hscb->control |= MSG_ORDERED_Q_TAG; - scb->tag_action = MSG_ORDERED_Q_TAG; - } - } - else - { - hscb->control |= MSG_SIMPLE_Q_TAG; - scb->tag_action = MSG_SIMPLE_Q_TAG; - } + hscb->control |= MSG_SIMPLE_Q_TAG; + scb->tag_action = MSG_SIMPLE_Q_TAG; } } if ( !(aic_dev->dtr_pending) && diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 8a8f803439e1..10478153641b 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -376,6 +376,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->this_id = sht->this_id; shost->can_queue = sht->can_queue; shost->sg_tablesize = sht->sg_tablesize; + shost->sg_prot_tablesize = sht->sg_prot_tablesize; shost->cmd_per_lun = sht->cmd_per_lun; shost->unchecked_isa_dma = sht->unchecked_isa_dma; shost->use_clustering = sht->use_clustering; diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 1787bd2eb4a6..55f09e92ab59 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -130,17 +130,6 @@ static void sas_scsi_task_done(struct sas_task *task) sc->scsi_done(sc); } -static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd) -{ - enum task_attribute ta = TASK_ATTR_SIMPLE; - if (cmd->request && blk_rq_tagged(cmd->request)) { - if (cmd->device->ordered_tags && - (cmd->request->cmd_flags & REQ_HARDBARRIER)) - ta = TASK_ATTR_ORDERED; - } - return ta; -} - static struct sas_task *sas_create_task(struct scsi_cmnd *cmd, struct domain_device *dev, gfp_t gfp_flags) @@ -160,7 +149,7 @@ static struct sas_task *sas_create_task(struct scsi_cmnd *cmd, task->ssp_task.retry_count = 1; int_to_scsilun(cmd->device->lun, &lun); memcpy(task->ssp_task.LUN, &lun.scsi_lun, 8); - task->ssp_task.task_attr = sas_scsi_get_task_attr(cmd); + task->ssp_task.task_attr = TASK_ATTR_SIMPLE; memcpy(task->ssp_task.cdb, cmd->cmnd, 16); task->scatter = scsi_sglist(cmd); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ee02d3838a0a..8041fe1ab179 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -968,11 +968,13 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, */ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) { - int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); + struct request *rq = cmd->request; + + int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask); if (error) goto err_exit; - if (blk_bidi_rq(cmd->request)) { + if (blk_bidi_rq(rq)) { struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( scsi_sdb_cache, GFP_ATOMIC); if (!bidi_sdb) { @@ -980,28 +982,28 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) goto err_exit; } - cmd->request->next_rq->special = bidi_sdb; - error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, - GFP_ATOMIC); + rq->next_rq->special = bidi_sdb; + error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC); if (error) goto err_exit; } - if (blk_integrity_rq(cmd->request)) { + if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs, count; BUG_ON(prot_sdb == NULL); - ivecs = blk_rq_count_integrity_sg(cmd->request); + ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) { error = BLKPREP_DEFER; goto err_exit; } - count = blk_rq_map_integrity_sg(cmd->request, + count = blk_rq_map_integrity_sg(rq->q, rq->bio, prot_sdb->table.sgl); BUG_ON(unlikely(count > ivecs)); + BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q))); cmd->prot_sdb = prot_sdb; cmd->prot_sdb->table.nents = count; @@ -1625,6 +1627,14 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, SCSI_MAX_SG_CHAIN_SEGMENTS)); + if (scsi_host_prot_dma(shost)) { + shost->sg_prot_tablesize = + min_not_zero(shost->sg_prot_tablesize, + (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS); + BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize); + blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize); + } + blk_queue_max_hw_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); blk_queue_segment_boundary(q, shost->dma_boundary); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index c3f67373a4f8..20ad59dff730 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -251,6 +251,7 @@ shost_rd_attr(host_busy, "%hu\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%hd\n"); shost_rd_attr(sg_tablesize, "%hu\n"); +shost_rd_attr(sg_prot_tablesize, "%hu\n"); shost_rd_attr(unchecked_isa_dma, "%d\n"); shost_rd_attr(prot_capabilities, "%u\n"); shost_rd_attr(prot_guard_type, "%hd\n"); @@ -262,6 +263,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_cmd_per_lun.attr, &dev_attr_can_queue.attr, &dev_attr_sg_tablesize.attr, + &dev_attr_sg_prot_tablesize.attr, &dev_attr_unchecked_isa_dma.attr, &dev_attr_proc_name.attr, &dev_attr_scan.attr, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8bbb8c3144d5..20295774bf70 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2145,7 +2145,7 @@ static int sd_revalidate_disk(struct gendisk *disk) struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; unsigned char *buffer; - unsigned ordered; + unsigned flush = 0; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -2188,17 +2188,15 @@ static int sd_revalidate_disk(struct gendisk *disk) /* * We now have all cache related info, determine how we deal - * with ordered requests. Note that as the current SCSI - * dispatch function can alter request order, we cannot use - * QUEUE_ORDERED_TAG_* even when ordered tag is supported. + * with flush requests. */ - if (sdkp->WCE) - ordered = sdkp->DPOFUA - ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH; - else - ordered = QUEUE_ORDERED_DRAIN; + if (sdkp->WCE) { + flush |= REQ_FLUSH; + if (sdkp->DPOFUA) + flush |= REQ_FUA; + } - blk_queue_ordered(sdkp->disk->queue, ordered); + blk_queue_flush(sdkp->disk->queue, flush); set_capacity(disk, sdkp->capacity); kfree(buffer); diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 84be62149c6c..0cb39ff21171 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -375,21 +375,20 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s unsigned int i, j; u32 phys, virt; - /* Already remapped? */ - if (rq->cmd_flags & REQ_INTEGRITY) - return 0; - sdkp = rq->bio->bi_bdev->bd_disk->private_data; if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) return 0; - rq->cmd_flags |= REQ_INTEGRITY; phys = hw_sector & 0xffffffff; __rq_for_each_bio(bio, rq) { struct bio_vec *iv; + /* Already remapped? */ + if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) + break; + virt = bio->bi_integrity->bip_sector & 0xffffffff; bip_for_each_vec(iv, bio->bi_integrity, i) { @@ -408,6 +407,8 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s kunmap_atomic(sdt, KM_USER0); } + + bio->bi_flags |= BIO_MAPPED_INTEGRITY; } return 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 78d616315d8e..655ab920cff4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1657,7 +1657,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO && dxfer_dir != SG_DXFER_UNKNOWN && !iov_count && !sfp->parentdp->device->host->unchecked_isa_dma && - blk_rq_aligned(q, hp->dxferp, dxfer_len)) + blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len)) md = NULL; else md = &map_data; diff --git a/fs/block_dev.c b/fs/block_dev.c index 50e8c8582faa..b737451e2e9d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -370,7 +370,7 @@ int blkdev_fsync(struct file *filp, int datasync) */ mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 64f10082f048..5e789f4a3ed0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2063,7 +2063,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + if (printk_ratelimit()) { printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", bdevname(bh->b_bdev, b)); @@ -2200,21 +2200,10 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers && device->barriers) { - ret = submit_bh(WRITE_BARRIER, bh); - if (ret == -EOPNOTSUPP) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); - set_buffer_uptodate(bh); - device->barriers = 0; - /* one reference for submit_bh */ - get_bh(bh); - lock_buffer(bh); - ret = submit_bh(WRITE_SYNC, bh); - } - } else { + if (i == last_barrier && do_barriers) + ret = submit_bh(WRITE_FLUSH_FUA, bh); + else ret = submit_bh(WRITE_SYNC, bh); - } if (ret) errors++; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 32d094002a57..0b81ecdb101c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd318ff280b2..e25e46a8b4e2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path, device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); - device->barriers = 1; spin_lock_init(&device->io_lock); device->name = kstrdup(path, GFP_NOFS); if (!device->name) { @@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->devid = orig_dev->devid; device->work.func = pending_bios_fn; memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); - device->barriers = 1; spin_lock_init(&device->io_lock); INIT_LIST_HEAD(&device->dev_list); INIT_LIST_HEAD(&device->dev_alloc_list); @@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) trans = btrfs_start_transaction(root, 0); lock_chunks(root); - device->barriers = 1; device->writeable = 1; device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); @@ -3084,7 +3081,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, return NULL; list_add(&device->dev_list, &fs_devices->devices); - device->barriers = 1; device->dev_root = root->fs_info->dev_root; device->devid = devid; device->work.func = pending_bios_fn; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 31b0fabdd2ea..2b638b6e4eea 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -42,7 +42,6 @@ struct btrfs_device { int running_pending; u64 generation; - int barriers; int writeable; int in_fs_metadata; diff --git a/fs/buffer.c b/fs/buffer.c index 3e7dca279d1c..7f0b9b083f77 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { + if (!quiet_error(bh)) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", @@ -2891,7 +2891,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - set_bit(BH_Eopnotsupp, &bh->b_state); } if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) @@ -3031,10 +3030,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw) bh->b_end_io = end_buffer_write_sync; ret = submit_bh(rw, bh); wait_on_buffer(bh); - if (buffer_eopnotsupp(bh)) { - clear_buffer_eopnotsupp(bh); - ret = -EOPNOTSUPP; - } if (!ret && !buffer_uptodate(bh)) ret = -EIO; } else { diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d7e9f74dc3a6..09b13bb34c94 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync) * storage */ if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); return ret; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a0e623055955..a130f02bea9b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2545,8 +2545,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext_pblock(ex); - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, - GFP_NOFS, BLKDEV_IFL_WAIT); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 592adf2e546e..3f3ff5ee8f9d 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync) (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, - NULL, BLKDEV_IFL_WAIT); + NULL); ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 87d228aae6b0..f7e25aa04854 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1238,7 +1238,6 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; - unsigned long flags = BLKDEV_IFL_WAIT; /* This should not happen, but just to be sure check this */ if (sb->s_flags & MS_RDONLY) { @@ -1303,9 +1302,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, ext4_debug("going to zero out inode table in group %d\n", group); - if (barrier) - flags |= BLKDEV_IFL_BARRIER; - ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3f62df50f483..10ec06a834b3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2612,7 +2612,7 @@ static inline void ext4_issue_discard(struct super_block *sb, discard_block = block + ext4_group_first_block_no(sb, block_group); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - ret = sb_issue_discard(sb, discard_block, count); + ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); if (ret == -EOPNOTSUPP) { ext4_warning(sb, "discard not supported, disabling"); clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 2f5e347de48b..998a4624da26 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -229,7 +229,7 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS, BLKDEV_IFL_WAIT); + GFP_NOFS); if (err) goto exit_bh; @@ -244,8 +244,7 @@ static int setup_new_group_blocks(struct super_block *sb, block = input->inode_table; ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, - GFP_NOFS, BLKDEV_IFL_WAIT); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 81184d3b75a3..b47d2c9f4fa1 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster) sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), - nr_clus * sbi->sec_per_clus); + nr_clus * sbi->sec_per_clus, + GFP_NOFS, 0); first_cl = cluster; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 1736f2356388..970e682ea754 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); - if (buffer_eopnotsupp(bhs[i])) { - clear_buffer_eopnotsupp(bhs[i]); - err = -EOPNOTSUPP; - } else if (!err && !buffer_uptodate(bhs[i])) + if (!err && !buffer_uptodate(bhs[i])) err = -EIO; } return err; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ac750bd31a6f..eb01f3575e10 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) lh->lh_hash = cpu_to_be32(hash); bh->b_end_io = end_buffer_write_sync; - if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | REQ_META, bh); - wait_on_buffer(bh); - if (buffer_eopnotsupp(bh)) { - clear_buffer_eopnotsupp(bh); - set_buffer_uptodate(bh); - fs_info(sdp, "barrier sync failed - disabling barriers\n"); - set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - lock_buffer(bh); -skip_barrier: - get_bh(bh); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) submit_bh(WRITE_SYNC | REQ_META, bh); - wait_on_buffer(bh); - } + else + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); brelse(bh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fb67f593f408..bef3ab6cf5c1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -866,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER); + 0); if (rv) goto fail; nr_sects = 0; @@ -881,8 +880,7 @@ start_new_extent: } } if (nr_sects) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); if (rv) goto fail; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index c8428323167f..bdf572221152 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "write commit block"); set_buffer_dirty(bh); - if (journal->j_flags & JFS_BARRIER) { - ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); - - /* - * Is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want - * to remember if we sent a barrier request - */ - if (ret == -EOPNOTSUPP) { - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); - spin_lock(&journal->j_state_lock); - journal->j_flags &= ~JFS_BARRIER; - spin_unlock(&journal->j_state_lock); - - /* And try again, without the barrier */ - set_buffer_uptodate(bh); - set_buffer_dirty(bh); - ret = sync_dirty_buffer(bh); - } - } else { + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else ret = sync_dirty_buffer(bh); - } put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); @@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 524800dce207..6a79fd0a1a32 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -542,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) */ if ((journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6494c81e3b0a..f3ad1598b201 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -136,25 +136,11 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { - ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); - if (ret == -EOPNOTSUPP) { - printk(KERN_WARNING - "JBD2: Disabling barriers on %s, " - "not supported by device\n", journal->j_devname); - write_lock(&journal->j_state_lock); - journal->j_flags &= ~JBD2_BARRIER; - write_unlock(&journal->j_state_lock); - - /* And try again, without the barrier */ - lock_buffer(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - ret = submit_bh(WRITE_SYNC_PLUG, bh); - } - } else { + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); + else ret = submit_bh(WRITE_SYNC_PLUG, bh); - } + *cbh = bh; return ret; } @@ -168,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal, { int ret = 0; -retry: clear_buffer_dirty(bh); wait_on_buffer(bh); - if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { - printk(KERN_WARNING - "JBD2: %s: disabling barries on %s - not supported " - "by device\n", __func__, journal->j_devname); - write_lock(&journal->j_state_lock); - journal->j_flags &= ~JBD2_BARRIER; - write_unlock(&journal->j_state_lock); - - lock_buffer(bh); - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - bh->b_end_io = journal_end_buffer_io_sync; - - ret = submit_bh(WRITE_SYNC_PLUG, bh); - if (ret) { - unlock_buffer(bh); - return ret; - } - goto retry; - } if (unlikely(!buffer_uptodate(bh))) ret = -EIO; @@ -364,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; - int write_op = WRITE; + int write_op = WRITE_SYNC; /* * First job: lock down the current transaction and wait for @@ -705,6 +670,16 @@ start_journal_io: } } + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); + err = 0; + } + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -713,8 +688,7 @@ start_journal_io: if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -723,19 +697,6 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); - } - - err = journal_finish_inode_data_buffers(journal, commit_transaction); - if (err) { - printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", journal->j_devname); - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) - jbd2_journal_abort(journal, err); - err = 0; } /* Lo and behold: we have just managed to send a transaction to @@ -849,6 +810,11 @@ wait_for_iobuf: } if (!err && !is_journal_aborted(journal)) err = journal_wait_on_commit_record(journal, cbh); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + journal->j_flags & JBD2_BARRIER) { + blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL); + } if (err) jbd2_journal_abort(journal, err); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index a6d5fb2a5519..f8d8ea88ad78 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -181,17 +181,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) retry: set_buffer_dirty(nilfs->ns_sbh[0]); - if (nilfs_test_opt(sbi, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_BARRIER); - if (err == -EOPNOTSUPP) { - nilfs_warning(sbi->s_super, __func__, - "barrier-based sync failed. " - "disabling barriers\n"); - nilfs_clear_opt(sbi, BARRIER); - goto retry; - } + WRITE_SYNC | WRITE_FLUSH_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index bd02b6127d35..0254be2d73c6 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -635,9 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, - BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER); + GFP_NOFS, 0); if (ret < 0) return ret; nblocks = 0; @@ -647,8 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + GFP_NOFS, 0); return ret; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9e8cc4346b76..d3e1d95a7c3d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -188,7 +188,7 @@ static int ocfs2_sync_file(struct file *file, int datasync) */ if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, - NULL, BLKDEV_IFL_WAIT); + NULL); goto bail; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 79fbf3f390f0..6dfbee03ccc6 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -352,6 +352,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); free_part_stats(p); + free_part_info(p); kfree(p); } @@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags) + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) { struct hd_struct *p; dev_t devt = MKDEV(0, 0); @@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->partno = partno; p->policy = get_disk_ro(disk); + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) + goto out_free_stats; + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_stats; + goto out_free_info; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; +out_free_info: + free_part_info(p); out_free_stats: free_part_stats(p); out_free: @@ -642,6 +654,7 @@ rescan: /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; + struct partition_meta_info *info = NULL; size = state->parts[p].size; if (!size) @@ -675,8 +688,12 @@ rescan: size = get_capacity(disk) - from; } } + + if (state->parts[p].has_info) + info = &state->parts[p].info; part = add_partition(disk, p, from, size, - state->parts[p].flags); + state->parts[p].flags, + &state->parts[p].info); if (IS_ERR(part)) { printk(KERN_ERR " %s: p%d could not be added: %ld\n", disk->disk_name, p, -PTR_ERR(part)); diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 8e4e103ba216..d68bf4dc3bc2 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -1,5 +1,6 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/genhd.h> /* * add_gd_partition adds a partitions details to the devices partition @@ -12,6 +13,8 @@ struct parsed_partitions { sector_t from; sector_t size; int flags; + bool has_info; + struct partition_meta_info info; } parts[DISK_MAX_PARTS]; int next; int limit; diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index dbb44d4bb8a7..ac0ccb5026a2 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -94,6 +94,7 @@ * ************************************************************/ #include <linux/crc32.h> +#include <linux/ctype.h> #include <linux/math64.h> #include <linux/slab.h> #include "check.h" @@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state) gpt_entry *ptes = NULL; u32 i; unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; @@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state) if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(sizeof(info->volname) - 1, + sizeof(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6846371498b6..91f080cc76c8 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync) barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 812e2c05aa29..076c8b194682 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) return 0; } -static void disable_barrier(struct super_block *s) -{ - REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); - printk("reiserfs: disabling flush barriers on %s\n", - reiserfs_bdevname(s)); -} - static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block *sb) { @@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh) submit_bh(WRITE, bh); } -static int submit_barrier_buffer(struct buffer_head *bh) -{ - get_bh(bh); - bh->b_end_io = reiserfs_end_ordered_io; - clear_buffer_dirty(bh); - if (!buffer_uptodate(bh)) - BUG(); - return submit_bh(WRITE_BARRIER, bh); -} - -static void check_barrier_completion(struct super_block *s, - struct buffer_head *bh) -{ - if (buffer_eopnotsupp(bh)) { - clear_buffer_eopnotsupp(bh); - disable_barrier(s); - set_buffer_uptodate(bh); - set_buffer_dirty(bh); - reiserfs_write_unlock(s); - sync_dirty_buffer(bh); - reiserfs_write_lock(s); - } -} - #define CHUNK_SIZE 32 struct buffer_chunk { struct buffer_head *bh[CHUNK_SIZE]; @@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s, struct buffer_head *tbh = NULL; unsigned int trans_id = jl->j_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); - int barrier = 0; int retval = 0; int write_len; @@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s, } atomic_dec(&journal->j_async_throttle); - /* We're skipping the commit if there's an error */ - if (retval || reiserfs_is_journal_aborted(journal)) - barrier = 0; - - /* wait on everything written so far before writing the commit - * if we are in barrier mode, send the commit down now - */ - barrier = reiserfs_barrier_flush(s); - if (barrier) { - int ret; - lock_buffer(jl->j_commit_bh); - ret = submit_barrier_buffer(jl->j_commit_bh); - if (ret == -EOPNOTSUPP) { - set_buffer_uptodate(jl->j_commit_bh); - disable_barrier(s); - barrier = 0; - } - } for (i = 0; i < (jl->j_len + 1); i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); @@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s, BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); - if (!barrier) { - /* If there was a write error in the journal - we can't commit - * this transaction - it will be invalid and, if successful, - * will just end up propagating the write error out to - * the file system. */ - if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { - if (buffer_dirty(jl->j_commit_bh)) - BUG(); - mark_buffer_dirty(jl->j_commit_bh) ; - reiserfs_write_unlock(s); - sync_dirty_buffer(jl->j_commit_bh) ; - reiserfs_write_lock(s); - } - } else { + /* If there was a write error in the journal - we can't commit + * this transaction - it will be invalid and, if successful, + * will just end up propagating the write error out to + * the file system. */ + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; reiserfs_write_unlock(s); - wait_on_buffer(jl->j_commit_bh); + if (reiserfs_barrier_flush(s)) + __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock(s); } - check_barrier_completion(s, jl->j_commit_bh); - /* If there was a write error in the journal - we can't commit this * transaction - it will be invalid and, if successful, will just end * up propagating the write error out to the filesystem. */ @@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb, jh->j_first_unflushed_offset = cpu_to_le32(offset); jh->j_mount_id = cpu_to_le32(journal->j_mount_id); - if (reiserfs_barrier_flush(sb)) { - int ret; - lock_buffer(journal->j_header_bh); - ret = submit_barrier_buffer(journal->j_header_bh); - if (ret == -EOPNOTSUPP) { - set_buffer_uptodate(journal->j_header_bh); - disable_barrier(sb); - goto sync; - } - reiserfs_write_unlock(sb); - wait_on_buffer(journal->j_header_bh); - reiserfs_write_lock(sb); - check_barrier_completion(sb, journal->j_header_bh); - } else { - sync: - set_buffer_dirty(journal->j_header_bh); - reiserfs_write_unlock(sb); + set_buffer_dirty(journal->j_header_bh); + reiserfs_write_unlock(sb); + + if (reiserfs_barrier_flush(sb)) + __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + else sync_dirty_buffer(journal->j_header_bh); - reiserfs_write_lock(sb); - } + + reiserfs_write_lock(sb); if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", "IO error during journal replay"); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 52785189212f..1168d92ee4aa 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -986,19 +986,7 @@ xfs_buf_iodone_work( xfs_buf_t *bp = container_of(work, xfs_buf_t, b_iodone_work); - /* - * We can get an EOPNOTSUPP to ordered writes. Here we clear the - * ordered flag and reissue them. Because we can't tell the higher - * layers directly that they should not issue ordered I/O anymore, they - * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. - */ - if ((bp->b_error == EOPNOTSUPP) && - (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { - trace_xfs_buf_ordered_retry(bp, _RET_IP_); - bp->b_flags &= ~XBF_ORDERED; - bp->b_flags |= _XFS_BARRIER_FAILED; - xfs_buf_iorequest(bp); - } else if (bp->b_iodone) + if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); @@ -1254,7 +1242,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); - rw = WRITE_BARRIER; + rw = WRITE_FLUSH_FUA; } else if (bp->b_flags & XBF_LOG_BUFFER) { ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); bp->b_flags &= ~_XBF_RUN_QUEUES; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 131c0ebf2c0d..383a3f37cf98 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -85,14 +85,6 @@ typedef enum { */ #define _XBF_PAGE_LOCKED (1 << 22) -/* - * If we try a barrier write, but it fails we have to communicate - * this to the upper layers. Unfortunately b_error gets overwritten - * when the buffer is re-issued so we have to add another flag to - * keep this information. - */ -#define _XFS_BARRIER_FAILED (1 << 23) - typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -112,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ - { _XFS_BARRIER_FAILED, "BARRIER_FAILED" } + { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } typedef enum { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index fa1e40ac4b35..ab31ce5aeaf9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -692,8 +692,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); } STATIC void diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 286dc201c5b9..acef2e98c594 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -325,7 +325,6 @@ DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_cond_lock); DEFINE_BUF_EVENT(xfs_buf_unlock); -DEFINE_BUF_EVENT(xfs_buf_ordered_retry); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f4fd49c9b987..cee4ab9f8a9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -917,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp) l = iclog->ic_log; /* - * If the _XFS_BARRIER_FAILED flag was set by a lower - * layer, it means the underlying device no longer supports - * barrier I/O. Warn loudly and turn off barriers. - */ - if (bp->b_flags & _XFS_BARRIER_FAILED) { - bp->b_flags &= ~_XFS_BARRIER_FAILED; - l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; - xfs_fs_cmn_err(CE_WARN, l->l_mp, - "xlog_iodone: Barriers are no longer supported" - " by device. Disabling barriers\n"); - } - - /* * Race to shutdown the filesystem if we see an error. */ if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, diff --git a/include/linux/bio.h b/include/linux/bio.h index 5274103434ad..2c3fd7421607 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -496,6 +496,10 @@ static inline struct bio *bio_list_get(struct bio_list *bl) #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ + for_each_bio(_bio) \ + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) + #define bio_integrity(bio) (bio->bi_integrity != NULL) extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ca83a97c9715..0437ab6bb54c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -97,6 +97,7 @@ struct bio { #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ #define BIO_QUIET 11 /* Make BIO Quiet */ +#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -130,6 +131,8 @@ enum rq_flag_bits { /* bio only flags */ __REQ_UNPLUG, /* unplug the immediately after submission */ __REQ_RAHEAD, /* read ahead, can fail anytime */ + __REQ_THROTTLED, /* This bio has already been subjected to + * throttling rules. Don't do it again. */ /* request only flags */ __REQ_SORTED, /* elevator knows about this request */ @@ -143,10 +146,8 @@ enum rq_flag_bits { __REQ_FAILED, /* set if the request failed */ __REQ_QUIET, /* don't worry about errors */ __REQ_PREEMPT, /* set for "ide_preempt" requests */ - __REQ_ORDERED_COLOR, /* is before or after barrier */ __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_FLUSH, /* request for cache flush */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ @@ -168,10 +169,12 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \ - REQ_META| REQ_DISCARD | REQ_NOIDLE) + REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) +#define REQ_CLONE_MASK REQ_COMMON_MASK #define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_RAHEAD (1 << __REQ_RAHEAD) +#define REQ_THROTTLED (1 << __REQ_THROTTLED) #define REQ_SORTED (1 << __REQ_SORTED) #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) @@ -184,10 +187,8 @@ enum rq_flag_bits { #define REQ_FAILED (1 << __REQ_FAILED) #define REQ_QUIET (1 << __REQ_QUIET) #define REQ_PREEMPT (1 << __REQ_PREEMPT) -#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_COPY_USER (1 << __REQ_COPY_USER) -#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_FLUSH (1 << __REQ_FLUSH) #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e5cb4d029689..5027a599077d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -124,6 +124,9 @@ struct request { * physical address coalescing is performed. */ unsigned short nr_phys_segments; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + unsigned short nr_integrity_segments; +#endif unsigned short ioprio; @@ -243,6 +246,7 @@ struct queue_limits { unsigned short logical_block_size; unsigned short max_segments; + unsigned short max_integrity_segments; unsigned char misaligned; unsigned char discard_misaligned; @@ -355,18 +359,25 @@ struct request_queue struct blk_trace *blk_trace; #endif /* - * reserved for flush operations + * for flush operations */ - unsigned int ordered, next_ordered, ordseq; - int orderr, ordcolor; - struct request pre_flush_rq, bar_rq, post_flush_rq; - struct request *orig_bar_rq; + unsigned int flush_flags; + unsigned int flush_seq; + int flush_err; + struct request flush_rq; + struct request *orig_flush_rq; + struct list_head pending_flushes; struct mutex sysfs_lock; #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; #endif + +#ifdef CONFIG_BLK_DEV_THROTTLING + /* Throttle data */ + struct throtl_data *td; +#endif }; #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ @@ -462,56 +473,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) __clear_bit(flag, &q->queue_flags); } -enum { - /* - * Hardbarrier is supported with one of the following methods. - * - * NONE : hardbarrier unsupported - * DRAIN : ordering by draining is enough - * DRAIN_FLUSH : ordering by draining w/ pre and post flushes - * DRAIN_FUA : ordering by draining w/ pre flush and FUA write - * TAG : ordering by tag is enough - * TAG_FLUSH : ordering by tag w/ pre and post flushes - * TAG_FUA : ordering by tag w/ pre flush and FUA write - */ - QUEUE_ORDERED_BY_DRAIN = 0x01, - QUEUE_ORDERED_BY_TAG = 0x02, - QUEUE_ORDERED_DO_PREFLUSH = 0x10, - QUEUE_ORDERED_DO_BAR = 0x20, - QUEUE_ORDERED_DO_POSTFLUSH = 0x40, - QUEUE_ORDERED_DO_FUA = 0x80, - - QUEUE_ORDERED_NONE = 0x00, - - QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN | - QUEUE_ORDERED_DO_BAR, - QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_POSTFLUSH, - QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_FUA, - - QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG | - QUEUE_ORDERED_DO_BAR, - QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_POSTFLUSH, - QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_FUA, - - /* - * Ordered operation sequence - */ - QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */ - QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */ - QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */ - QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */ - QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */ - QUEUE_ORDSEQ_DONE = 0x20, -}; - #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) @@ -521,7 +482,6 @@ enum { #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) @@ -592,7 +552,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) + (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \ + REQ_FLUSH | REQ_FUA) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ @@ -851,7 +812,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); @@ -881,12 +842,8 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); +extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); -extern int blk_queue_ordered(struct request_queue *, unsigned); -extern bool blk_do_ordered(struct request_queue *, struct request **); -extern unsigned blk_ordered_cur_seq(struct request_queue *); -extern unsigned blk_ordered_req_seq(struct request *); -extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); @@ -919,35 +876,28 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return NULL; return bqt->tag_index[tag]; } -enum{ - BLKDEV_WAIT, /* wait for completion */ - BLKDEV_BARRIER, /* issue request with barrier */ - BLKDEV_SECURE, /* secure discard */ -}; -#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) -#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) -#define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE) -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, - unsigned long); + +#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */ + +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); -static inline int sb_issue_discard(struct super_block *sb, - sector_t block, sector_t nr_blocks) + sector_t nr_sects, gfp_t gfp_mask); +static inline int sb_issue_discard(struct super_block *sb, sector_t block, + sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { - block <<= (sb->s_blocksize_bits - 9); - nr_blocks <<= (sb->s_blocksize_bits - 9); - return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9), + nr_blocks << (sb->s_blocksize_bits - 9), + gfp_mask, flags); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, - sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) + sector_t nr_blocks, gfp_t gfp_mask) { return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask, flags); + gfp_mask); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); @@ -1012,7 +962,7 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q) return q->limits.physical_block_size; } -static inline int bdev_physical_block_size(struct block_device *bdev) +static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } @@ -1101,11 +1051,11 @@ static inline int queue_dma_alignment(struct request_queue *q) return q ? q->dma_alignment : 511; } -static inline int blk_rq_aligned(struct request_queue *q, void *addr, +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; - return !((unsigned long)addr & alignment) && !(len & alignment); + return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ @@ -1135,6 +1085,7 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* @@ -1178,6 +1129,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req) } #endif +#ifdef CONFIG_BLK_DEV_THROTTLING +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay); +extern void throtl_shutdown_timer_wq(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) +{ + return 0; +} + +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline int blk_throtl_exit(struct request_queue *q) { return 0; } +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {} +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ @@ -1221,8 +1190,13 @@ struct blk_integrity { extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); extern int blk_integrity_compare(struct gendisk *, struct gendisk *); -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -extern int blk_rq_count_integrity_sg(struct request *); +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, + struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); +extern int blk_integrity_merge_rq(struct request_queue *, struct request *, + struct request *); +extern int blk_integrity_merge_bio(struct request_queue *, struct request *, + struct bio *); static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) @@ -1243,16 +1217,32 @@ static inline int blk_integrity_rq(struct request *rq) return bio_integrity(rq->bio); } +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ + q->limits.max_integrity_segments = segs; +} + +static inline unsigned short +queue_max_integrity_segments(struct request_queue *q) +{ + return q->limits.max_integrity_segments; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ #define blk_integrity_rq(rq) (0) -#define blk_rq_count_integrity_sg(a) (0) -#define blk_rq_map_integrity_sg(a, b) (0) +#define blk_rq_count_integrity_sg(a, b) (0) +#define blk_rq_map_integrity_sg(a, b, c) (0) #define bdev_get_integrity(a) (0) #define blk_get_integrity(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) #define blk_integrity_unregister(a) do { } while (0); +#define blk_queue_max_integrity_segments(a, b) do { } while (0); +#define queue_max_integrity_segments(a) (0) +#define blk_integrity_merge_rq(a, b, c) (0) +#define blk_integrity_merge_bio(a, b, c) (0) #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index ec94c12f21da..dd1b25b2641c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -32,7 +32,6 @@ enum bh_state_bits { BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ - BH_Eopnotsupp, /* operation not supported (barrier) */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ @@ -124,7 +123,6 @@ BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) -BUFFER_FNS(Eopnotsupp, eopnotsupp) BUFFER_FNS(Unwritten, unwritten) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 479ee3a1d901..9b2a0158f399 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,10 +53,10 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.8.1" +#define REL_VERSION "8.3.9rc2" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 94 +#define PRO_VERSION_MAX 95 enum drbd_io_error_p { @@ -91,6 +91,11 @@ enum drbd_after_sb_p { ASB_VIOLENTLY }; +enum drbd_on_no_data { + OND_IO_ERROR, + OND_SUSPEND_IO +}; + /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_codes { ERR_CODE_BASE = 100, @@ -140,6 +145,7 @@ enum drbd_ret_codes { ERR_CONNECTED = 151, /* DRBD 8.3 only */ ERR_PERM = 152, ERR_NEED_APV_93 = 153, + ERR_STONITH_AND_PROT_A = 154, /* insert new ones above this line */ AFTER_LAST_ERR_CODE @@ -226,13 +232,17 @@ union drbd_state { unsigned conn:5 ; /* 17/32 cstates */ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes (by user) */ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ unsigned peer_isp:1 ; unsigned user_isp:1 ; - unsigned _pad:11; /* 0 unused */ + unsigned susp_nod:1 ; /* IO suspended because no data */ + unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/ + unsigned _pad:9; /* 0 unused */ #elif defined(__BIG_ENDIAN_BITFIELD) - unsigned _pad:11; /* 0 unused */ + unsigned _pad:9; + unsigned susp_fen:1 ; + unsigned susp_nod:1 ; unsigned user_isp:1 ; unsigned peer_isp:1 ; unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ @@ -312,6 +322,8 @@ enum drbd_timeout_flag { #define DRBD_MAGIC 0x83740267 #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) +#define DRBD_MAGIC_BIG 0x835a +#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 440b42e38e89..4ac33f34b77e 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -128,26 +128,31 @@ #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT +#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 -#define DRBD_DP_VOLUME_MIN 4 -#define DRBD_DP_VOLUME_MAX 1048576 -#define DRBD_DP_VOLUME_DEF 16384 +#define DRBD_C_PLAN_AHEAD_MIN 0 +#define DRBD_C_PLAN_AHEAD_MAX 300 +#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ -#define DRBD_DP_INTERVAL_MIN 1 -#define DRBD_DP_INTERVAL_MAX 600 -#define DRBD_DP_INTERVAL_DEF 5 +#define DRBD_C_DELAY_TARGET_MIN 1 +#define DRBD_C_DELAY_TARGET_MAX 100 +#define DRBD_C_DELAY_TARGET_DEF 10 -#define DRBD_RS_THROTTLE_TH_MIN 1 -#define DRBD_RS_THROTTLE_TH_MAX 600 -#define DRBD_RS_THROTTLE_TH_DEF 20 +#define DRBD_C_FILL_TARGET_MIN 0 +#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ +#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ -#define DRBD_RS_HOLD_OFF_TH_MIN 1 -#define DRBD_RS_HOLD_OFF_TH_MAX 6000 -#define DRBD_RS_HOLD_OFF_TH_DEF 100 +#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ +#define DRBD_C_MAX_RATE_MAX (4 << 20) +#define DRBD_C_MAX_RATE_DEF 102400 + +#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ +#define DRBD_C_MIN_RATE_MAX (4 << 20) +#define DRBD_C_MIN_RATE_DEF 4096 #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 5f042810a56c..ade91107c9a5 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -87,6 +87,12 @@ NL_PACKET(syncer_conf, 8, NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) NL_BIT( 65, T_MAY_IGNORE, use_rle) + NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) + NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) + NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) + NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) + NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) + NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) ) NL_PACKET(invalidate, 9, ) diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 52c0da4bdd18..81bc20e36ec0 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -80,7 +80,7 @@ static inline int ddebug_remove_module(const char *mod) #define dynamic_pr_debug(fmt, ...) \ do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0) -#define dynamic_dev_dbg(dev, format, ...) \ +#define dynamic_dev_dbg(dev, fmt, ...) \ do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0) #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index ae527beb74e4..be9180ec18ac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -135,12 +135,12 @@ struct inodes_stat_t { * immediately after submission. The write equivalent * of READ_SYNC. * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. - * WRITE_BARRIER Like WRITE_SYNC, but tells the block layer that all - * previously submitted writes must be safely on storage - * before this one is started. Also guarantees that when - * this write is complete, it itself is also safely on - * storage. Prevents reordering of writes on both sides - * of this IO. + * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. + * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on + * non-volatile media on completion. + * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded + * by a cache flush and data is guaranteed to be on + * non-volatile media on completion. * */ #define RW_MASK REQ_WRITE @@ -156,16 +156,12 @@ struct inodes_stat_t { #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) #define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) #define WRITE_META (WRITE | REQ_META) -#define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_HARDBARRIER) - -/* - * These aren't really reads or writes, they pass down information about - * parts of device that are now unused by the file system. - */ -#define DISCARD_NOBARRIER (WRITE | REQ_DISCARD) -#define DISCARD_BARRIER (WRITE | REQ_DISCARD | REQ_HARDBARRIER) -#define DISCARD_SECURE (DISCARD_NOBARRIER | REQ_SECURE) +#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ + REQ_FLUSH) +#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ + REQ_FUA) +#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ + REQ_FLUSH | REQ_FUA) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb0..66e26b5a1537 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> +#include <linux/slab.h> #ifdef CONFIG_BLOCK @@ -86,7 +87,15 @@ struct disk_stats { unsigned long io_ticks; unsigned long time_in_queue; }; - + +#define PARTITION_META_INFO_VOLNAMELTH 64 +#define PARTITION_META_INFO_UUIDLTH 16 + +struct partition_meta_info { + u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; +}; + struct hd_struct { sector_t start_sect; sector_t nr_sects; @@ -95,6 +104,7 @@ struct hd_struct { struct device __dev; struct kobject *holder_dir; int policy, partno; + struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif @@ -181,6 +191,30 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } +static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) +{ + int i; + for (i = 0; i < 16; ++i) { + *to++ = (hex_to_bin(*uuid_str) << 4) | + (hex_to_bin(*(uuid_str + 1))); + uuid_str += 2; + switch (i) { + case 3: + case 5: + case 7: + case 9: + uuid_str++; + continue; + } + } +} + +static inline char *part_unpack_uuid(const u8 *uuid, char *out) +{ + sprintf(out, "%pU", uuid); + return out; +} + static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -342,6 +376,19 @@ static inline int part_in_flight(struct hd_struct *part) return part->in_flight[0] + part->in_flight[1]; } +static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) +{ + if (disk) + return kzalloc_node(sizeof(struct partition_meta_info), + GFP_KERNEL, disk->node_id); + return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL); +} + +static inline void free_part_info(struct hd_struct *part) +{ + kfree(part->info); +} + /* block/blk-core.c */ extern void part_round_stats(int cpu, struct hd_struct *part); @@ -533,7 +580,9 @@ extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, - sector_t len, int flags); + sector_t len, int flags, + struct partition_meta_info + *info); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b0a35e6bc69..f5df2f4acb0d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -641,6 +641,16 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } _max1 > _max2 ? _max1 : _max2; }) /** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +/** * clamp - return a value clamped to a given range with strict typechecking * @val: current value * @min: minimum allowable value diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..dbafa9e34a2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -336,6 +336,9 @@ extern unsigned long sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; #endif /* Attach to any functions which should be ignored in wchan output. */ diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 8fcb6e0e9e72..d63533a4a59e 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -32,6 +32,12 @@ struct scsi_cmnd; #endif /* + * DIX-capable adapters effectively support infinite chaining for the + * protection information scatterlist + */ +#define SCSI_MAX_PROT_SG_SEGMENTS 0xFFFF + +/* * Special value for scanning to specify scanning or rescanning of all * possible channels, (target) ids, or luns on a given shost. */ diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index b7bdecb7b76e..d0a6a845f204 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -388,6 +388,7 @@ struct scsi_host_template { * of scatter-gather. */ unsigned short sg_tablesize; + unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. @@ -599,6 +600,7 @@ struct Scsi_Host { int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; + short unsigned int sg_prot_tablesize; short unsigned int max_sectors; unsigned long dma_boundary; /* @@ -823,6 +825,11 @@ static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) return shost->prot_capabilities; } +static inline int scsi_host_prot_dma(struct Scsi_Host *shost) +{ + return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; +} + static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index 17231385cb37..d6e7994aa634 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -97,13 +97,9 @@ static inline void scsi_deactivate_tcq(struct scsi_device *sdev, int depth) static inline int scsi_populate_tag_msg(struct scsi_cmnd *cmd, char *msg) { struct request *req = cmd->request; - struct scsi_device *sdev = cmd->device; if (blk_rq_tagged(req)) { - if (sdev->ordered_tags && req->cmd_flags & REQ_HARDBARRIER) - *msg++ = MSG_ORDERED_TAG; - else - *msg++ = MSG_SIMPLE_TAG; + *msg++ = MSG_SIMPLE_TAG; *msg++ = req->tag; return 2; } diff --git a/init/Kconfig b/init/Kconfig index 2de5b1cbadd9..950ba26f7233 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -634,11 +634,14 @@ config BLK_CGROUP Currently, CFQ IO scheduler uses it to recognize task groups and control disk bandwidth allocation (proportional time slice allocation) - to such task groups. + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. This option only enables generic Block IO controller infrastructure. - One needs to also enable actual IO controlling logic in CFQ for it - to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ seti + CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set + CONFIG_BLK_THROTTLE=y. See Documentation/cgroups/blkio-controller.txt for more information. diff --git a/init/do_mounts.c b/init/do_mounts.c index 52387f14f9b4..62a47eafa8e9 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -58,6 +58,62 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); +#ifdef CONFIG_BLOCK +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to a 36 byte char array with a UUID + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, void *data) +{ + u8 *uuid = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid: 36 byte char array containing a hex ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t __init devt_from_partuuid(char *uuid_str) +{ + dev_t res = 0; + struct device *dev = NULL; + u8 uuid[16]; + + /* Pack the requested UUID in the expected format. */ + part_pack_uuid(uuid_str, uuid); + + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + put_device(dev); + +done: + return res; +} +#endif + /* * Convert a name into device number. We accept the following variants: * @@ -68,6 +124,8 @@ __setup("rw", readwrite); * of partition - device number of disk plus the partition number * 5) /dev/<disk_name>p<decimal> - same as the above, that form is * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. * * If name doesn't have fall into the categories above, we return (0,0). * block_class is used to check if something is a disk name. If the disk @@ -82,6 +140,18 @@ dev_t name_to_dev_t(char *name) dev_t res = 0; int part; +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + if (strlen(name) != 36) + goto fail; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } +#endif + if (strncmp(name, "/dev/", 5) != 0) { unsigned maj, min; diff --git a/mm/swapfile.c b/mm/swapfile.c index 7c703ff2f36f..9fc7bac7db0c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) return err; cond_resched(); @@ -150,7 +150,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) break; @@ -189,7 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) + nr_blocks, GFP_NOIO, 0)) break; } |