summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 11:02:48 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 11:02:48 -0800
commit582cd91f69de8e44857cb610ebca661dac8656b7 (patch)
tree0d680db02a5c236ee87b408b3f13ce33ebaca907
parentbd018bbaa58640da786d4289563e71c5ef3938c7 (diff)
parentf885056a48ccf4ad4332def91e973f3993fa8695 (diff)
Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: "Another nice round of removing more code than what is added, mostly due to Christoph's relentless pursuit of tech debt removal/cleanups. This pull request contains: - Two series of BFQ improvements (Paolo, Jan, Jia) - Block iov_iter improvements (Pavel) - bsg error path fix (Pan) - blk-mq scheduler improvements (Jan) - -EBUSY discard fix (Jan) - bvec allocation improvements (Ming, Christoph) - bio allocation and init improvements (Christoph) - Store bdev pointer in bio instead of gendisk + partno (Christoph) - Block trace point cleanups (Christoph) - hard read-only vs read-only split (Christoph) - Block based swap cleanups (Christoph) - Zoned write granularity support (Damien) - Various fixes/tweaks (Chunguang, Guoqing, Lei, Lukas, Huhai)" * tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block: (104 commits) mm: simplify swapdev_block sd_zbc: clear zone resources for non-zoned case block: introduce blk_queue_clear_zone_settings() zonefs: use zone write granularity as block size block: introduce zone_write_granularity limit block: use blk_queue_set_zoned in add_partition() nullb: use blk_queue_set_zoned() to setup zoned devices nvme: cleanup zone information initialization block: document zone_append_max_bytes attribute block: use bi_max_vecs to find the bvec pool md/raid10: remove dead code in reshape_request block: mark the bio as cloned in bio_iov_bvec_set block: set BIO_NO_PAGE_REF in bio_iov_bvec_set block: remove a layer of indentation in bio_iov_iter_get_pages block: turn the nr_iovecs argument to bio_alloc* into an unsigned short block: remove the 1 and 4 vec bvec_slabs entries block: streamline bvec_alloc block: factor out a bvec_alloc_gfp helper block: move struct biovec_slab to bio.c block: reuse BIO_INLINE_VECS for integrity bvecs ...
-rw-r--r--Documentation/block/biovecs.rst2
-rw-r--r--Documentation/block/queue-sysfs.rst13
-rw-r--r--Documentation/filesystems/f2fs.rst1
-rw-r--r--Documentation/filesystems/porting.rst16
-rw-r--r--arch/m68k/emu/nfblock.c2
-rw-r--r--arch/xtensa/platforms/iss/simdisk.c2
-rw-r--r--block/bfq-iosched.c445
-rw-r--r--block/bfq-iosched.h29
-rw-r--r--block/bfq-wf2q.c3
-rw-r--r--block/bio-integrity.c35
-rw-r--r--block/bio.c571
-rw-r--r--block/blk-cgroup.c22
-rw-r--r--block/blk-core.c99
-rw-r--r--block/blk-crypto-fallback.c6
-rw-r--r--block/blk-crypto.c2
-rw-r--r--block/blk-exec.c14
-rw-r--r--block/blk-flush.c17
-rw-r--r--block/blk-merge.c17
-rw-r--r--block/blk-mq.c69
-rw-r--r--block/blk-settings.c41
-rw-r--r--block/blk-sysfs.c8
-rw-r--r--block/blk-throttle.c2
-rw-r--r--block/blk-wbt.c4
-rw-r--r--block/blk-zoned.c17
-rw-r--r--block/blk.h12
-rw-r--r--block/bounce.c4
-rw-r--r--block/bsg.c6
-rw-r--r--block/genhd.c306
-rw-r--r--block/kyber-iosched.c1
-rw-r--r--block/mq-deadline.c6
-rw-r--r--block/partitions/core.c36
-rw-r--r--block/scsi_ioctl.c6
-rw-r--r--drivers/block/brd.c8
-rw-r--r--drivers/block/drbd/drbd_actlog.c2
-rw-r--r--drivers/block/drbd/drbd_bitmap.c2
-rw-r--r--drivers/block/drbd/drbd_int.h6
-rw-r--r--drivers/block/drbd/drbd_main.c13
-rw-r--r--drivers/block/drbd/drbd_req.c7
-rw-r--r--drivers/block/drbd/drbd_req.h12
-rw-r--r--drivers/block/drbd/drbd_worker.c5
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c2
-rw-r--r--drivers/block/null_blk/main.c2
-rw-r--r--drivers/block/null_blk/zoned.c8
-rw-r--r--drivers/block/paride/pd.c2
-rw-r--r--drivers/block/pktcdvd.c6
-rw-r--r--drivers/block/ps3vram.c2
-rw-r--r--drivers/block/rbd.c19
-rw-r--r--drivers/block/rsxx/dev.c2
-rw-r--r--drivers/block/sx8.c4
-rw-r--r--drivers/block/umem.c2
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/cdrom/cdrom.c2
-rw-r--r--drivers/ide/ide-atapi.c2
-rw-r--r--drivers/ide/ide-cd.c2
-rw-r--r--drivers/ide/ide-cd_ioctl.c2
-rw-r--r--drivers/ide/ide-devsets.c2
-rw-r--r--drivers/ide/ide-disk.c2
-rw-r--r--drivers/ide/ide-ioctls.c4
-rw-r--r--drivers/ide/ide-park.c2
-rw-r--r--drivers/ide/ide-pm.c4
-rw-r--r--drivers/ide/ide-tape.c2
-rw-r--r--drivers/ide/ide-taskfile.c2
-rw-r--r--drivers/lightnvm/pblk-init.c2
-rw-r--r--drivers/md/bcache/debug.c2
-rw-r--r--drivers/md/bcache/request.c39
-rw-r--r--drivers/md/bcache/super.c2
-rw-r--r--drivers/md/dm-bio-record.h9
-rw-r--r--drivers/md/dm-cache-metadata.c2
-rw-r--r--drivers/md/dm-clone-target.c14
-rw-r--r--drivers/md/dm-raid1.c10
-rw-r--r--drivers/md/dm-thin-metadata.c2
-rw-r--r--drivers/md/dm-zoned-metadata.c6
-rw-r--r--drivers/md/dm.c14
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md.c73
-rw-r--r--drivers/md/md.h8
-rw-r--r--drivers/md/raid1.c8
-rw-r--r--drivers/md/raid10.c18
-rw-r--r--drivers/md/raid5-ppl.c2
-rw-r--r--drivers/md/raid5.c110
-rw-r--r--drivers/mmc/core/block.c10
-rw-r--r--drivers/nvdimm/blk.c4
-rw-r--r--drivers/nvdimm/btt.c4
-rw-r--r--drivers/nvdimm/pmem.c4
-rw-r--r--drivers/nvme/host/core.c31
-rw-r--r--drivers/nvme/host/lightnvm.c7
-rw-r--r--drivers/nvme/host/multipath.c6
-rw-r--r--drivers/nvme/host/pci.c4
-rw-r--r--drivers/nvme/host/rdma.c2
-rw-r--r--drivers/nvme/host/zns.c11
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c2
-rw-r--r--drivers/nvme/target/passthru.c2
-rw-r--r--drivers/s390/block/dasd.c26
-rw-r--r--drivers/s390/block/dcssblk.c6
-rw-r--r--drivers/s390/block/xpram.c2
-rw-r--r--drivers/scsi/scsi_error.c2
-rw-r--r--drivers/scsi/scsi_lib.c2
-rw-r--r--drivers/scsi/sd_zbc.c43
-rw-r--r--drivers/scsi/sg.c3
-rw-r--r--drivers/scsi/st.c2
-rw-r--r--drivers/target/target_core_file.c20
-rw-r--r--drivers/target/target_core_pscsi.c3
-rw-r--r--fs/block_dev.c20
-rw-r--r--fs/btrfs/check-integrity.c10
-rw-r--r--fs/btrfs/raid56.c7
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zoned.c4
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/exfat/file.c2
-rw-r--r--fs/ext4/fast_commit.c4
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/data.c40
-rw-r--r--fs/f2fs/f2fs.h2
-rw-r--r--fs/f2fs/segment.c12
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/iomap/direct-io.c9
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/recovery.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c5
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/splice.c9
-rw-r--r--fs/super.c3
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--fs/zonefs/super.c13
-rw-r--r--include/linux/bio.h55
-rw-r--r--include/linux/blk-mq.h8
-rw-r--r--include/linux/blk_types.h33
-rw-r--r--include/linux/blkdev.h53
-rw-r--r--include/linux/elevator.h2
-rw-r--r--include/linux/genhd.h27
-rw-r--r--include/linux/swap.h1
-rw-r--r--kernel/trace/blktrace.c16
-rw-r--r--lib/iov_iter.c21
-rw-r--r--mm/page_io.c47
-rw-r--r--mm/swapfile.c36
148 files changed, 1343 insertions, 1608 deletions
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
index 36771a131b56..ddb867e0185b 100644
--- a/Documentation/block/biovecs.rst
+++ b/Documentation/block/biovecs.rst
@@ -40,6 +40,8 @@ normal code doesn't have to deal with bi_bvec_done.
There is a lower level advance function - bvec_iter_advance() - which takes
a pointer to a biovec, not a bio; this is used by the bio integrity code.
+As of 5.12 bvec segments with zero bv_len are not supported.
+
What's all this get us?
=======================
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index 2638d3446b79..4dc7f0d499a8 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -261,6 +261,12 @@ For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
is not supported.
+zone_append_max_bytes (RO)
+--------------------------
+This is the maximum number of bytes that can be written to a sequential
+zone of a zoned block device using a zone append write operation
+(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices.
+
zoned (RO)
----------
This indicates if the device is a zoned block device and the zone model of the
@@ -273,4 +279,11 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
do not support zone commands, they will be treated as regular block devices
and zoned will report "none".
+zone_write_granularity (RO)
+---------------------------
+This indicates the alignment constraint, in bytes, for write operations in
+sequential zones of zoned block devices (devices with a zoned attributed
+that reports "host-managed" or "host-aware"). This value is always 0 for
+regular block devices.
+
Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 81c05baa8312..35ed01a5fbc9 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_KVMALLOC 0x000000002
FAULT_PAGE_ALLOC 0x000000004
FAULT_PAGE_GET 0x000000008
- FAULT_ALLOC_BIO 0x000000010
FAULT_ALLOC_NID 0x000000020
FAULT_ORPHAN 0x000000040
FAULT_BLOCK 0x000000080
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 867036aa90b8..1f8cf8e10b34 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -865,3 +865,19 @@ no matter what. Everything is handled by the caller.
clone_private_mount() returns a longterm mount now, so the proper destructor of
its result is kern_unmount() or kern_unmount_array().
+
+---
+
+**mandatory**
+
+zero-length bvec segments are disallowed, they must be filtered out before
+passed on to an iterator.
+
+---
+
+**mandatory**
+
+For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but
+uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
+page references stay until I/O has completed, i.e. until ->ki_complete() has
+been called or returned with non -EIOCBQUEUED code.
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 92d26c812441..ba808543161a 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -61,7 +61,7 @@ struct nfhd_device {
static blk_qc_t nfhd_submit_bio(struct bio *bio)
{
- struct nfhd_device *dev = bio->bi_disk->private_data;
+ struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
struct bvec_iter iter;
int dir, len, shift;
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 3447556d276d..fc09be7b1347 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -103,7 +103,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
static blk_qc_t simdisk_submit_bio(struct bio *bio)
{
- struct simdisk *dev = bio->bi_disk->private_data;
+ struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
struct bvec_iter iter;
sector_t sector = bio->bi_iter.bi_sector;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9e81d1052091..b398dde53af9 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -158,7 +158,6 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
-BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS \
/* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1024,9 +1023,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
else
bfq_clear_bfqq_IO_bound(bfqq);
+ bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns;
+ bfqq->inject_limit = bic->saved_inject_limit;
+ bfqq->decrease_time_jif = bic->saved_decrease_time_jif;
+
bfqq->entity.new_weight = bic->saved_weight;
bfqq->ttime = bic->saved_ttime;
+ bfqq->io_start_time = bic->saved_io_start_time;
+ bfqq->tot_idle_time = bic->saved_tot_idle_time;
bfqq->wr_coeff = bic->saved_wr_coeff;
+ bfqq->service_from_wr = bic->saved_service_from_wr;
bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
@@ -1647,6 +1653,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
return bfqq_weight > in_serv_weight;
}
+static bool bfq_better_to_idle(struct bfq_queue *bfqq);
+
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
int old_wr_coeff,
@@ -1671,15 +1679,19 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* - it is sync,
* - it does not belong to a large burst,
* - it has been idle for enough time or is soft real-time,
- * - is linked to a bfq_io_cq (it is not shared in any sense).
+ * - is linked to a bfq_io_cq (it is not shared in any sense),
+ * - has a default weight (otherwise we assume the user wanted
+ * to control its weight explicitly)
*/
in_burst = bfq_bfqq_in_large_burst(bfqq);
soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
!BFQQ_TOTALLY_SEEKY(bfqq) &&
!in_burst &&
time_is_before_jiffies(bfqq->soft_rt_next_start) &&
- bfqq->dispatched == 0;
- *interactive = !in_burst && idle_for_long_time;
+ bfqq->dispatched == 0 &&
+ bfqq->entity.new_weight == 40;
+ *interactive = !in_burst && idle_for_long_time &&
+ bfqq->entity.new_weight == 40;
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
(bfq_bfqq_sync(bfqq) &&
@@ -1717,17 +1729,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfq_clear_bfqq_just_created(bfqq);
-
- if (!bfq_bfqq_IO_bound(bfqq)) {
- if (arrived_in_time) {
- bfqq->requests_within_timer++;
- if (bfqq->requests_within_timer >=
- bfqd->bfq_requests_within_timer)
- bfq_mark_bfqq_IO_bound(bfqq);
- } else
- bfqq->requests_within_timer = 0;
- }
-
if (bfqd->low_latency) {
if (unlikely(time_is_after_jiffies(bfqq->split_time)))
/* wraparound */
@@ -1755,10 +1756,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfq_add_bfqq_busy(bfqd, bfqq);
/*
- * Expire in-service queue only if preemption may be needed
- * for guarantees. In particular, we care only about two
- * cases. The first is that bfqq has to recover a service
- * hole, as explained in the comments on
+ * Expire in-service queue if preemption may be needed for
+ * guarantees or throughput. As for guarantees, we care
+ * explicitly about two cases. The first is that bfqq has to
+ * recover a service hole, as explained in the comments on
* bfq_bfqq_update_budg_for_activation(), i.e., that
* bfqq_wants_to_preempt is true. However, if bfqq does not
* carry time-critical I/O, then bfqq's bandwidth is less
@@ -1785,11 +1786,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* timestamps of the in-service queue would need to be
* updated, and this operation is quite costly (see the
* comments on bfq_bfqq_update_budg_for_activation()).
+ *
+ * As for throughput, we ask bfq_better_to_idle() whether we
+ * still need to plug I/O dispatching. If bfq_better_to_idle()
+ * says no, then plugging is not needed any longer, either to
+ * boost throughput or to perserve service guarantees. Then
+ * the best option is to stop plugging I/O, as not doing so
+ * would certainly lower throughput. We may end up in this
+ * case if: (1) upon a dispatch attempt, we detected that it
+ * was better to plug I/O dispatch, and to wait for a new
+ * request to arrive for the currently in-service queue, but
+ * (2) this switch of bfqq to busy changes the scenario.
*/
if (bfqd->in_service_queue &&
((bfqq_wants_to_preempt &&
bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
- bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) ||
+ !bfq_better_to_idle(bfqd->in_service_queue)) &&
next_queue_may_preempt(bfqd))
bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
false, BFQQE_PREEMPTED);
@@ -1861,6 +1874,138 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd,
bfqq->decrease_time_jif = jiffies;
}
+static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
+{
+ u64 tot_io_time = now_ns - bfqq->io_start_time;
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0)
+ bfqq->tot_idle_time +=
+ now_ns - bfqq->ttime.last_end_request;
+
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
+ return;
+
+ /*
+ * Must be busy for at least about 80% of the time to be
+ * considered I/O bound.
+ */
+ if (bfqq->tot_idle_time * 5 > tot_io_time)
+ bfq_clear_bfqq_IO_bound(bfqq);
+ else
+ bfq_mark_bfqq_IO_bound(bfqq);
+
+ /*
+ * Keep an observation window of at most 200 ms in the past
+ * from now.
+ */
+ if (tot_io_time > 200 * NSEC_PER_MSEC) {
+ bfqq->io_start_time = now_ns - (tot_io_time>>1);
+ bfqq->tot_idle_time >>= 1;
+ }
+}
+
+/*
+ * Detect whether bfqq's I/O seems synchronized with that of some
+ * other queue, i.e., whether bfqq, after remaining empty, happens to
+ * receive new I/O only right after some I/O request of the other
+ * queue has been completed. We call waker queue the other queue, and
+ * we assume, for simplicity, that bfqq may have at most one waker
+ * queue.
+ *
+ * A remarkable throughput boost can be reached by unconditionally
+ * injecting the I/O of the waker queue, every time a new
+ * bfq_dispatch_request happens to be invoked while I/O is being
+ * plugged for bfqq. In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth and latency for
+ * bfqq. Note that these same results may be achieved with the general
+ * injection mechanism, but less effectively. For details on this
+ * aspect, see the comments on the choice of the queue for injection
+ * in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a queue Q is deemed
+ * as a waker queue for bfqq if, for three consecutive times, bfqq
+ * happens to become non empty right after a request of Q has been
+ * completed. In particular, on the first time, Q is tentatively set
+ * as a candidate waker queue, while on the third consecutive time
+ * that Q is detected, the field waker_bfqq is set to Q, to confirm
+ * that Q is a waker queue for bfqq. These detection steps are
+ * performed only if bfqq has a long think time, so as to make it more
+ * likely that bfqq's I/O is actually being blocked by a
+ * synchronization. This last filter, plus the above three-times
+ * requirement, make false positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner throughput can be
+ * boosted by injecting I/O from the waker queue. Fortunately,
+ * detection is likely to be actually fast, for the following
+ * reasons. While blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at least equal to 1
+ * (see the comments in bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served during the very
+ * first I/O-plugging time interval for bfqq. This triggers the first
+ * step of the detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be confirmed no later than
+ * during the next I/O-plugging interval for bfqq.
+ *
+ * ISSUE
+ *
+ * On queue merging all waker information is lost.
+ */
+static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ u64 now_ns)
+{
+ if (!bfqd->last_completed_rq_bfqq ||
+ bfqd->last_completed_rq_bfqq == bfqq ||
+ bfq_bfqq_has_short_ttime(bfqq) ||
+ now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
+ bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
+ return;
+
+ if (bfqd->last_completed_rq_bfqq !=
+ bfqq->tentative_waker_bfqq) {
+ /*
+ * First synchronization detected with a
+ * candidate waker queue, or with a different
+ * candidate waker queue from the current one.
+ */
+ bfqq->tentative_waker_bfqq =
+ bfqd->last_completed_rq_bfqq;
+ bfqq->num_waker_detections = 1;
+ } else /* Same tentative waker queue detected again */
+ bfqq->num_waker_detections++;
+
+ if (bfqq->num_waker_detections == 3) {
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then
+ * bfqq->waker_bfqq must be reset. To
+ * this goal, we maintain in each
+ * waker queue a list, woken_list, of
+ * all the queues that reference the
+ * waker queue through their
+ * waker_bfqq pointer. When the waker
+ * queue exits, the waker_bfqq pointer
+ * of all the queues in the woken_list
+ * is reset.
+ *
+ * In addition, if bfqq is already in
+ * the woken_list of a waker queue,
+ * then, before being inserted into
+ * the woken_list of a new waker
+ * queue, bfqq must be removed from
+ * the woken_list of the old waker
+ * queue.
+ */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqd->last_completed_rq_bfqq->woken_list);
+ }
+}
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1868,117 +2013,14 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false;
+ u64 now_ns = ktime_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
bfqd->queued++;
if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
- /*
- * Detect whether bfqq's I/O seems synchronized with
- * that of some other queue, i.e., whether bfqq, after
- * remaining empty, happens to receive new I/O only
- * right after some I/O request of the other queue has
- * been completed. We call waker queue the other
- * queue, and we assume, for simplicity, that bfqq may
- * have at most one waker queue.
- *
- * A remarkable throughput boost can be reached by
- * unconditionally injecting the I/O of the waker
- * queue, every time a new bfq_dispatch_request
- * happens to be invoked while I/O is being plugged
- * for bfqq. In addition to boosting throughput, this
- * unblocks bfqq's I/O, thereby improving bandwidth
- * and latency for bfqq. Note that these same results
- * may be achieved with the general injection
- * mechanism, but less effectively. For details on
- * this aspect, see the comments on the choice of the
- * queue for injection in bfq_select_queue().
- *
- * Turning back to the detection of a waker queue, a
- * queue Q is deemed as a waker queue for bfqq if, for
- * two consecutive times, bfqq happens to become non
- * empty right after a request of Q has been
- * completed. In particular, on the first time, Q is
- * tentatively set as a candidate waker queue, while
- * on the second time, the flag
- * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
- * is a waker queue for bfqq. These detection steps
- * are performed only if bfqq has a long think time,
- * so as to make it more likely that bfqq's I/O is
- * actually being blocked by a synchronization. This
- * last filter, plus the above two-times requirement,
- * make false positives less likely.
- *
- * NOTE
- *
- * The sooner a waker queue is detected, the sooner
- * throughput can be boosted by injecting I/O from the
- * waker queue. Fortunately, detection is likely to be
- * actually fast, for the following reasons. While
- * blocked by synchronization, bfqq has a long think
- * time. This implies that bfqq's inject limit is at
- * least equal to 1 (see the comments in
- * bfq_update_inject_limit()). So, thanks to
- * injection, the waker queue is likely to be served
- * during the very first I/O-plugging time interval
- * for bfqq. This triggers the first step of the
- * detection mechanism. Thanks again to injection, the
- * candidate waker queue is then likely to be
- * confirmed no later than during the next
- * I/O-plugging interval for bfqq.
- */
- if (bfqd->last_completed_rq_bfqq &&
- !bfq_bfqq_has_short_ttime(bfqq) &&
- ktime_get_ns() - bfqd->last_completion <
- 200 * NSEC_PER_USEC) {
- if (bfqd->last_completed_rq_bfqq != bfqq &&
- bfqd->last_completed_rq_bfqq !=
- bfqq->waker_bfqq) {
- /*
- * First synchronization detected with
- * a candidate waker queue, or with a
- * different candidate waker queue
- * from the current one.
- */
- bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
-
- /*
- * If the waker queue disappears, then
- * bfqq->waker_bfqq must be reset. To
- * this goal, we maintain in each
- * waker queue a list, woken_list, of
- * all the queues that reference the
- * waker queue through their
- * waker_bfqq pointer. When the waker
- * queue exits, the waker_bfqq pointer
- * of all the queues in the woken_list
- * is reset.
- *
- * In addition, if bfqq is already in
- * the woken_list of a waker queue,
- * then, before being inserted into
- * the woken_list of a new waker
- * queue, bfqq must be removed from
- * the woken_list of the old waker
- * queue.
- */
- if (!hlist_unhashed(&bfqq->woken_list_node))
- hlist_del_init(&bfqq->woken_list_node);
- hlist_add_head(&bfqq->woken_list_node,
- &bfqd->last_completed_rq_bfqq->woken_list);
-
- bfq_clear_bfqq_has_waker(bfqq);
- } else if (bfqd->last_completed_rq_bfqq ==
- bfqq->waker_bfqq &&
- !bfq_bfqq_has_waker(bfqq)) {
- /*
- * synchronization with waker_bfqq
- * seen for the second time
- */
- bfq_mark_bfqq_has_waker(bfqq);
- }
- }
+ bfq_check_waker(bfqd, bfqq, now_ns);
/*
* Periodically reset inject limit, to make sure that
@@ -2047,6 +2089,9 @@ static void bfq_add_request(struct request *rq)
}
}
+ if (bfq_bfqq_sync(bfqq))
+ bfq_update_io_intensity(bfqq, now_ns);
+
elv_rb_add(&bfqq->sort_list, rq);
/*
@@ -2352,6 +2397,24 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
+ /*
+ * If bfqq has been enjoying interactive weight-raising, then
+ * reset soft_rt_next_start. We do it for the following
+ * reason. bfqq may have been conveying the I/O needed to load
+ * a soft real-time application. Such an application actually
+ * exhibits a soft real-time I/O pattern after it finishes
+ * loading, and finally starts doing its job. But, if bfqq has
+ * been receiving a lot of bandwidth so far (likely to happen
+ * on a fast device), then soft_rt_next_start now contains a
+ * high value that. So, without this reset, bfqq would be
+ * prevented from being possibly considered as soft_rt for a
+ * very long time.
+ */
+
+ if (bfqq->wr_cur_max_time !=
+ bfqq->bfqd->bfq_wr_rt_max_time)
+ bfqq->soft_rt_next_start = jiffies;
+
if (bfq_bfqq_busy(bfqq))
bfqq->bfqd->wr_busy_queues--;
bfqq->wr_coeff = 1;
@@ -2686,10 +2749,16 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
if (!bic)
return;
+ bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
+ bic->saved_inject_limit = bfqq->inject_limit;
+ bic->saved_decrease_time_jif = bfqq->decrease_time_jif;
+
bic->saved_weight = bfqq->entity.orig_weight;
bic->saved_ttime = bfqq->ttime;
bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_io_start_time = bfqq->io_start_time;
+ bic->saved_tot_idle_time = bfqq->tot_idle_time;
bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
if (unlikely(bfq_bfqq_just_created(bfqq) &&
@@ -2712,6 +2781,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bic->saved_wr_coeff = bfqq->wr_coeff;
bic->saved_wr_start_at_switch_to_srt =
bfqq->wr_start_at_switch_to_srt;
+ bic->saved_service_from_wr = bfqq->service_from_wr;
bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
@@ -2937,6 +3007,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
}
bfqd->in_service_queue = bfqq;
+ bfqd->in_serv_last_pos = 0;
}
/*
@@ -3442,20 +3513,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
* order until all the requests already queued in the device have been
* served. The last sub-condition commented above somewhat mitigates
* this problem for weight-raised queues.
+ *
+ * However, as an additional mitigation for this problem, we preserve
+ * plugging for a special symmetric case that may suddenly turn into
+ * asymmetric: the case where only bfqq is busy. In this case, not
+ * expiring bfqq does not cause any harm to any other queues in terms
+ * of service guarantees. In contrast, it avoids the following unlucky
+ * sequence of events: (1) bfqq is expired, (2) a new queue with a
+ * lower weight than bfqq becomes busy (or more queues), (3) the new
+ * queue is served until a new request arrives for bfqq, (4) when bfqq
+ * is finally served, there are so many requests of the new queue in
+ * the drive that the pending requests for bfqq take a lot of time to
+ * be served. In particular, event (2) may case even already
+ * dispatched requests of bfqq to be delayed, inside the drive. So, to
+ * avoid this series of events, the scenario is preventively declared
+ * as asymmetric also if bfqq is the only busy queues
*/
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
+ int tot_busy_queues = bfq_tot_busy_queues(bfqd);
+
/* No point in idling for bfqq if it won't get requests any longer */
if (unlikely(!bfqq_process_refs(bfqq)))
return false;
return (bfqq->wr_coeff > 1 &&
(bfqd->wr_busy_queues <
- bfq_tot_busy_queues(bfqd) ||
+ tot_busy_queues ||
bfqd->rq_in_driver >=
bfqq->dispatched + 4)) ||
- bfq_asymmetric_scenario(bfqd, bfqq);
+ bfq_asymmetric_scenario(bfqd, bfqq) ||
+ tot_busy_queues == 1;
}
static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -3939,10 +4028,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
bfq_bfqq_charge_time(bfqd, bfqq, delta);
- if (reason == BFQQE_TOO_IDLE &&
- entity->service <= 2 * entity->budget / 10)
- bfq_clear_bfqq_IO_bound(bfqq);
-
if (bfqd->low_latency && bfqq->wr_coeff == 1)
bfqq->last_wr_start_finish = jiffies;
@@ -3952,30 +4037,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* If we get here, and there are no outstanding
* requests, then the request pattern is isochronous
* (see the comments on the function
- * bfq_bfqq_softrt_next_start()). Thus we can compute
- * soft_rt_next_start. And we do it, unless bfqq is in
- * interactive weight raising. We do not do it in the
- * latter subcase, for the following reason. bfqq may
- * be conveying the I/O needed to load a soft
- * real-time application. Such an application will
- * actually exhibit a soft real-time I/O pattern after
- * it finally starts doing its job. But, if
- * soft_rt_next_start is computed here for an
- * interactive bfqq, and bfqq had received a lot of
- * service before remaining with no outstanding
- * request (likely to happen on a fast device), then
- * soft_rt_next_start would be assigned such a high
- * value that, for a very long time, bfqq would be
- * prevented from being possibly considered as soft
- * real time.
+ * bfq_bfqq_softrt_next_start()). Therefore we can
+ * compute soft_rt_next_start.
*
* If, instead, the queue still has outstanding
* requests, then we have to wait for the completion
* of all the outstanding requests to discover whether
* the request pattern is actually isochronous.
*/
- if (bfqq->dispatched == 0 &&
- bfqq->wr_coeff != bfqd->bfq_wr_coeff)
+ if (bfqq->dispatched == 0)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
else if (bfqq->dispatched > 0) {
@@ -4497,9 +4567,9 @@ check_queue:
bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
bfq_bfqq_budget_left(async_bfqq))
bfqq = bfqq->bic->bfqq[0];
- else if (bfq_bfqq_has_waker(bfqq) &&
+ else if (bfqq->waker_bfqq &&
bfq_bfqq_busy(bfqq->waker_bfqq) &&
- bfqq->next_rq &&
+ bfqq->waker_bfqq->next_rq &&
bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
bfqq->waker_bfqq) <=
bfq_bfqq_budget_left(bfqq->waker_bfqq)
@@ -4559,9 +4629,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->wr_cur_max_time)) {
if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
- bfq_wr_duration(bfqd)))
+ bfq_wr_duration(bfqd))) {
+ /*
+ * Either in interactive weight
+ * raising, or in soft_rt weight
+ * raising with the
+ * interactive-weight-raising period
+ * elapsed (so no switch back to
+ * interactive weight raising).
+ */
bfq_bfqq_end_wr(bfqq);
- else {
+ } else { /*
+ * soft_rt finishing while still in
+ * interactive period, switch back to
+ * interactive weight raising
+ */
switch_back_to_interactive_wr(bfqq, bfqd);
bfqq->entity.prio_changed = 1;
}
@@ -4640,9 +4722,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
- if (!atomic_read(&hctx->elevator_queued))
- return false;
-
/*
* Avoiding lock: a race on bfqd->busy_queues should cause at
* most a call to dispatch for nothing
@@ -4892,7 +4971,6 @@ void bfq_put_queue(struct bfq_queue *bfqq)
hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
woken_list_node) {
item->waker_bfqq = NULL;
- bfq_clear_bfqq_has_waker(item);
hlist_del_init(&item->woken_list_node);
}
@@ -5012,6 +5090,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
}
bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+ bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d",
+ bfqq->new_ioprio, bfqq->entity.new_weight);
bfqq->entity.prio_changed = 1;
}
@@ -5049,6 +5129,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
+ u64 now_ns = ktime_get_ns();
+
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
INIT_HLIST_NODE(&bfqq->burst_list_node);
@@ -5076,7 +5158,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_clear_bfqq_sync(bfqq);
/* set end request to minus infinity from now */
- bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+ bfqq->ttime.last_end_request = now_ns + 1;
+
+ bfqq->io_start_time = now_ns;
bfq_mark_bfqq_IO_bound(bfqq);
@@ -5194,11 +5278,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
struct bfq_ttime *ttime = &bfqq->ttime;
- u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+ u64 elapsed;
+ /*
+ * We are really interested in how long it takes for the queue to
+ * become busy when there is no outstanding IO for this queue. So
+ * ignore cases when the bfq queue has already IO queued.
+ */
+ if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
+ return;
+ elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
- ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
+ ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
ttime->ttime_samples);
@@ -5213,8 +5305,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfqq->wr_coeff > 1 &&
bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
- BFQQ_TOTALLY_SEEKY(bfqq))
- bfq_bfqq_end_wr(bfqq);
+ BFQQ_TOTALLY_SEEKY(bfqq)) {
+ if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd))) {
+ /*
+ * In soft_rt weight raising with the
+ * interactive-weight-raising period
+ * elapsed (so no switch back to
+ * interactive weight raising).
+ */
+ bfq_bfqq_end_wr(bfqq);
+ } else { /*
+ * stopping soft_rt weight raising
+ * while still in interactive period,
+ * switch back to interactive weight
+ * raising
+ */
+ switch_back_to_interactive_wr(bfqq, bfqd);
+ bfqq->entity.prio_changed = 1;
+ }
+ }
}
static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -5238,12 +5348,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
return;
/* Think time is infinite if no process is linked to
- * bfqq. Otherwise check average think time to
- * decide whether to mark as has_short_ttime
+ * bfqq. Otherwise check average think time to decide whether
+ * to mark as has_short_ttime. To this goal, compare average
+ * think time with half the I/O-plugging timeout.
*/
if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
(bfq_sample_valid(bfqq->ttime.ttime_samples) &&
- bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1))
has_short_ttime = false;
state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
@@ -5557,7 +5668,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
bfq_insert_request(hctx, rq, at_head);
- atomic_inc(&hctx->elevator_queued);
}
}
@@ -5925,7 +6035,6 @@ static void bfq_finish_requeue_request(struct request *rq)
bfq_completed_request(bfqq, bfqd);
bfq_finish_requeue_request_body(bfqq);
- atomic_dec(&rq->mq_hctx->elevator_queued);
spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
@@ -6489,8 +6598,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->bfq_slice_idle = bfq_slice_idle;
bfqd->bfq_timeout = bfq_timeout;
- bfqd->bfq_requests_within_timer = 120;
-
bfqd->bfq_large_burst_thresh = 8;
bfqd->bfq_burst_interval = msecs_to_jiffies(180);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 703895224562..b8e793c34ff1 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -291,6 +291,11 @@ struct bfq_queue {
/* associated @bfq_ttime struct */
struct bfq_ttime ttime;
+ /* when bfqq started to do I/O within the last observation window */
+ u64 io_start_time;
+ /* how long bfqq has remained empty during the last observ. window */
+ u64 tot_idle_time;
+
/* bit vector: a 1 for each seeky requests in history */
u32 seek_history;
@@ -371,6 +376,11 @@ struct bfq_queue {
* bfq_select_queue().
*/
struct bfq_queue *waker_bfqq;
+ /* pointer to the curr. tentative waker queue, see bfq_check_waker() */
+ struct bfq_queue *tentative_waker_bfqq;
+ /* number of times the same tentative waker has been detected */
+ unsigned int num_waker_detections;
+
/* node for woken_list, see below */
struct hlist_node woken_list_node;
/*
@@ -407,6 +417,9 @@ struct bfq_io_cq {
*/
bool saved_IO_bound;
+ u64 saved_io_start_time;
+ u64 saved_tot_idle_time;
+
/*
* Same purpose as the previous fields for the value of the
* field keeping the queue's belonging to a large burst
@@ -432,9 +445,15 @@ struct bfq_io_cq {
*/
unsigned long saved_wr_coeff;
unsigned long saved_last_wr_start_finish;
+ unsigned long saved_service_from_wr;
unsigned long saved_wr_start_at_switch_to_srt;
unsigned int saved_wr_cur_max_time;
struct bfq_ttime saved_ttime;
+
+ /* Save also injection state */
+ u64 saved_last_serv_time_ns;
+ unsigned int saved_inject_limit;
+ unsigned long saved_decrease_time_jif;
};
/**
@@ -642,14 +661,6 @@ struct bfq_data {
unsigned int bfq_timeout;
/*
- * Number of consecutive requests that must be issued within
- * the idle time slice to set again idling to a queue which
- * was marked as non-I/O-bound (see the definition of the
- * IO_bound flag for further details).
- */
- unsigned int bfq_requests_within_timer;
-
- /*
* Force device idling whenever needed to provide accurate
* service guarantees, without caring about throughput
* issues. CAVEAT: this may even increase latencies, in case
@@ -770,7 +781,6 @@ enum bfqq_state_flags {
*/
BFQQF_coop, /* bfqq is shared */
BFQQF_split_coop, /* shared bfqq will be split */
- BFQQF_has_waker /* bfqq has a waker queue */
};
#define BFQ_BFQQ_FNS(name) \
@@ -790,7 +800,6 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
-BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS
/* Expiration reasons. */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 26776bdbdf36..070e34a7feb1 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -137,9 +137,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
sd->next_in_service = next_in_service;
- if (!next_in_service)
- return parent_sched_may_change;
-
return parent_sched_may_change;
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9ffd7e289554..dfa652122a2d 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -14,8 +14,6 @@
#include <linux/slab.h>
#include "blk.h"
-#define BIP_INLINE_VECS 4
-
static struct kmem_cache *bip_slab;
static struct workqueue_struct *kintegrityd_wq;
@@ -30,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs,
if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
if (bip->bip_vec)
bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_slab);
+ bip->bip_max_vcnt);
mempool_free(bip, &bs->bio_integrity_pool);
} else {
kfree(bip);
@@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
inline_vecs = nr_vecs;
} else {
bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIP_INLINE_VECS;
+ inline_vecs = BIO_INLINE_VECS;
}
if (unlikely(!bip))
@@ -72,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
memset(bip, 0, sizeof(*bip));
if (nr_vecs > inline_vecs) {
- unsigned long idx = 0;
-
- bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- &bs->bvec_integrity_pool);
+ bip->bip_max_vcnt = nr_vecs;
+ bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
+ &bip->bip_max_vcnt, gfp_mask);
if (!bip->bip_vec)
goto err;
- bip->bip_max_vcnt = bvec_nr_vecs(idx);
- bip->bip_slab = idx;
} else {
bip->bip_vec = bip->bip_inline_vecs;
bip->bip_max_vcnt = inline_vecs;
@@ -140,7 +135,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
- bvec_gap_to_prev(bio->bi_disk->queue,
+ bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
@@ -162,7 +157,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
static blk_status_t bio_integrity_process(struct bio *bio,
struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
@@ -171,7 +166,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
void *prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
- iter.disk_name = bio->bi_disk->disk_name;
+ iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = prot_buf;
@@ -208,8 +203,8 @@ static blk_status_t bio_integrity_process(struct bio *bio,
bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
- struct request_queue *q = bio->bi_disk->queue;
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
@@ -329,7 +324,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
struct bio_integrity_payload *bip =
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
/*
* At the moment verify is called bio's iterator was advanced
@@ -355,7 +350,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
*/
bool __bio_integrity_endio(struct bio *bio)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
@@ -381,7 +376,7 @@ bool __bio_integrity_endio(struct bio *bio)
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
bip->bip_iter.bi_sector += bytes_done >> 9;
@@ -397,7 +392,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
void bio_integrity_trim(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
}
@@ -470,6 +465,6 @@ void __init bio_integrity_init(void)
bip_slab = kmem_cache_create("bio_integrity_payload",
sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIP_INLINE_VECS,
+ sizeof(struct bio_vec) * BIO_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
diff --git a/block/bio.c b/block/bio.c
index 2f21d2958b60..a1c4d2900c7a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,27 +19,40 @@
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
+#include <linux/xarray.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS 4
-
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
-static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
- BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
+static struct biovec_slab {
+ int nr_vecs;
+ char *name;
+ struct kmem_cache *slab;
+} bvec_slabs[] __read_mostly = {
+ { .nr_vecs = 16, .name = "biovec-16" },
+ { .nr_vecs = 64, .name = "biovec-64" },
+ { .nr_vecs = 128, .name = "biovec-128" },
+ { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" },
};
-#undef BV
+
+static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
+{
+ switch (nr_vecs) {
+ /* smaller bios use inline vecs */
+ case 5 ... 16:
+ return &bvec_slabs[0];
+ case 17 ... 64:
+ return &bvec_slabs[1];
+ case 65 ... 128:
+ return &bvec_slabs[2];
+ case 129 ... BIO_MAX_PAGES:
+ return &bvec_slabs[3];
+ default:
+ BUG();
+ return NULL;
+ }
+}
/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
@@ -58,178 +71,133 @@ struct bio_slab {
char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
+static DEFINE_XARRAY(bio_slabs);
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
+static struct bio_slab *create_bio_slab(unsigned int size)
{
- unsigned int sz = sizeof(struct bio) + extra_size;
- struct kmem_cache *slab = NULL;
- struct bio_slab *bslab, *new_bio_slabs;
- unsigned int new_bio_slab_max;
- unsigned int i, entry = -1;
+ struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
- mutex_lock(&bio_slab_lock);
+ if (!bslab)
+ return NULL;
- i = 0;
- while (i < bio_slab_nr) {
- bslab = &bio_slabs[i];
+ snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
+ bslab->slab = kmem_cache_create(bslab->name, size,
+ ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
+ if (!bslab->slab)
+ goto fail_alloc_slab;
- if (!bslab->slab && entry == -1)
- entry = i;
- else if (bslab->slab_size == sz) {
- slab = bslab->slab;
- bslab->slab_ref++;
- break;
- }
- i++;
- }
+ bslab->slab_ref = 1;
+ bslab->slab_size = size;
- if (slab)
- goto out_unlock;
-
- if (bio_slab_nr == bio_slab_max && entry == -1) {
- new_bio_slab_max = bio_slab_max << 1;
- new_bio_slabs = krealloc(bio_slabs,
- new_bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!new_bio_slabs)
- goto out_unlock;
- bio_slab_max = new_bio_slab_max;
- bio_slabs = new_bio_slabs;
- }
- if (entry == -1)
- entry = bio_slab_nr++;
+ if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
+ return bslab;
+
+ kmem_cache_destroy(bslab->slab);
- bslab = &bio_slabs[entry];
+fail_alloc_slab:
+ kfree(bslab);
+ return NULL;
+}
+
+static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
+{
+ return bs->front_pad + sizeof(struct bio) + bs->back_pad;
+}
- snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
- slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!slab)
- goto out_unlock;
+static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
+{
+ unsigned int size = bs_bio_slab_size(bs);
+ struct bio_slab *bslab;
- bslab->slab = slab;
- bslab->slab_ref = 1;
- bslab->slab_size = sz;
-out_unlock:
+ mutex_lock(&bio_slab_lock);
+ bslab = xa_load(&bio_slabs, size);
+ if (bslab)
+ bslab->slab_ref++;
+ else
+ bslab = create_bio_slab(size);
mutex_unlock(&bio_slab_lock);
- return slab;
+
+ if (bslab)
+ return bslab->slab;
+ return NULL;
}
static void bio_put_slab(struct bio_set *bs)
{
struct bio_slab *bslab = NULL;
- unsigned int i;
+ unsigned int slab_size = bs_bio_slab_size(bs);
mutex_lock(&bio_slab_lock);
- for (i = 0; i < bio_slab_nr; i++) {
- if (bs->bio_slab == bio_slabs[i].slab) {
- bslab = &bio_slabs[i];
- break;
- }
- }
-
+ bslab = xa_load(&bio_slabs, slab_size);
if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
goto out;
+ WARN_ON_ONCE(bslab->slab != bs->bio_slab);
+
WARN_ON(!bslab->slab_ref);
if (--bslab->slab_ref)
goto out;
+ xa_erase(&bio_slabs, slab_size);
+
kmem_cache_destroy(bslab->slab);
- bslab->slab = NULL;
+ kfree(bslab);
out:
mutex_unlock(&bio_slab_lock);
}
-unsigned int bvec_nr_vecs(unsigned short idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
- return bvec_slabs[--idx].nr_vecs;
-}
-
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
- if (!idx)
- return;
- idx--;
+ BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES);
- BIO_BUG_ON(idx >= BVEC_POOL_NR);
-
- if (idx == BVEC_POOL_MAX) {
+ if (nr_vecs == BIO_MAX_PAGES)
mempool_free(bv, pool);
- } else {
- struct biovec_slab *bvs = bvec_slabs + idx;
+ else if (nr_vecs > BIO_INLINE_VECS)
+ kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
+}
- kmem_cache_free(bvs->slab, bv);
- }
+/*
+ * Make the first allocation restricted and don't dump info on allocation
+ * failures, since we'll fall back to the mempool in case of failure.
+ */
+static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
+{
+ return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
- mempool_t *pool)
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+ gfp_t gfp_mask)
{
- struct bio_vec *bvl;
+ struct biovec_slab *bvs = biovec_slab(*nr_vecs);
- /*
- * see comment near bvec_array define!
- */
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
+ if (WARN_ON_ONCE(!bvs))
return NULL;
- }
/*
- * idx now points to the pool we want to allocate from. only the
- * 1-vec entry pool is mempool backed.
+ * Upgrade the nr_vecs request to take full advantage of the allocation.
+ * We also rely on this in the bvec_free path.
*/
- if (*idx == BVEC_POOL_MAX) {
-fallback:
- bvl = mempool_alloc(pool, gfp_mask);
- } else {
- struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
+ *nr_vecs = bvs->nr_vecs;
- /*
- * Make this allocation restricted and don't dump info on
- * allocation failures, since we'll fallback to the mempool
- * in case of failure.
- */
- __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ /*
+ * Try a slab allocation first for all smaller allocations. If that
+ * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
+ * The mempool is sized to handle up to BIO_MAX_PAGES entries.
+ */
+ if (*nr_vecs < BIO_MAX_PAGES) {
+ struct bio_vec *bvl;
- /*
- * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
- * is set, retry with the 1-entry mempool
- */
- bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
- *idx = BVEC_POOL_MAX;
- goto fallback;
- }
+ bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
+ if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return bvl;
+ *nr_vecs = BIO_MAX_PAGES;
}
- (*idx)++;
- return bvl;
+ return mempool_alloc(pool, gfp_mask);
}
void bio_uninit(struct bio *bio)
@@ -255,7 +223,7 @@ static void bio_free(struct bio *bio)
bio_uninit(bio);
if (bs) {
- bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+ bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
/*
* If we have front padding, adjust the bio pointer before freeing
@@ -299,12 +267,8 @@ EXPORT_SYMBOL(bio_init);
*/
void bio_reset(struct bio *bio)
{
- unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
bio_uninit(bio);
-
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
@@ -405,122 +369,97 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
- * Description:
- * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- * backed by the @bs's mempool.
+ * Allocate a bio from the mempools in @bs.
*
- * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
- * always be able to allocate a bio. This is due to the mempool guarantees.
- * To make this work, callers must never allocate more than 1 bio at a time
- * from this pool. Callers that need to allocate more than 1 bio must always
- * submit the previously allocated bio for IO before attempting to allocate
- * a new one. Failure to do so can cause deadlocks under memory pressure.
+ * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
+ * allocate a bio. This is due to the mempool guarantees. To make this work,
+ * callers must never allocate more than 1 bio at a time from the general pool.
+ * Callers that need to allocate more than 1 bio must always submit the
+ * previously allocated bio for IO before attempting to allocate a new one.
+ * Failure to do so can cause deadlocks under memory pressure.
*
- * Note that when running under submit_bio_noacct() (i.e. any block
- * driver), bios are not submitted until after you return - see the code in
- * submit_bio_noacct() that converts recursion into iteration, to prevent
- * stack overflows.
+ * Note that when running under submit_bio_noacct() (i.e. any block driver),
+ * bios are not submitted until after you return - see the code in
+ * submit_bio_noacct() that converts recursion into iteration, to prevent
+ * stack overflows.
*
- * This would normally mean allocating multiple bios under
- * submit_bio_noacct() would be susceptible to deadlocks, but we have
- * deadlock avoidance code that resubmits any blocked bios from a rescuer
- * thread.
+ * This would normally mean allocating multiple bios under submit_bio_noacct()
+ * would be susceptible to deadlocks, but we have
+ * deadlock avoidance code that resubmits any blocked bios from a rescuer
+ * thread.
*
- * However, we do not guarantee forward progress for allocations from other
- * mempools. Doing multiple allocations from the same mempool under
- * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
- * for per bio allocations.
+ * However, we do not guarantee forward progress for allocations from other
+ * mempools. Doing multiple allocations from the same mempool under
+ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
+ * for per bio allocations.
*
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
+ * Returns: Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
- unsigned front_pad;
- unsigned inline_vecs;
- struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
- if (!bs) {
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
- front_pad = 0;
- inline_vecs = nr_iovecs;
- } else {
- /* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
- nr_iovecs > 0))
- return NULL;
- /*
- * submit_bio_noacct() converts recursion to iteration; this
- * means if we're running beneath it, any bios we allocate and
- * submit will not be submitted (and thus freed) until after we
- * return.
- *
- * This exposes us to a potential deadlock if we allocate
- * multiple bios from the same bio_set() while running
- * underneath submit_bio_noacct(). If we were to allocate
- * multiple bios (say a stacking block driver that was splitting
- * bios), we would deadlock if we exhausted the mempool's
- * reserve.
- *
- * We solve this, and guarantee forward progress, with a rescuer
- * workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
- * bios we would be blocking to the rescuer workqueue before
- * we retry with the original gfp_flags.
- */
-
- if (current->bio_list &&
- (!bio_list_empty(&current->bio_list[0]) ||
- !bio_list_empty(&current->bio_list[1])) &&
- bs->rescue_workqueue)
- gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+ /* should not use nobvec bioset for nr_iovecs > 0 */
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
+ return NULL;
+ /*
+ * submit_bio_noacct() converts recursion to iteration; this means if
+ * we're running beneath it, any bios we allocate and submit will not be
+ * submitted (and thus freed) until after we return.
+ *
+ * This exposes us to a potential deadlock if we allocate multiple bios
+ * from the same bio_set() while running underneath submit_bio_noacct().
+ * If we were to allocate multiple bios (say a stacking block driver
+ * that was splitting bios), we would deadlock if we exhausted the
+ * mempool's reserve.
+ *
+ * We solve this, and guarantee forward progress, with a rescuer
+ * workqueue per bio_set. If we go to allocate and there are bios on
+ * current->bio_list, we first try the allocation without
+ * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
+ * blocking to the rescuer workqueue before we retry with the original
+ * gfp_flags.
+ */
+ if (current->bio_list &&
+ (!bio_list_empty(&current->bio_list[0]) ||
+ !bio_list_empty(&current->bio_list[1])) &&
+ bs->rescue_workqueue)
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ if (!p && gfp_mask != saved_gfp) {
+ punt_bios_to_rescuer(bs);
+ gfp_mask = saved_gfp;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
- if (!p && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- p = mempool_alloc(&bs->bio_pool, gfp_mask);
- }
-
- front_pad = bs->front_pad;
- inline_vecs = BIO_INLINE_VECS;
}
-
if (unlikely(!p))
return NULL;
- bio = p + front_pad;
- bio_init(bio, NULL, 0);
+ bio = p + bs->front_pad;
+ if (nr_iovecs > BIO_INLINE_VECS) {
+ struct bio_vec *bvl = NULL;
- if (nr_iovecs > inline_vecs) {
- unsigned long idx = 0;
-
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
}
-
if (unlikely(!bvl))
goto err_free;
- bio->bi_flags |= idx << BVEC_POOL_OFFSET;
+ bio_init(bio, bvl, nr_iovecs);
} else if (nr_iovecs) {
- bvl = bio->bi_inline_vecs;
+ bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
+ } else {
+ bio_init(bio, NULL, 0);
}
bio->bi_pool = bs;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
return bio;
err_free:
@@ -529,6 +468,31 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
+/**
+ * bio_kmalloc - kmalloc a bio for I/O
+ * @gfp_mask: the GFP_* mask given to the slab allocator
+ * @nr_iovecs: number of iovecs to pre-allocate
+ *
+ * Use kmalloc to allocate and initialize a bio.
+ *
+ * Returns: Pointer to new bio on success, NULL on failure.
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
+{
+ struct bio *bio;
+
+ if (nr_iovecs > UIO_MAXIOV)
+ return NULL;
+
+ bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
+ if (unlikely(!bio))
+ return NULL;
+ bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
+ bio->bi_pool = NULL;
+ return bio;
+}
+EXPORT_SYMBOL(bio_kmalloc);
+
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
unsigned long flags;
@@ -607,16 +571,7 @@ void bio_truncate(struct bio *bio, unsigned new_size)
*/
void guard_bio_eod(struct bio *bio)
{
- sector_t maxsector;
- struct block_device *part;
-
- rcu_read_lock();
- part = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (part)
- maxsector = bdev_nr_sectors(part);
- else
- maxsector = get_capacity(bio->bi_disk);
- rcu_read_unlock();
+ sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
if (!maxsector)
return;
@@ -673,17 +628,18 @@ EXPORT_SYMBOL(bio_put);
*/
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
- BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
+ WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
/*
- * most users will be overriding ->bi_disk with a new target,
+ * most users will be overriding ->bi_bdev with a new target,
* so we don't set nor calculate new physical/hw segment counts here
*/
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_partno = bio_src->bi_partno;
+ bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
+ if (bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
@@ -730,7 +686,7 @@ EXPORT_SYMBOL(bio_clone_fast);
const char *bio_devname(struct bio *bio, char *buf)
{
- return disk_name(bio->bi_disk, bio->bi_partno, buf);
+ return bdevname(bio->bi_bdev, buf);
}
EXPORT_SYMBOL(bio_devname);
@@ -870,7 +826,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@@ -993,21 +949,18 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
}
EXPORT_SYMBOL_GPL(bio_release_pages);
-static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
- const struct bio_vec *bv = iter->bvec;
- unsigned int len;
- size_t size;
-
- if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
- return -EINVAL;
-
- len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
- size = bio_add_page(bio, bv->bv_page, len,
- bv->bv_offset + iter->iov_offset);
- if (unlikely(size != len))
- return -EINVAL;
- iov_iter_advance(iter, size);
+ WARN_ON_ONCE(bio->bi_max_vecs);
+
+ bio->bi_vcnt = iter->nr_segs;
+ bio->bi_io_vec = (struct bio_vec *)iter->bvec;
+ bio->bi_iter.bi_bvec_done = iter->iov_offset;
+ bio->bi_iter.bi_size = iter->count;
+ bio_set_flag(bio, BIO_NO_PAGE_REF);
+ bio_set_flag(bio, BIO_CLONED);
+
+ iov_iter_advance(iter, iter->count);
return 0;
}
@@ -1070,7 +1023,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
@@ -1121,41 +1074,40 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
* This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and
* map them into the kernel. On IO completion, the caller should put those
- * pages. If we're adding kernel pages, and the caller told us it's safe to
- * do so, we just have to add the pages to the bio directly. We don't grab an
- * extra reference to those pages (the user should already have that), and we
- * don't put the page on IO completion. The caller needs to check if the bio is
- * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
- * released.
+ * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
+ * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
+ * to ensure the bvecs and pages stay referenced until the submitted I/O is
+ * completed by a call to ->ki_complete() or returns with an error other than
+ * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
+ * on IO completion. If it isn't, then pages should be released.
*
* The function tries, but does not guarantee, to pin as many pages as
* fit into the bio, or are requested in @iter, whatever is smaller. If
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
+ *
+ * It's intended for direct IO, so doesn't do PSI tracking, the caller is
+ * responsible for setting BIO_WORKINGSET if necessary.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
- const bool is_bvec = iov_iter_is_bvec(iter);
- int ret;
+ int ret = 0;
- if (WARN_ON_ONCE(bio->bi_vcnt))
- return -EINVAL;
+ if (iov_iter_is_bvec(iter)) {
+ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+ return -EINVAL;
+ return bio_iov_bvec_set(bio, iter);
+ }
do {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- if (WARN_ON_ONCE(is_bvec))
- return -EINVAL;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = __bio_iov_append_get_pages(bio, iter);
- } else {
- if (is_bvec)
- ret = __bio_iov_bvec_add_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
- }
+ else
+ ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- if (is_bvec)
- bio_set_flag(bio, BIO_NO_PAGE_REF);
+ /* don't account direct I/O as memory stall */
+ bio_clear_flag(bio, BIO_WORKINGSET);
return bio->bi_vcnt ? 0 : ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
@@ -1178,7 +1130,8 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
- DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done;
@@ -1455,8 +1408,8 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_disk)
- rq_qos_done_bio(bio->bi_disk->queue, bio);
+ if (bio->bi_bdev)
+ rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
/*
* Need to have a real endio function for chained bios, otherwise
@@ -1471,8 +1424,8 @@ again:
goto again;
}
- if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_complete(bio->bi_disk->queue, bio);
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
@@ -1559,7 +1512,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
*/
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
- struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
+ struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}
@@ -1612,15 +1565,17 @@ int bioset_init(struct bio_set *bs,
unsigned int front_pad,
int flags)
{
- unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-
bs->front_pad = front_pad;
+ if (flags & BIOSET_NEED_BVECS)
+ bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+ else
+ bs->back_pad = 0;
spin_lock_init(&bs->rescue_lock);
bio_list_init(&bs->rescue_list);
INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
- bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
+ bs->bio_slab = bio_find_or_create_slab(bs);
if (!bs->bio_slab)
return -ENOMEM;
@@ -1663,39 +1618,19 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);
-static void __init biovec_init_slabs(void)
+static int __init init_bio(void)
{
int i;
- for (i = 0; i < BVEC_POOL_NR; i++) {
- int size;
- struct biovec_slab *bvs = bvec_slabs + i;
+ bio_integrity_init();
- if (bvs->nr_vecs <= BIO_INLINE_VECS) {
- bvs->slab = NULL;
- continue;
- }
+ for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
+ struct biovec_slab *bvs = bvec_slabs + i;
- size = bvs->nr_vecs * sizeof(struct bio_vec);
- bvs->slab = kmem_cache_create(bvs->name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ bvs->slab = kmem_cache_create(bvs->name,
+ bvs->nr_vecs * sizeof(struct bio_vec), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
-}
-
-static int __init init_bio(void)
-{
- bio_slab_max = 2;
- bio_slab_nr = 0;
- bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
- GFP_KERNEL);
-
- BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);
-
- if (!bio_slabs)
- panic("bio: can't allocate bios\n");
-
- bio_integrity_init();
- biovec_init_slabs();
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4221a1539391..a317c03d40f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,8 +32,6 @@
#include <linux/psi.h>
#include "blk.h"
-#define MAX_KEY_LEN 100
-
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -1765,12 +1763,15 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
if (unlikely(current->flags & PF_KTHREAD))
return;
- if (!blk_get_queue(q))
- return;
+ if (current->throttle_queue != q) {
+ if (!blk_get_queue(q))
+ return;
+
+ if (current->throttle_queue)
+ blk_put_queue(current->throttle_queue);
+ current->throttle_queue = q;
+ }
- if (current->throttle_queue)
- blk_put_queue(current->throttle_queue);
- current->throttle_queue = q;
if (use_memdelay)
current->use_memdelay = use_memdelay;
set_notify_resume(current);
@@ -1808,7 +1809,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
struct blkcg_gq *blkg, *ret_blkg = NULL;
rcu_read_lock();
- blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+ blkg = blkg_lookup_create(css_to_blkcg(css),
+ bio->bi_bdev->bd_disk->queue);
while (blkg) {
if (blkg_tryget(blkg)) {
ret_blkg = blkg;
@@ -1844,8 +1846,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css);
} else {
- blkg_get(bio->bi_disk->queue->root_blkg);
- bio->bi_blkg = bio->bi_disk->queue->root_blkg;
+ blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
+ bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
diff --git a/block/blk-core.c b/block/blk-core.c
index 7663a9b94b80..5e752840b41a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -476,7 +476,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
static inline int bio_queue_enter(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
bool nowait = bio->bi_opf & REQ_NOWAIT;
int ret;
@@ -531,7 +531,7 @@ struct request_queue *blk_alloc_queue(int node_id)
if (q->id < 0)
goto fail_q;
- ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
goto fail_id;
@@ -692,11 +692,9 @@ static inline bool should_fail_request(struct block_device *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
-static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
+static inline bool bio_check_ro(struct bio *bio)
{
- const int op = bio_op(bio);
-
- if (part->bd_read_only && op_is_write(op)) {
+ if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
char b[BDEVNAME_SIZE];
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
@@ -704,7 +702,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
WARN_ONCE(1,
"Trying to write to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), part->bd_partno);
+ bio_devname(bio, b), bio->bi_bdev->bd_partno);
/* Older lvm-tools actually trigger this */
return false;
}
@@ -714,7 +712,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
static noinline int should_fail_bio(struct bio *bio)
{
- if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size))
+ if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
return -EIO;
return 0;
}
@@ -725,8 +723,9 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
* This may well happen - the kernel calls bread() without checking the size of
* the device, e.g., when mounting a file system.
*/
-static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
+static inline int bio_check_eod(struct bio *bio)
{
+ sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
unsigned int nr_sectors = bio_sectors(bio);
if (nr_sectors && maxsector &&
@@ -741,33 +740,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
-static inline int blk_partition_remap(struct bio *bio)
+static int blk_partition_remap(struct bio *bio)
{
- struct block_device *p;
- int ret = -EIO;
+ struct block_device *p = bio->bi_bdev;
- rcu_read_lock();
- p = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (unlikely(!p))
- goto out;
if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
- goto out;
- if (unlikely(bio_check_ro(bio, p)))
- goto out;
-
+ return -EIO;
if (bio_sectors(bio)) {
- if (bio_check_eod(bio, bdev_nr_sectors(p)))
- goto out;
bio->bi_iter.bi_sector += p->bd_start_sect;
trace_block_bio_remap(bio, p->bd_dev,
bio->bi_iter.bi_sector -
p->bd_start_sect);
}
- bio->bi_partno = 0;
- ret = 0;
-out:
- rcu_read_unlock();
- return ret;
+ bio_set_flag(bio, BIO_REMAPPED);
+ return 0;
}
/*
@@ -807,7 +793,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct block_device *bdev = bio->bi_bdev;
+ struct request_queue *q = bdev->bd_disk->queue;
blk_status_t status = BLK_STS_IOERR;
struct blk_plug *plug;
@@ -826,14 +813,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
if (should_fail_bio(bio))
goto end_io;
-
- if (bio->bi_partno) {
- if (unlikely(blk_partition_remap(bio)))
- goto end_io;
- } else {
- if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
+ if (unlikely(bio_check_ro(bio)))
+ goto end_io;
+ if (!bio_flagged(bio, BIO_REMAPPED)) {
+ if (unlikely(bio_check_eod(bio)))
goto end_io;
- if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
+ if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
goto end_io;
}
@@ -926,7 +911,7 @@ end_io:
static blk_qc_t __submit_bio(struct bio *bio)
{
- struct gendisk *disk = bio->bi_disk;
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
blk_qc_t ret = BLK_QC_T_NONE;
if (blk_crypto_bio_prep(&bio)) {
@@ -968,7 +953,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct bio_list lower, same;
if (unlikely(bio_queue_enter(bio) != 0))
@@ -989,7 +974,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
bio_list_init(&lower);
bio_list_init(&same);
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_disk->queue)
+ if (q == bio->bi_bdev->bd_disk->queue)
bio_list_add(&same, bio);
else
bio_list_add(&lower, bio);
@@ -1014,7 +999,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
current->bio_list = bio_list;
do {
- struct gendisk *disk = bio->bi_disk;
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
if (unlikely(bio_queue_enter(bio) != 0))
continue;
@@ -1057,7 +1042,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
return BLK_QC_T_NONE;
}
- if (!bio->bi_disk->fops->submit_bio)
+ if (!bio->bi_bdev->bd_disk->fops->submit_bio)
return __submit_bio_noacct_mq(bio);
return __submit_bio_noacct(bio);
}
@@ -1069,7 +1054,7 @@ EXPORT_SYMBOL(submit_bio_noacct);
*
* submit_bio() is used to submit I/O requests to block devices. It is passed a
* fully set up &struct bio that describes the I/O that needs to be done. The
- * bio will be send to the device described by the bi_disk and bi_partno fields.
+ * bio will be send to the device described by the bi_bdev field.
*
* The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback
@@ -1089,7 +1074,8 @@ blk_qc_t submit_bio(struct bio *bio)
unsigned int count;
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
- count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
+ count = queue_logical_block_size(
+ bio->bi_bdev->bd_disk->queue) >> 9;
else
count = bio_sectors(bio);
@@ -1313,7 +1299,11 @@ void blk_account_io_start(struct request *rq)
if (!blk_do_io_stat(rq))
return;
- rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+ /* passthrough requests can hold bios that do not have ->bi_bdev set */
+ if (rq->bio && rq->bio->bi_bdev)
+ rq->part = rq->bio->bi_bdev;
+ else
+ rq->part = rq->rq_disk->part0;
part_stat_lock();
update_io_ticks(rq->part, jiffies, false);
@@ -1336,14 +1326,17 @@ static unsigned long __part_start_io_acct(struct block_device *part,
return now;
}
-unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part,
- struct bio *bio)
+/**
+ * bio_start_io_acct - start I/O accounting for bio based drivers
+ * @bio: bio to start account for
+ *
+ * Returns the start time that should be passed back to bio_end_io_acct().
+ */
+unsigned long bio_start_io_acct(struct bio *bio)
{
- *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
-
- return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
+ return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio));
}
-EXPORT_SYMBOL_GPL(part_start_io_acct);
+EXPORT_SYMBOL_GPL(bio_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
unsigned int op)
@@ -1366,12 +1359,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op,
part_stat_unlock();
}
-void part_end_io_acct(struct block_device *part, struct bio *bio,
- unsigned long start_time)
+void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+ struct block_device *orig_bdev)
{
- __part_end_io_acct(part, bio_op(bio), start_time);
+ __part_end_io_acct(orig_bdev, bio_op(bio), start_time);
}
-EXPORT_SYMBOL_GPL(part_end_io_acct);
+EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time)
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c162b754efbd..e8327c50d7c9 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -164,10 +164,12 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
struct bio_vec bv;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
+ bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src));
if (!bio)
return NULL;
- bio->bi_disk = bio_src->bi_disk;
+ bio->bi_bdev = bio_src->bi_bdev;
+ if (bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 5da43f0973b4..09fcb18fa778 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
- if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm,
+ if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
&bc_key->crypto_cfg))
return true;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 85324d53d072..beae70a0e5e5 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -31,8 +31,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
}
/**
- * blk_execute_rq_nowait - insert a request into queue for execution
- * @q: queue to insert the request in
+ * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
@@ -45,9 +44,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
* Note:
* This function will invoke @done directly if the queue is dead.
*/
-void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
- struct request *rq, int at_head,
- rq_end_io_fn *done)
+void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq,
+ int at_head, rq_end_io_fn *done)
{
WARN_ON(irqs_disabled());
WARN_ON(!blk_rq_is_passthrough(rq));
@@ -67,7 +65,6 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
/**
* blk_execute_rq - insert a request into queue for execution
- * @q: queue to insert the request in
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
@@ -76,14 +73,13 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
-void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
- struct request *rq, int at_head)
+void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
unsigned long hang_check;
rq->end_io_data = &wait;
- blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
+ blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 76c1624cb06c..7942ca6ed321 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -432,23 +432,18 @@ void blk_insert_flush(struct request *rq)
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
- * @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
* Issue a flush for the block device in question.
*/
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+int blkdev_issue_flush(struct block_device *bdev)
{
- struct bio *bio;
- int ret = 0;
+ struct bio bio;
- bio = bio_alloc(gfp_mask, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- ret = submit_bio_wait(bio);
- bio_put(bio);
- return ret;
+ bio_init(&bio, NULL, 0);
+ bio_set_dev(&bio, bdev);
+ bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 808768f6b174..ffb4aa0ea68b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -298,14 +298,13 @@ split:
* Split a bio into two bios, chain the two bios, submit the second half and
* store a pointer to the first half in *@bio. If the second bio is still too
* big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is
- * the responsibility of the caller to ensure that
- * @bio->bi_disk->queue->bio_split is only released after processing of the
- * split bio has finished.
+ * function may allocate a new bio from q->bio_split, it is the responsibility
+ * of the caller to ensure that q->bio_split is only released after processing
+ * of the split bio has finished.
*/
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
{
- struct request_queue *q = (*bio)->bi_disk->queue;
+ struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
@@ -358,9 +357,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*
* Split a bio into two bios, chains the two bios, submit the second half and
* store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of
- * the caller to ensure that @bio->bi_disk->queue->bio_split is only released
- * after processing of the split bio has finished.
+ * a new bio from q->bio_split, it is the responsibility of the caller to ensure
+ * that q->bio_split is only released after processing of the split bio has
+ * finished.
*/
void blk_queue_split(struct bio **bio)
{
@@ -866,7 +865,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
/* must be same device */
- if (rq->rq_disk != bio->bi_disk)
+ if (rq->rq_disk != bio->bi_bdev->bd_disk)
return false;
/* only merge integrity protected bio into ditto rq */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f285a9123a8b..f21d922ecfaf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1646,6 +1646,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
+/*
+ * Is the request queue handled by an IO scheduler that does not respect
+ * hardware queues when dispatching?
+ */
+static bool blk_mq_has_sqsched(struct request_queue *q)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.dispatch_request &&
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+ return true;
+ return false;
+}
+
+/*
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ /*
+ * If the IO scheduler does not respect hardware queues when
+ * dispatching, we just don't bother with multiple HW queues and
+ * dispatch from hctx for the current CPU since running multiple queues
+ * just causes lock contention inside the scheduler and pointless cache
+ * bouncing.
+ */
+ hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
+ raw_smp_processor_id());
+ if (!blk_mq_hctx_stopped(hctx))
+ return hctx;
+ return NULL;
+}
+
/**
* blk_mq_run_hw_queues - Run all hardware queues in a request queue.
* @q: Pointer to the request queue to run.
@@ -1653,14 +1689,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue);
*/
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
- struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
int i;
+ sq_hctx = NULL;
+ if (blk_mq_has_sqsched(q))
+ sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
-
- blk_mq_run_hw_queue(hctx, async);
+ /*
+ * Dispatch from this hctx either if there's no hctx preferred
+ * by IO scheduler or if it has requests that bypass the
+ * scheduler.
+ */
+ if (!sq_hctx || sq_hctx == hctx ||
+ !list_empty_careful(&hctx->dispatch))
+ blk_mq_run_hw_queue(hctx, async);
}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1672,14 +1717,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
*/
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
- struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
int i;
+ sq_hctx = NULL;
+ if (blk_mq_has_sqsched(q))
+ sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
-
- blk_mq_delay_run_hw_queue(hctx, msecs);
+ /*
+ * Dispatch from this hctx either if there's no hctx preferred
+ * by IO scheduler or if it has requests that bypass the
+ * scheduler.
+ */
+ if (!sq_hctx || sq_hctx == hctx ||
+ !list_empty_careful(&hctx->dispatch))
+ blk_mq_delay_run_hw_queue(hctx, msecs);
}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
@@ -2128,7 +2182,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
@@ -2653,7 +2707,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
goto free_hctx;
atomic_set(&hctx->nr_active, 0);
- atomic_set(&hctx->elevator_queued, 0);
if (node == NUMA_NO_NODE)
node = set->numa_node;
hctx->numa_node = node;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..7dd8be314ac6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = BLK_ZONED_NONE;
+ lim->zone_write_granularity = 0;
}
EXPORT_SYMBOL(blk_set_default_limits);
@@ -367,6 +368,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
EXPORT_SYMBOL(blk_queue_physical_block_size);
/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q: the request queue for the zoned device
+ * @size: the zone write granularity size, in bytes
+ *
+ * Description:
+ * This should be set to the lowest possible size allowing to write in
+ * sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q,
+ unsigned int size)
+{
+ if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+ return;
+
+ q->limits.zone_write_granularity = size;
+
+ if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+ q->limits.zone_write_granularity = q->limits.logical_block_size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+
+/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
@@ -631,6 +654,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->discard_granularity;
}
+ t->zone_write_granularity = max(t->zone_write_granularity,
+ b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
return ret;
}
@@ -847,6 +872,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
*/
void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
{
+ struct request_queue *q = disk->queue;
+
switch (model) {
case BLK_ZONED_HM:
/*
@@ -865,7 +892,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
* we do nothing special as far as the block layer is concerned.
*/
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
- disk_has_partitions(disk))
+ !xa_empty(&disk->part_tbl))
model = BLK_ZONED_NONE;
break;
case BLK_ZONED_NONE:
@@ -875,7 +902,17 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
break;
}
- disk->queue->limits.zoned = model;
+ q->limits.zoned = model;
+ if (model != BLK_ZONED_NONE) {
+ /*
+ * Set the zone write granularity to the device logical block
+ * size by default. The driver can change this value if needed.
+ */
+ blk_queue_zone_write_granularity(q,
+ queue_logical_block_size(q));
+ } else {
+ blk_queue_clear_zone_settings(q);
+ }
}
EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..ae39c7f3d83d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,6 +219,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}
+static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
+ char *page)
+{
+ return queue_var_show(queue_zone_write_granularity(q), page);
+}
+
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -585,6 +591,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
QUEUE_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -639,6 +646,7 @@ static struct attribute *queue_attrs[] = {
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
&queue_zone_append_max_entry.attr,
+ &queue_zone_write_granularity_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d52cac9f3a7c..b1b22d863bdf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2178,7 +2178,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
bool blk_throtl_bio(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0321ca83e73f..42aed0160f86 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -518,7 +518,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}
-static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+static inline bool wbt_should_throttle(struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -545,7 +545,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
if (bio_op(bio) == REQ_OP_READ) {
flags = WBT_READ;
- } else if (wbt_should_throttle(rwb, bio)) {
+ } else if (wbt_should_throttle(bio)) {
if (current_is_kswapd())
flags |= WBT_KSWAPD;
if (bio_op(bio) == REQ_OP_DISCARD)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 7a68b6e4300c..833978c02e60 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -549,3 +549,20 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+
+void blk_queue_clear_zone_settings(struct request_queue *q)
+{
+ blk_mq_freeze_queue(q);
+
+ blk_queue_free_zone_bitmaps(q);
+ blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
+ q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
+ q->nr_zones = 0;
+ q->max_open_zones = 0;
+ q->max_active_zones = 0;
+ q->limits.chunk_sectors = 0;
+ q->limits.zone_write_granularity = 0;
+ q->limits.max_zone_append_sectors = 0;
+
+ blk_mq_unfreeze_queue(q);
+}
diff --git a/block/blk.h b/block/blk.h
index 7550364c326c..3b53e44b967e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -55,6 +55,11 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q);
+#define BIO_INLINE_VECS 4
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+ gfp_t gfp_mask);
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
+
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
@@ -202,8 +207,6 @@ static inline void elevator_exit(struct request_queue *q,
__elevator_exit(q, e);
}
-struct block_device *__disk_get_part(struct gendisk *disk, int partno);
-
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@@ -331,12 +334,12 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
#ifdef CONFIG_BLK_DEV_ZONED
void blk_queue_free_zone_bitmaps(struct request_queue *q);
+void blk_queue_clear_zone_settings(struct request_queue *q);
#else
static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
+static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
#endif
-struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
-
int blk_alloc_devt(struct block_device *part, dev_t *devt);
void blk_free_devt(dev_t devt);
char *disk_name(struct gendisk *hd, int partno, char *buf);
@@ -349,7 +352,6 @@ int bdev_add_partition(struct block_device *bdev, int partno,
int bdev_del_partition(struct block_device *bdev, int partno);
int bdev_resize_partition(struct block_device *bdev, int partno,
sector_t start, sector_t length);
-int disk_expand_part_tbl(struct gendisk *disk, int target);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
diff --git a/block/bounce.c b/block/bounce.c
index d3f51acd6e3b..fc55314aa426 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -246,7 +246,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
if (!bio)
return NULL;
- bio->bi_disk = bio_src->bi_disk;
+ bio->bi_bdev = bio_src->bi_bdev;
+ if (bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
diff --git a/block/bsg.c b/block/bsg.c
index d7bae94b64d9..bd10922d5cbb 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -157,8 +157,10 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
return PTR_ERR(rq);
ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode);
- if (ret)
+ if (ret) {
+ blk_put_request(rq);
return ret;
+ }
rq->timeout = msecs_to_jiffies(hdr.timeout);
if (!rq->timeout)
@@ -181,7 +183,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
bio = rq->bio;
- blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
+ blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr);
blk_rq_unmap_user(bio);
diff --git a/block/genhd.c b/block/genhd.c
index 9e741a4f351b..36ff45bbaaaf 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -162,15 +162,6 @@ static void part_in_flight_rw(struct block_device *part,
inflight[1] = 0;
}
-struct block_device *__disk_get_part(struct gendisk *disk, int partno)
-{
- struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
-
- if (unlikely(partno < 0 || partno >= ptbl->len))
- return NULL;
- return rcu_dereference(ptbl->part[partno]);
-}
-
/**
* disk_part_iter_init - initialize partition iterator
* @piter: iterator to initialize
@@ -185,26 +176,14 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno)
void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
unsigned int flags)
{
- struct disk_part_tbl *ptbl;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
piter->disk = disk;
piter->part = NULL;
-
- if (flags & DISK_PITER_REVERSE)
- piter->idx = ptbl->len - 1;
- else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
+ if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
piter->idx = 0;
else
piter->idx = 1;
-
piter->flags = flags;
-
- rcu_read_unlock();
}
-EXPORT_SYMBOL_GPL(disk_part_iter_init);
/**
* disk_part_iter_next - proceed iterator to the next partition and return it
@@ -217,57 +196,30 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init);
*/
struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
{
- struct disk_part_tbl *ptbl;
- int inc, end;
+ struct block_device *part;
+ unsigned long idx;
/* put the last partition */
disk_part_iter_exit(piter);
- /* get part_tbl */
rcu_read_lock();
- ptbl = rcu_dereference(piter->disk->part_tbl);
-
- /* determine iteration parameters */
- if (piter->flags & DISK_PITER_REVERSE) {
- inc = -1;
- if (piter->flags & (DISK_PITER_INCL_PART0 |
- DISK_PITER_INCL_EMPTY_PART0))
- end = -1;
- else
- end = 0;
- } else {
- inc = 1;
- end = ptbl->len;
- }
-
- /* iterate to the next partition */
- for (; piter->idx != end; piter->idx += inc) {
- struct block_device *part;
-
- part = rcu_dereference(ptbl->part[piter->idx]);
- if (!part)
- continue;
- piter->part = bdgrab(part);
- if (!piter->part)
- continue;
+ xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) {
if (!bdev_nr_sectors(part) &&
!(piter->flags & DISK_PITER_INCL_EMPTY) &&
!(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
- piter->idx == 0)) {
- bdput(piter->part);
- piter->part = NULL;
+ piter->idx == 0))
continue;
- }
- piter->idx += inc;
+ piter->part = bdgrab(part);
+ if (!piter->part)
+ continue;
+ piter->idx = idx + 1;
break;
}
-
rcu_read_unlock();
return piter->part;
}
-EXPORT_SYMBOL_GPL(disk_part_iter_next);
/**
* disk_part_iter_exit - finish up partition iteration
@@ -284,91 +236,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
bdput(piter->part);
piter->part = NULL;
}
-EXPORT_SYMBOL_GPL(disk_part_iter_exit);
-
-static inline int sector_in_part(struct block_device *part, sector_t sector)
-{
- return part->bd_start_sect <= sector &&
- sector < part->bd_start_sect + bdev_nr_sectors(part);
-}
-
-/**
- * disk_map_sector_rcu - map sector to partition
- * @disk: gendisk of interest
- * @sector: sector to map
- *
- * Find out which partition @sector maps to on @disk. This is
- * primarily used for stats accounting.
- *
- * CONTEXT:
- * RCU read locked.
- *
- * RETURNS:
- * Found partition on success, part0 is returned if no partition matches
- * or the matched partition is being deleted.
- */
-struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
-{
- struct disk_part_tbl *ptbl;
- struct block_device *part;
- int i;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
- part = rcu_dereference(ptbl->last_lookup);
- if (part && sector_in_part(part, sector))
- goto out_unlock;
-
- for (i = 1; i < ptbl->len; i++) {
- part = rcu_dereference(ptbl->part[i]);
- if (part && sector_in_part(part, sector)) {
- rcu_assign_pointer(ptbl->last_lookup, part);
- goto out_unlock;
- }
- }
-
- part = disk->part0;
-out_unlock:
- rcu_read_unlock();
- return part;
-}
-
-/**
- * disk_has_partitions
- * @disk: gendisk of interest
- *
- * Walk through the partition table and check if valid partition exists.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * True if the gendisk has at least one valid non-zero size partition.
- * Otherwise false.
- */
-bool disk_has_partitions(struct gendisk *disk)
-{
- struct disk_part_tbl *ptbl;
- int i;
- bool ret = false;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
- /* Iterate partitions skipping the whole device at index 0 */
- for (i = 1; i < ptbl->len; i++) {
- if (rcu_dereference(ptbl->part[i])) {
- ret = true;
- break;
- }
- }
-
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(disk_has_partitions);
/*
* Can be deleted altogether. Later.
@@ -604,6 +471,18 @@ static char *bdevt_str(dev_t devt, char *buf)
return buf;
}
+void disk_uevent(struct gendisk *disk, enum kobject_action action)
+{
+ struct disk_part_iter piter;
+ struct block_device *part;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+ while ((part = disk_part_iter_next(&piter)))
+ kobject_uevent(bdev_kobj(part), action);
+ disk_part_iter_exit(&piter);
+}
+EXPORT_SYMBOL_GPL(disk_uevent);
+
static void disk_scan_partitions(struct gendisk *disk)
{
struct block_device *bdev;
@@ -621,8 +500,6 @@ static void register_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
- struct disk_part_iter piter;
- struct block_device *part;
int err;
ddev->parent = parent;
@@ -665,15 +542,9 @@ static void register_disk(struct device *parent, struct gendisk *disk,
disk_scan_partitions(disk);
- /* announce disk after possible partitions are created */
+ /* announce the disk and partitions after all partitions are created */
dev_set_uevent_suppress(ddev, 0);
- kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
- /* announce possible partitions */
- disk_part_iter_init(&piter, disk, 0);
- while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(bdev_kobj(part), KOBJ_ADD);
- disk_part_iter_exit(&piter);
+ disk_uevent(disk, KOBJ_ADD);
if (disk->queue->backing_dev_info->dev) {
err = sysfs_create_link(&ddev->kobj,
@@ -829,8 +700,7 @@ void del_gendisk(struct gendisk *disk)
down_write(&bdev_lookup_sem);
/* invalidate stuff */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(part);
delete_partition(part);
@@ -929,7 +799,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
struct block_device *bdev = NULL;
rcu_read_lock();
- bdev = __disk_get_part(disk, partno);
+ bdev = xa_load(&disk->part_tbl, partno);
if (bdev && !bdgrab(bdev))
bdev = NULL;
rcu_read_unlock();
@@ -1320,83 +1190,6 @@ static const struct attribute_group *disk_attr_groups[] = {
};
/**
- * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
- * @disk: disk to replace part_tbl for
- * @new_ptbl: new part_tbl to install
- *
- * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The
- * original ptbl is freed using RCU callback.
- *
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
- */
-static void disk_replace_part_tbl(struct gendisk *disk,
- struct disk_part_tbl *new_ptbl)
-{
- struct disk_part_tbl *old_ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
-
- rcu_assign_pointer(disk->part_tbl, new_ptbl);
-
- if (old_ptbl) {
- rcu_assign_pointer(old_ptbl->last_lookup, NULL);
- kfree_rcu(old_ptbl, rcu_head);
- }
-}
-
-/**
- * disk_expand_part_tbl - expand disk->part_tbl
- * @disk: disk to expand part_tbl for
- * @partno: expand such that this partno can fit in
- *
- * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl
- * uses RCU to allow unlocked dereferencing for stats and other stuff.
- *
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
- * Might sleep.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int disk_expand_part_tbl(struct gendisk *disk, int partno)
-{
- struct disk_part_tbl *old_ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
- struct disk_part_tbl *new_ptbl;
- int len = old_ptbl ? old_ptbl->len : 0;
- int i, target;
-
- /*
- * check for int overflow, since we can get here from blkpg_ioctl()
- * with a user passed 'partno'.
- */
- target = partno + 1;
- if (target < 0)
- return -EINVAL;
-
- /* disk_max_parts() is zero during initialization, ignore if so */
- if (disk_max_parts(disk) && target > disk_max_parts(disk))
- return -EINVAL;
-
- if (target <= len)
- return 0;
-
- new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
- disk->node_id);
- if (!new_ptbl)
- return -ENOMEM;
-
- new_ptbl->len = target;
-
- for (i = 0; i < len; i++)
- rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
-
- disk_replace_part_tbl(disk, new_ptbl);
- return 0;
-}
-
-/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
*
@@ -1419,7 +1212,7 @@ static void disk_release(struct device *dev)
blk_free_devt(dev->devt);
disk_release_events(disk);
kfree(disk->random);
- disk_replace_part_tbl(disk, NULL);
+ xa_destroy(&disk->part_tbl);
bdput(disk->part0);
if (disk->queue)
blk_put_queue(disk->queue);
@@ -1572,7 +1365,6 @@ dev_t blk_lookup_devt(const char *name, int partno)
struct gendisk *__alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
- struct disk_part_tbl *ptbl;
if (minors > DISK_MAX_PARTS) {
printk(KERN_ERR
@@ -1590,11 +1382,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
goto out_free_disk;
disk->node_id = node_id;
- if (disk_expand_part_tbl(disk, 0))
- goto out_bdput;
-
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
- rcu_assign_pointer(ptbl->part[0], disk->part0);
+ xa_init(&disk->part_tbl);
+ if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
+ goto out_destroy_part_tbl;
disk->minors = minors;
rand_initialize_disk(disk);
@@ -1603,7 +1393,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
device_initialize(disk_to_dev(disk));
return disk;
-out_bdput:
+out_destroy_part_tbl:
+ xa_destroy(&disk->part_tbl);
bdput(disk->part0);
out_free_disk:
kfree(disk);
@@ -1638,31 +1429,32 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro)
kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}
-void set_disk_ro(struct gendisk *disk, int flag)
+/**
+ * set_disk_ro - set a gendisk read-only
+ * @disk: gendisk to operate on
+ * @read_only: %true to set the disk read-only, %false set the disk read/write
+ *
+ * This function is used to indicate whether a given disk device should have its
+ * read-only flag set. set_disk_ro() is typically used by device drivers to
+ * indicate whether the underlying physical device is write-protected.
+ */
+void set_disk_ro(struct gendisk *disk, bool read_only)
{
- struct disk_part_iter piter;
- struct block_device *part;
-
- if (disk->part0->bd_read_only != flag) {
- set_disk_ro_uevent(disk, flag);
- disk->part0->bd_read_only = flag;
+ if (read_only) {
+ if (test_and_set_bit(GD_READ_ONLY, &disk->state))
+ return;
+ } else {
+ if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
+ return;
}
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter)))
- part->bd_read_only = flag;
- disk_part_iter_exit(&piter);
+ set_disk_ro_uevent(disk, read_only);
}
-
EXPORT_SYMBOL(set_disk_ro);
int bdev_read_only(struct block_device *bdev)
{
- if (!bdev)
- return 0;
- return bdev->bd_read_only;
+ return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
-
EXPORT_SYMBOL(bdev_read_only);
/*
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index dc89199bc8c6..c25c41d0d061 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1029,6 +1029,7 @@ static struct elevator_type kyber_sched = {
#endif
.elevator_attrs = kyber_sched_attrs,
.elevator_name = "kyber",
+ .elevator_features = ELEVATOR_F_MQ_AWARE,
.elevator_owner = THIS_MODULE,
};
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 800ac902809b..b57470e154c8 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
- if (rq)
- atomic_dec(&rq->mq_hctx->elevator_queued);
return rq;
}
@@ -535,7 +533,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
dd_insert_request(hctx, rq, at_head);
- atomic_inc(&hctx->elevator_queued);
}
spin_unlock(&dd->lock);
}
@@ -582,9 +579,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- if (!atomic_read(&hctx->elevator_queued))
- return false;
-
return !list_empty_careful(&dd->dispatch) ||
!list_empty_careful(&dd->fifo_list[0]) ||
!list_empty_careful(&dd->fifo_list[1]);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 4601a845cd79..f3d9ff2cafb6 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -197,7 +197,7 @@ static ssize_t part_start_show(struct device *dev,
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only);
+ return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
}
static ssize_t part_alignment_offset_show(struct device *dev,
@@ -289,13 +289,7 @@ struct device_type part_type = {
*/
void delete_partition(struct block_device *part)
{
- struct gendisk *disk = part->bd_disk;
- struct disk_part_tbl *ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
-
- rcu_assign_pointer(ptbl->part[part->bd_partno], NULL);
- rcu_assign_pointer(ptbl->last_lookup, NULL);
-
+ xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
kobject_put(part->bd_holder_dir);
device_del(&part->bd_device);
@@ -327,7 +321,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
struct device *ddev = disk_to_dev(disk);
struct device *pdev;
struct block_device *bdev;
- struct disk_part_tbl *ptbl;
const char *dname;
int err;
@@ -343,18 +336,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
case BLK_ZONED_HA:
pr_info("%s: disabling host aware zoned block device support due to partitions\n",
disk->disk_name);
- disk->queue->limits.zoned = BLK_ZONED_NONE;
+ blk_queue_set_zoned(disk, BLK_ZONED_NONE);
break;
case BLK_ZONED_NONE:
break;
}
- err = disk_expand_part_tbl(disk, partno);
- if (err)
- return ERR_PTR(err);
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
-
- if (ptbl->part[partno])
+ if (xa_load(&disk->part_tbl, partno))
return ERR_PTR(-EBUSY);
bdev = bdev_alloc(disk, partno);
@@ -363,7 +351,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
bdev->bd_start_sect = start;
bdev_set_nr_sectors(bdev, len);
- bdev->bd_read_only = get_disk_ro(disk);
if (info) {
err = -ENOMEM;
@@ -408,8 +395,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
}
/* everything is up and running, commence */
+ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
+ if (err)
+ goto out_del;
bdev_add(bdev, devt);
- rcu_assign_pointer(ptbl->part[partno], bdev);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
@@ -615,7 +604,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct parsed_partitions *state;
- int ret = -EAGAIN, p, highest;
+ int ret = -EAGAIN, p;
if (!disk_part_scan_enabled(disk))
return 0;
@@ -663,15 +652,6 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
- /*
- * Detect the highest partition number and preallocate disk->part_tbl.
- * This is an optimization and not strictly necessary.
- */
- for (p = 1, highest = 0; p < state->limit; p++)
- if (state->parts[p].size)
- highest = p;
- disk_expand_part_tbl(disk, highest);
-
for (p = 1; p < state->limit; p++)
if (!blk_add_partition(disk, bdev, state, p))
goto out_free_state;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index c9f009cc0446..6599bac0a78c 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
* (if he doesn't check that is his problem).
* N.B. a non-zero SCSI status is _not_ necessarily an error.
*/
- blk_execute_rq(q, bd_disk, rq, at_head);
+ blk_execute_rq(bd_disk, rq, at_head);
hdr->duration = jiffies_to_msecs(jiffies - start_time);
@@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
goto error;
}
- blk_execute_rq(q, disk, rq, 0);
+ blk_execute_rq(disk, rq, 0);
err = req->result & 0xff; /* only 8 bit SCSI status */
if (err) {
@@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
scsi_req(rq)->cmd[0] = cmd;
scsi_req(rq)->cmd[4] = data;
scsi_req(rq)->cmd_len = 6;
- blk_execute_rq(q, bd_disk, rq, 0);
+ blk_execute_rq(bd_disk, rq, 0);
err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c43a6ab4b1f3..18bf99906662 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -284,15 +284,11 @@ out:
static blk_qc_t brd_submit_bio(struct bio *bio)
{
- struct brd_device *brd = bio->bi_disk->private_data;
+ struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
+ sector_t sector = bio->bi_iter.bi_sector;
struct bio_vec bvec;
- sector_t sector;
struct bvec_iter iter;
- sector = bio->bi_iter.bi_sector;
- if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
- goto io_error;
-
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
int err;
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 7227fc7ab8ed..72cf7603d51f 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -138,7 +138,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
op_flags |= REQ_FUA | REQ_PREFLUSH;
op_flags |= REQ_SYNC;
- bio = bio_alloc_drbd(GFP_NOIO);
+ bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
bio_set_dev(bio, bdev->md_bdev);
bio->bi_iter.bi_sector = sector;
err = -EIO;
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index df53dca5d02c..c1f816f896a8 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -976,7 +976,7 @@ static void drbd_bm_endio(struct bio *bio)
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
- struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+ struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
struct page *page;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8f879e5c2f67..02db50d7e4c6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1422,8 +1422,6 @@ extern mempool_t drbd_md_io_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
extern struct bio_set drbd_md_io_bio_set;
-/* to allocate from that set */
-extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
/* And a bio_set for cloning */
extern struct bio_set drbd_io_bio_set;
@@ -1579,8 +1577,8 @@ static inline void drbd_submit_bio_noacct(struct drbd_device *device,
int fault_type, struct bio *bio)
{
__release(local);
- if (!bio->bi_disk) {
- drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n");
+ if (!bio->bi_bdev) {
+ drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n");
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 1c8c18b2a25f..788dd97e6026 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -138,19 +138,6 @@ static const struct block_device_operations drbd_ops = {
.release = drbd_release,
};
-struct bio *bio_alloc_drbd(gfp_t gfp_mask)
-{
- struct bio *bio;
-
- if (!bioset_initialized(&drbd_md_io_bio_set))
- return bio_alloc(gfp_mask, 1);
-
- bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
- if (!bio)
- return NULL;
- return bio;
-}
-
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
give tons of false positives. When this is a real functions sparse works.
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 330f851cb8f0..9dbb660a7d7c 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -30,7 +30,10 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
return NULL;
memset(req, 0, sizeof(*req));
- drbd_req_make_private_bio(req, bio_src);
+ req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
+ req->private_bio->bi_private = req;
+ req->private_bio->bi_end_io = drbd_request_endio;
+
req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0)
@@ -1595,7 +1598,7 @@ void do_submit(struct work_struct *ws)
blk_qc_t drbd_submit_bio(struct bio *bio)
{
- struct drbd_device *device = bio->bi_disk->private_data;
+ struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
unsigned long start_jif;
blk_queue_split(&bio);
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 55bb0f8721fa..511f39a08de4 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -256,18 +256,6 @@ enum drbd_req_state_bits {
#define MR_WRITE 1
#define MR_READ 2
-static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
-{
- struct bio *bio;
- bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
-
- req->private_bio = bio;
-
- bio->bi_private = req;
- bio->bi_end_io = drbd_request_endio;
- bio->bi_next = NULL;
-}
-
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_iter.bi_size, or similar. But that would be too ugly. */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 02044ab7f767..64563bfdf0da 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1523,8 +1523,11 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
drbd_al_begin_io(device, &req->i);
- drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO,
+ &drbd_io_bio_set);
bio_set_dev(req->private_bio, device->ldev->backing_bdev);
+ req->private_bio->bi_private = req;
+ req->private_bio->bi_end_io = drbd_request_endio;
submit_bio_noacct(req->private_bio);
return 0;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 53ac59d19ae5..3fd99836bb1c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
rq->timeout = timeout;
/* insert request and run queue */
- blk_execute_rq(rq->q, NULL, rq, true);
+ blk_execute_rq(NULL, rq, true);
if (int_cmd->status) {
dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n",
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 5357c3a4a36f..d6c821d48090 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1420,7 +1420,7 @@ static blk_qc_t null_submit_bio(struct bio *bio)
{
sector_t sector = bio->bi_iter.bi_sector;
sector_t nr_sectors = bio_sectors(bio);
- struct nullb *nullb = bio->bi_disk->private_data;
+ struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
struct nullb_queue *nq = nullb_to_queue(nullb);
struct nullb_cmd *cmd;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index fce0a54df0e5..bfcab1c782b5 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -148,10 +148,6 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
sector += dev->zone_size_sects;
}
- q->limits.zoned = BLK_ZONED_HM;
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
-
return 0;
}
@@ -160,6 +156,10 @@ int null_register_zoned_dev(struct nullb *nullb)
struct nullb_device *dev = nullb->dev;
struct request_queue *q = nullb->q;
+ blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+ blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
+
if (queue_is_mq(q)) {
int ret = blk_revalidate_disk_zones(nullb->disk, NULL);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index a7af4f27b7c3..897acda20ac8 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk,
req = blk_mq_rq_to_pdu(rq);
req->func = func;
- blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
+ blk_execute_rq(disk->gd, rq, 0);
blk_put_request(rq);
return 0;
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b8bb8ec7538d..fc4b0f1aa86d 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
if (cgc->quiet)
rq->rq_flags |= RQF_QUIET;
- blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
+ blk_execute_rq(pd->bdev->bd_disk, rq, 0);
if (scsi_req(rq)->result)
ret = -EIO;
out:
@@ -2374,7 +2374,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
blk_queue_split(&bio);
- pd = bio->bi_disk->queue->queuedata;
+ pd = bio->bi_bdev->bd_disk->queue->queuedata;
if (!pd) {
pr_err("%s incorrect request queue\n", bio_devname(bio, b));
goto end_io;
@@ -2418,7 +2418,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
split = bio;
}
- pkt_make_request_write(bio->bi_disk->queue, split);
+ pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
} while (split != bio);
return BLK_QC_T_NONE;
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b71d28372ef3..1d738999fb69 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -581,7 +581,7 @@ out:
static blk_qc_t ps3vram_submit_bio(struct bio *bio)
{
- struct ps3_system_bus_device *dev = bio->bi_disk->private_data;
+ struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
int busy;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 59cfe71d0b3a..bbb88eb009e0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -692,29 +692,10 @@ static void rbd_release(struct gendisk *disk, fmode_t mode)
put_device(&rbd_dev->dev);
}
-static int rbd_set_read_only(struct block_device *bdev, bool ro)
-{
- struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
-
- /*
- * Both images mapped read-only and snapshots can't be marked
- * read-write.
- */
- if (!ro) {
- if (rbd_is_ro(rbd_dev))
- return -EROFS;
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- }
-
- return 0;
-}
-
static const struct block_device_operations rbd_bd_ops = {
.owner = THIS_MODULE,
.open = rbd_open,
.release = rbd_release,
- .set_read_only = rbd_set_read_only,
};
/*
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index edacefff6e35..9a28322a8cd8 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -122,7 +122,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
static blk_qc_t rsxx_submit_bio(struct bio *bio)
{
- struct rsxx_cardinfo *card = bio->bi_disk->private_data;
+ struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data;
struct rsxx_bio_meta *bio_meta;
blk_status_t st = BLK_STS_IOERR;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 4478eb7efee0..2cdf2771f8e8 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
spin_unlock_irq(&host->lock);
DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
- blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
+ blk_execute_rq_nowait(NULL, rq, true, NULL);
return 0;
@@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
crq->msg_bucket = (u32) rc;
DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
- blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
+ blk_execute_rq_nowait(NULL, rq, true, NULL);
return 0;
}
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 2b95d7b33b91..982732dbe82e 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -521,7 +521,7 @@ static int mm_check_plugged(struct cardinfo *card)
static blk_qc_t mm_submit_bio(struct bio *bio)
{
- struct cardinfo *card = bio->bi_disk->private_data;
+ struct cardinfo *card = bio->bi_bdev->bd_disk->private_data;
pr_debug("mm_make_request %llu %u\n",
(unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 145606dc52db..b0285db7cf4f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -320,7 +320,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
if (err)
goto out;
- blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ blk_execute_rq(vblk->disk, req, false);
err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
blk_put_request(req);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e2933cb7a82a..d6243dbc53cc 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1596,7 +1596,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
*/
static blk_qc_t zram_submit_bio(struct bio *bio)
{
- struct zram *zram = bio->bi_disk->private_data;
+ struct zram *zram = bio->bi_bdev->bd_disk->private_data;
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8f0e52a71493..90ad34c6ef8e 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
rq->timeout = 60 * HZ;
bio = rq->bio;
- blk_execute_rq(q, cdi->disk, rq, 0);
+ blk_execute_rq(cdi->disk, rq, 0);
if (scsi_req(rq)->result) {
struct scsi_sense_hdr sshdr;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 013ad33fbbc8..a1ce9f5ac3aa 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
memcpy(scsi_req(rq)->cmd, pc->c, 12);
if (drive->media == ide_tape)
scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
- blk_execute_rq(drive->queue, disk, rq, 0);
+ blk_execute_rq(disk, rq, 0);
error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 25d2d88e82ad..cffbcc27a34c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
}
}
- blk_execute_rq(drive->queue, info->disk, rq, 0);
+ blk_execute_rq(info->disk, rq, 0);
error = scsi_req(rq)->result ? -EIO : 0;
if (buffer)
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 46f2df288c6a..011eab9c69b7 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
rq->rq_flags = RQF_QUIET;
- blk_execute_rq(drive->queue, cd->disk, rq, 0);
+ blk_execute_rq(cd->disk, rq, 0);
ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
/*
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index f2f93ed40356..ca1d4b3d3878 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
*(int *)&scsi_req(rq)->cmd[1] = arg;
ide_req(rq)->special = setting->set;
- blk_execute_rq(q, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
ret = scsi_req(rq)->result;
blk_put_request(rq);
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 34b9441084f8..8413731c6259 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
drive->mult_req = arg;
drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
- blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
blk_put_request(rq);
return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 58994da10c06..43fbc37d85c3 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp)
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
- blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
@@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive)
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd_len = 1;
scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
- blk_execute_rq(drive->queue, NULL, rq, 1);
+ blk_execute_rq(NULL, rq, 1);
ret = scsi_req(rq)->result;
blk_put_request(rq);
return ret;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 8af7af6001eb..a80a0f28f7b9 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
ide_req(rq)->special = &timeout;
- blk_execute_rq(q, NULL, rq, 1);
+ blk_execute_rq(NULL, rq, 1);
rc = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
if (rc)
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 82ab308f1aaf..d680b3e3295f 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
mesg.event = PM_EVENT_FREEZE;
rqpm.pm_state = mesg.event;
- blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
@@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq)
blk_mq_end_request(rq, BLK_STS_OK);
return -ENXIO;
}
- blk_execute_rq(q, NULL, rq, true);
+ blk_execute_rq(NULL, rq, true);
return scsi_req(rq)->result ? -EIO : 0;
}
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 88b96437b22e..fa05e7e7d609 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
goto out_put;
}
- blk_execute_rq(drive->queue, tape->disk, rq, 0);
+ blk_execute_rq(tape->disk, rq, 0);
/* calculate the number of transferred bytes and update buffer state */
size -= scsi_req(rq)->resid_len;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index d016cbe68cba..6665fc4724b9 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
ide_req(rq)->special = cmd;
cmd->rq = rq;
- blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index b6246f73895c..5924f09c217b 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -49,7 +49,7 @@ struct bio_set pblk_bio_set;
static blk_qc_t pblk_submit_bio(struct bio *bio)
{
- struct pblk *pblk = bio->bi_disk->queue->queuedata;
+ struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata;
if (bio_op(bio) == REQ_OP_DISCARD) {
pblk_discard(pblk, bio);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index b00fd08d696b..63e809f38e3f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
if (!check)
return;
- check->bi_disk = bio->bi_disk;
+ bio_set_dev(check, bio->bi_bdev);
check->bi_opf = REQ_OP_READ;
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 85b1f2a9b72d..29c231758293 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -475,7 +475,7 @@ struct search {
unsigned int read_dirty_data:1;
unsigned int cache_missed:1;
- struct block_device *part;
+ struct block_device *orig_bdev;
unsigned long start_time;
struct btree_op op;
@@ -670,8 +670,8 @@ static void bio_complete(struct search *s)
{
if (s->orig_bio) {
/* Count on bcache device */
- part_end_io_acct(s->part, s->orig_bio, s->start_time);
-
+ bio_end_io_acct_remapped(s->orig_bio, s->start_time,
+ s->orig_bdev);
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
bio_endio(s->orig_bio);
@@ -714,7 +714,8 @@ static void search_free(struct closure *cl)
}
static inline struct search *search_alloc(struct bio *bio,
- struct bcache_device *d)
+ struct bcache_device *d, struct block_device *orig_bdev,
+ unsigned long start_time)
{
struct search *s;
@@ -732,7 +733,8 @@ static inline struct search *search_alloc(struct bio *bio,
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
/* Count on the bcache device */
- s->start_time = part_start_io_acct(d->disk, &s->part, bio);
+ s->orig_bdev = orig_bdev;
+ s->start_time = start_time;
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
@@ -894,7 +896,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9,
- get_capacity(bio->bi_disk) - bio_end_sector(bio));
+ get_capacity(bio->bi_bdev->bd_disk) -
+ bio_end_sector(bio));
s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
@@ -1073,7 +1076,7 @@ struct detached_dev_io_private {
unsigned long start_time;
bio_end_io_t *bi_end_io;
void *bi_private;
- struct block_device *part;
+ struct block_device *orig_bdev;
};
static void detached_dev_end_io(struct bio *bio)
@@ -1085,7 +1088,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_private = ddip->bi_private;
/* Count on the bcache device */
- part_end_io_acct(ddip->part, bio, ddip->start_time);
+ bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@@ -1098,7 +1101,8 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io(bio);
}
-static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
+ struct block_device *orig_bdev, unsigned long start_time)
{
struct detached_dev_io_private *ddip;
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1111,7 +1115,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
/* Count on the bcache device */
- ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
+ ddip->orig_bdev = orig_bdev;
+ ddip->start_time = start_time;
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;
@@ -1167,8 +1172,10 @@ static void quit_max_writeback_rate(struct cache_set *c,
blk_qc_t cached_dev_submit_bio(struct bio *bio)
{
struct search *s;
- struct bcache_device *d = bio->bi_disk->private_data;
+ struct block_device *orig_bdev = bio->bi_bdev;
+ struct bcache_device *d = orig_bdev->bd_disk->private_data;
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ unsigned long start_time;
int rw = bio_data_dir(bio);
if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
@@ -1193,11 +1200,13 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
}
}
+ start_time = bio_start_io_acct(bio);
+
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) {
- s = search_alloc(bio, d);
+ s = search_alloc(bio, d, orig_bdev, start_time);
trace_bcache_request_start(s->d, bio);
if (!bio->bi_iter.bi_size) {
@@ -1218,7 +1227,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
}
} else
/* I/O request sent to backing device */
- detached_dev_do_request(d, bio);
+ detached_dev_do_request(d, bio, orig_bdev, start_time);
return BLK_QC_T_NONE;
}
@@ -1274,7 +1283,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
{
struct search *s;
struct closure *cl;
- struct bcache_device *d = bio->bi_disk->private_data;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
bio->bi_status = BLK_STS_IOERR;
@@ -1282,7 +1291,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
return BLK_QC_T_NONE;
}
- s = search_alloc(bio, d);
+ s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
cl = &s->cl;
bio = &s->bio.bio;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 2047a9cccdb5..193fe7652329 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1939,7 +1939,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
goto err;
if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ BIOSET_NEED_RESCUER))
goto err;
c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 2ea0360108e1..a3b71350eec8 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -18,8 +18,7 @@
*/
struct dm_bio_details {
- struct gendisk *bi_disk;
- u8 bi_partno;
+ struct block_device *bi_bdev;
int __bi_remaining;
unsigned long bi_flags;
struct bvec_iter bi_iter;
@@ -31,8 +30,7 @@ struct dm_bio_details {
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
- bd->bi_disk = bio->bi_disk;
- bd->bi_partno = bio->bi_partno;
+ bd->bi_bdev = bio->bi_bdev;
bd->bi_flags = bio->bi_flags;
bd->bi_iter = bio->bi_iter;
bd->__bi_remaining = atomic_read(&bio->__bi_remaining);
@@ -44,8 +42,7 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
- bio->bi_disk = bd->bi_disk;
- bio->bi_partno = bd->bi_partno;
+ bio->bi_bdev = bd->bi_bdev;
bio->bi_flags = bd->bi_flags;
bio->bi_iter = bd->bi_iter;
atomic_set(&bio->__bi_remaining, bd->__bi_remaining);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index af6d4f898e4c..89a73204dbf4 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -449,7 +449,7 @@ static int __check_incompat_features(struct cache_disk_superblock *disk_super,
/*
* Check for read-only metadata to skip the following RDWR checks.
*/
- if (get_disk_ro(cmd->bdev->bd_disk))
+ if (bdev_read_only(cmd->bdev))
return 0;
features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index bdb255edc200..a90bdf9b2ca6 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -85,12 +85,6 @@ struct clone {
struct dm_clone_metadata *cmd;
- /*
- * bio used to flush the destination device, before committing the
- * metadata.
- */
- struct bio flush_bio;
-
/* Region hydration hash table */
struct hash_table_bucket *ht;
@@ -1155,11 +1149,7 @@ static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
goto out;
}
- bio_reset(&clone->flush_bio);
- bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
- clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- r = submit_bio_wait(&clone->flush_bio);
+ r = blkdev_issue_flush(clone->dest_dev->bdev);
if (unlikely(r)) {
__metadata_operation_failed(clone, "flush destination device", r);
goto out;
@@ -1886,7 +1876,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bio_list_init(&clone->deferred_flush_completions);
clone->hydration_offset = 0;
atomic_set(&clone->hydrations_in_flight, 0);
- bio_init(&clone->flush_bio, NULL, 0);
clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!clone->wq) {
@@ -1958,7 +1947,6 @@ static void clone_dtr(struct dm_target *ti)
struct clone *clone = ti->private;
mutex_destroy(&clone->commit_lock);
- bio_uninit(&clone->flush_bio);
for (i = 0; i < clone->nr_ctr_args; i++)
kfree(clone->ctr_args[i]);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa09bc4e4c54..b0a82f29a2e4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
struct dm_raid1_bio_record {
struct mirror *m;
- /* if details->bi_disk == NULL, details were not saved */
+ /* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
region_t write_region;
};
@@ -1190,7 +1190,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
struct dm_raid1_bio_record *bio_record =
dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
- bio_record->details.bi_disk = NULL;
+ bio_record->details.bi_bdev = NULL;
if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
@@ -1257,7 +1257,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
goto out;
if (unlikely(*error)) {
- if (!bio_record->details.bi_disk) {
+ if (!bio_record->details.bi_bdev) {
/*
* There wasn't enough memory to record necessary
* information for a retry or there was no other
@@ -1282,7 +1282,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
bd = &bio_record->details;
dm_bio_restore(bd, bio);
- bio_record->details.bi_disk = NULL;
+ bio_record->details.bi_bdev = NULL;
bio->bi_status = 0;
queue_bio(ms, bio, rw);
@@ -1292,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
}
out:
- bio_record->details.bi_disk = NULL;
+ bio_record->details.bi_bdev = NULL;
return DM_ENDIO_DONE;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 6ebb2127f3e2..e75b20480e46 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -636,7 +636,7 @@ static int __check_incompat_features(struct thin_disk_superblock *disk_super,
/*
* Check for read-only metadata to skip the following RDWR checks.
*/
- if (get_disk_ro(pmd->bdev->bd_disk))
+ if (bdev_read_only(pmd->bdev))
return 0;
features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index b298fefb022e..039d17b28938 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
mblk->page);
if (ret == 0)
- ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+ ret = blkdev_issue_flush(dev->bdev);
return ret;
}
@@ -862,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
/* Flush drive cache (this will also sync data) */
if (ret == 0)
- ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+ ret = blkdev_issue_flush(dev->bdev);
return ret;
}
@@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
- ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+ ret = blkdev_issue_flush(dev->bdev);
goto err;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7bac564f3faa..479ec5bea09e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -977,16 +977,17 @@ static void clone_endio(struct bio *bio)
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
struct bio *orig_bio = io->orig_bio;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_DISCARD &&
- !bio->bi_disk->queue->limits.max_discard_sectors)
+ !q->limits.max_discard_sectors)
disable_discard(md);
else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
- !bio->bi_disk->queue->limits.max_write_same_sectors)
+ !q->limits.max_write_same_sectors)
disable_write_same(md);
else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
+ !q->limits.max_write_zeroes_sectors)
disable_write_zeroes(md);
}
@@ -996,7 +997,7 @@ static void clone_endio(struct bio *bio)
*/
if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
sector_t written_sector = bio->bi_iter.bi_sector;
- struct request_queue *q = orig_bio->bi_disk->queue;
+ struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
orig_bio->bi_iter.bi_sector += written_sector & mask;
@@ -1422,8 +1423,7 @@ static int __send_empty_flush(struct clone_info *ci)
*/
bio_init(&flush_bio, NULL, 0);
flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- flush_bio.bi_disk = ci->io->md->disk;
- bio_associate_blkg(&flush_bio);
+ bio_set_dev(&flush_bio, ci->io->md->disk->part0);
ci->bio = &flush_bio;
ci->sector_count = 0;
@@ -1626,7 +1626,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
static blk_qc_t dm_submit_bio(struct bio *bio)
{
- struct mapped_device *md = bio->bi_disk->private_data;
+ struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
struct dm_table *map;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 68cac7d19278..63ed8329a98d 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -252,7 +252,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
start_sector + data_offset;
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bio->bi_disk->queue))) {
+ !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) {
/* Just ignore it */
bio_endio(bio);
} else {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 04384452a7ab..21da0c48f6c2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -340,24 +340,6 @@ static int start_readonly;
*/
static bool create_on_open = true;
-struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
- struct mddev *mddev)
-{
- if (!mddev || !bioset_initialized(&mddev->bio_set))
- return bio_alloc(gfp_mask, nr_iovecs);
-
- return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
-}
-EXPORT_SYMBOL_GPL(bio_alloc_mddev);
-
-static struct bio *md_bio_alloc_sync(struct mddev *mddev)
-{
- if (!mddev || !bioset_initialized(&mddev->sync_set))
- return bio_alloc(GFP_NOIO, 1);
-
- return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
-}
-
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -463,8 +445,8 @@ struct md_io {
struct mddev *mddev;
bio_end_io_t *orig_bi_end_io;
void *orig_bi_private;
+ struct block_device *orig_bi_bdev;
unsigned long start_time;
- struct block_device *part;
};
static void md_end_io(struct bio *bio)
@@ -472,7 +454,7 @@ static void md_end_io(struct bio *bio)
struct md_io *md_io = bio->bi_private;
struct mddev *mddev = md_io->mddev;
- part_end_io_acct(md_io->part, bio, md_io->start_time);
+ bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev);
bio->bi_end_io = md_io->orig_bi_end_io;
bio->bi_private = md_io->orig_bi_private;
@@ -486,7 +468,7 @@ static void md_end_io(struct bio *bio)
static blk_qc_t md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
- struct mddev *mddev = bio->bi_disk->private_data;
+ struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
@@ -514,12 +496,12 @@ static blk_qc_t md_submit_bio(struct bio *bio)
md_io->mddev = mddev;
md_io->orig_bi_end_io = bio->bi_end_io;
md_io->orig_bi_private = bio->bi_private;
+ md_io->orig_bi_bdev = bio->bi_bdev;
bio->bi_end_io = md_end_io;
bio->bi_private = md_io;
- md_io->start_time = part_start_io_acct(mddev->gendisk,
- &md_io->part, bio);
+ md_io->start_time = bio_start_io_acct(bio);
}
/* bio could be mergeable after passing to underlayer */
@@ -613,7 +595,7 @@ static void submit_flushes(struct work_struct *ws)
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
+ bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bio_set_dev(bi, rdev->bdev);
@@ -999,7 +981,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
if (test_bit(Faulty, &rdev->flags))
return;
- bio = md_bio_alloc_sync(mddev);
+ bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
atomic_inc(&rdev->nr_pending);
@@ -1031,29 +1013,29 @@ int md_super_wait(struct mddev *mddev)
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int op, int op_flags, bool metadata_op)
{
- struct bio *bio = md_bio_alloc_sync(rdev->mddev);
- int ret;
+ struct bio bio;
+ struct bio_vec bvec;
+
+ bio_init(&bio, &bvec, 1);
if (metadata_op && rdev->meta_bdev)
- bio_set_dev(bio, rdev->meta_bdev);
+ bio_set_dev(&bio, rdev->meta_bdev);
else
- bio_set_dev(bio, rdev->bdev);
- bio_set_op_attrs(bio, op, op_flags);
+ bio_set_dev(&bio, rdev->bdev);
+ bio.bi_opf = op | op_flags;
if (metadata_op)
- bio->bi_iter.bi_sector = sector + rdev->sb_start;
+ bio.bi_iter.bi_sector = sector + rdev->sb_start;
else if (rdev->mddev->reshape_position != MaxSector &&
(rdev->mddev->reshape_backwards ==
(sector >= rdev->mddev->reshape_position)))
- bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
+ bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
else
- bio->bi_iter.bi_sector = sector + rdev->data_offset;
- bio_add_page(bio, page, size, 0);
+ bio.bi_iter.bi_sector = sector + rdev->data_offset;
+ bio_add_page(&bio, page, size, 0);
- submit_bio_wait(bio);
+ submit_bio_wait(&bio);
- ret = !bio->bi_status;
- bio_put(bio);
- return ret;
+ return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);
@@ -2417,6 +2399,12 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
}
EXPORT_SYMBOL(md_integrity_add_rdev);
+static bool rdev_read_only(struct md_rdev *rdev)
+{
+ return bdev_read_only(rdev->bdev) ||
+ (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
+}
+
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
char b[BDEVNAME_SIZE];
@@ -2426,8 +2414,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
if (find_rdev(mddev, rdev->bdev->bd_dev))
return -EEXIST;
- if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
- mddev->pers)
+ if (rdev_read_only(rdev) && mddev->pers)
return -EROFS;
/* make sure rdev->sectors exceeds mddev->dev_sectors */
@@ -5861,9 +5848,7 @@ int md_run(struct mddev *mddev)
continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev);
- if (mddev->ro != 1 &&
- (bdev_read_only(rdev->bdev) ||
- bdev_read_only(rdev->meta_bdev))) {
+ if (mddev->ro != 1 && rdev_read_only(rdev)) {
mddev->ro = 1;
if (mddev->gendisk)
set_disk_ro(mddev->gendisk, 1);
@@ -6158,7 +6143,7 @@ static int restart_array(struct mddev *mddev)
if (test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
has_journal = true;
- if (bdev_read_only(rdev->bdev))
+ if (rdev_read_only(rdev))
has_readonly = true;
}
rcu_read_unlock();
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 34070ab30a8a..bcbba1b5ec4a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -556,7 +556,7 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
{
- atomic_add(nr_sectors, &bio->bi_disk->sync_io);
+ md_sync_acct(bio->bi_bdev, nr_sectors);
}
struct md_personality
@@ -742,8 +742,6 @@ extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
- struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
@@ -793,14 +791,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
{
if (bio_op(bio) == REQ_OP_WRITE_SAME &&
- !bio->bi_disk->queue->limits.max_write_same_sectors)
+ !bio->bi_bdev->bd_disk->queue->limits.max_write_same_sectors)
mddev->queue->limits.max_write_same_sectors = 0;
}
static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
{
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
+ !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c0347997f6ff..d2378765dc15 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -794,13 +794,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void *)bio->bi_disk;
+ struct md_rdev *rdev = (void *)bio->bi_bdev;
bio->bi_next = NULL;
bio_set_dev(bio, rdev->bdev);
if (test_bit(Faulty, &rdev->flags)) {
bio_io_error(bio);
} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bio->bi_disk->queue)))
+ !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
/* Just ignore it */
bio_endio(bio);
else
@@ -1104,7 +1104,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
int i = 0;
struct bio *behind_bio = NULL;
- behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
+ behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set);
if (!behind_bio)
return;
@@ -1520,7 +1520,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_disk = (void *)conf->mirrors[i].rdev;
+ mbio->bi_bdev = (void *)conf->mirrors[i].rdev;
cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
if (cb)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c5d88ef6a45c..a9ae7d113492 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -882,13 +882,13 @@ static void flush_pending_writes(struct r10conf *conf)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_disk;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
bio->bi_next = NULL;
bio_set_dev(bio, rdev->bdev);
if (test_bit(Faulty, &rdev->flags)) {
bio_io_error(bio);
} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bio->bi_disk->queue)))
+ !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
/* Just ignore it */
bio_endio(bio);
else
@@ -1075,13 +1075,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_disk;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
bio->bi_next = NULL;
bio_set_dev(bio, rdev->bdev);
if (test_bit(Faulty, &rdev->flags)) {
bio_io_error(bio);
} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bio->bi_disk->queue)))
+ !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
/* Just ignore it */
bio_endio(bio);
else
@@ -1253,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_disk = (void *)rdev;
+ mbio->bi_bdev = (void *)rdev;
atomic_inc(&r10_bio->remaining);
@@ -3003,7 +3003,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Again, very different code for resync and recovery.
* Both must result in an r10bio with a list of bios that
- * have bi_end_io, bi_sector, bi_disk set,
+ * have bi_end_io, bi_sector, bi_bdev set,
* and bi_private set to the r10bio.
* For recovery, we may actually create several r10bios
* with 2 bios in each, that correspond to the bios in the main one.
@@ -4531,7 +4531,7 @@ read_more:
return sectors_done;
}
- read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
+ read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set);
bio_set_dev(read_bio, rdev->bdev);
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
@@ -4539,10 +4539,6 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_reshape_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
- read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
- read_bio->bi_status = 0;
- read_bio->bi_vcnt = 0;
- read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d0f540296fe9..e8c118e05dfd 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
}
/* flush the disk cache after recovery if necessary */
- ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(rdev->bdev);
out:
__free_page(page);
return ret;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3a90cc0e43ca..a348b2adf2a9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5310,7 +5310,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- WARN_ON_ONCE(bio->bi_partno);
+ WARN_ON_ONCE(bio->bi_bdev->bd_partno);
chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
@@ -5393,90 +5393,72 @@ static void raid5_align_endio(struct bio *bi)
static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{
struct r5conf *conf = mddev->private;
- int dd_idx;
- struct bio* align_bi;
+ struct bio *align_bio;
struct md_rdev *rdev;
- sector_t end_sector;
+ sector_t sector, end_sector, first_bad;
+ int bad_sectors, dd_idx;
if (!in_chunk_boundary(mddev, raid_bio)) {
pr_debug("%s: non aligned\n", __func__);
return 0;
}
- /*
- * use bio_clone_fast to make a copy of the bio
- */
- align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
- if (!align_bi)
- return 0;
- /*
- * set bi_end_io to a new function, and set bi_private to the
- * original bio.
- */
- align_bi->bi_end_io = raid5_align_endio;
- align_bi->bi_private = raid_bio;
- /*
- * compute position
- */
- align_bi->bi_iter.bi_sector =
- raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
- 0, &dd_idx, NULL);
- end_sector = bio_end_sector(align_bi);
+ sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
+ &dd_idx, NULL);
+ end_sector = bio_end_sector(raid_bio);
+
rcu_read_lock();
+ if (r5c_big_stripe_cached(conf, sector))
+ goto out_rcu_unlock;
+
rdev = rcu_dereference(conf->disks[dd_idx].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags) ||
rdev->recovery_offset < end_sector) {
rdev = rcu_dereference(conf->disks[dd_idx].rdev);
- if (rdev &&
- (test_bit(Faulty, &rdev->flags) ||
+ if (!rdev)
+ goto out_rcu_unlock;
+ if (test_bit(Faulty, &rdev->flags) ||
!(test_bit(In_sync, &rdev->flags) ||
- rdev->recovery_offset >= end_sector)))
- rdev = NULL;
+ rdev->recovery_offset >= end_sector))
+ goto out_rcu_unlock;
}
- if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
- rcu_read_unlock();
- bio_put(align_bi);
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+
+ align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
+ bio_set_dev(align_bio, rdev->bdev);
+ align_bio->bi_end_io = raid5_align_endio;
+ align_bio->bi_private = raid_bio;
+ align_bio->bi_iter.bi_sector = sector;
+
+ raid_bio->bi_next = (void *)rdev;
+
+ if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad,
+ &bad_sectors)) {
+ bio_put(align_bio);
+ rdev_dec_pending(rdev, mddev);
return 0;
}
- if (rdev) {
- sector_t first_bad;
- int bad_sectors;
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- raid_bio->bi_next = (void*)rdev;
- bio_set_dev(align_bi, rdev->bdev);
-
- if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
- bio_sectors(align_bi),
- &first_bad, &bad_sectors)) {
- bio_put(align_bi);
- rdev_dec_pending(rdev, mddev);
- return 0;
- }
+ /* No reshape active, so we can trust rdev->data_offset */
+ align_bio->bi_iter.bi_sector += rdev->data_offset;
- /* No reshape active, so we can trust rdev->data_offset */
- align_bi->bi_iter.bi_sector += rdev->data_offset;
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+ conf->device_lock);
+ atomic_inc(&conf->active_aligned_reads);
+ spin_unlock_irq(&conf->device_lock);
- spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_quiescent,
- conf->quiesce == 0,
- conf->device_lock);
- atomic_inc(&conf->active_aligned_reads);
- spin_unlock_irq(&conf->device_lock);
+ if (mddev->gendisk)
+ trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
+ raid_bio->bi_iter.bi_sector);
+ submit_bio_noacct(align_bio);
+ return 1;
- if (mddev->gendisk)
- trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_iter.bi_sector);
- submit_bio_noacct(align_bi);
- return 1;
- } else {
- rcu_read_unlock();
- bio_put(align_bi);
- return 0;
- }
+out_rcu_unlock:
+ rcu_read_unlock();
+ return 0;
}
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 42e27a298218..a1d6b68320ae 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
goto out_put;
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
- blk_execute_rq(mq->queue, NULL, req, 0);
+ blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
blk_put_request(req);
@@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
req_to_mmc_queue_req(req)->drv_op_data = idatas;
req_to_mmc_queue_req(req)->ioc_count = 1;
- blk_execute_rq(mq->queue, NULL, req, 0);
+ blk_execute_rq(NULL, req, 0);
ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
blk_put_request(req);
@@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
req_to_mmc_queue_req(req)->drv_op_data = idata;
req_to_mmc_queue_req(req)->ioc_count = num_of_cmds;
- blk_execute_rq(mq->queue, NULL, req, 0);
+ blk_execute_rq(NULL, req, 0);
ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
/* copy to user if data and response */
@@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
if (IS_ERR(req))
return PTR_ERR(req);
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
- blk_execute_rq(mq->queue, NULL, req, 0);
+ blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
if (ret >= 0) {
*val = ret;
@@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD;
req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
- blk_execute_rq(mq->queue, NULL, req, 0);
+ blk_execute_rq(NULL, req, 0);
err = req_to_mmc_queue_req(req)->drv_op_result;
blk_put_request(req);
if (err) {
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 22e5617b2cea..e03a1f38d750 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -165,7 +165,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
static blk_qc_t nd_blk_submit_bio(struct bio *bio)
{
struct bio_integrity_payload *bip;
- struct nd_namespace_blk *nsblk = bio->bi_disk->private_data;
+ struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
struct bvec_iter iter;
unsigned long start;
struct bio_vec bvec;
@@ -177,7 +177,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
bip = bio_integrity(bio);
rw = bio_data_dir(bio);
- do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 12ff6f8784ac..41aa1f01fc07 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1442,7 +1442,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
static blk_qc_t btt_submit_bio(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct btt *btt = bio->bi_disk->private_data;
+ struct btt *btt = bio->bi_bdev->bd_disk->private_data;
struct bvec_iter iter;
unsigned long start;
struct bio_vec bvec;
@@ -1452,7 +1452,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
- do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f33bdae626ba..281fedb4dc4d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -196,13 +196,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
unsigned long start;
struct bio_vec bvec;
struct bvec_iter iter;
- struct pmem_device *pmem = bio->bi_disk->private_data;
+ struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
struct nd_region *nd_region = to_region(pmem);
if (bio->bi_opf & REQ_PREFLUSH)
ret = nvdimm_flush(nd_region, bio);
- do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f13eb4ded95f..89cacff897a8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q,
rq->cmd_flags |= REQ_HIPRI;
rq->end_io_data = &wait;
- blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+ blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq);
while (!completion_done(&wait)) {
blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
@@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
if (poll)
nvme_execute_rq_polled(req->q, NULL, req, at_head);
else
- blk_execute_rq(req->q, NULL, req, at_head);
+ blk_execute_rq(NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq)
u32 effects;
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
- blk_execute_rq(rq->q, disk, rq, 0);
+ blk_execute_rq(disk, rq, 0);
nvme_passthru_end(ctrl, effects);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
@@ -1113,7 +1113,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
- struct gendisk *disk = ns ? ns->disk : NULL;
+ struct block_device *bdev = ns ? ns->disk->part0 : NULL;
struct request *req;
struct bio *bio = NULL;
void *meta = NULL;
@@ -1133,8 +1133,9 @@ static int nvme_submit_user_cmd(struct request_queue *q,
if (ret)
goto out;
bio = req->bio;
- bio->bi_disk = disk;
- if (disk && meta_buffer && meta_len) {
+ if (bdev)
+ bio_set_dev(bio, bdev);
+ if (bdev && meta_buffer && meta_len) {
meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
meta_seed, write);
if (IS_ERR(meta)) {
@@ -1202,7 +1203,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl)
rq->timeout = ctrl->kato * HZ;
rq->end_io_data = ctrl;
- blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
+ blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);
return 0;
}
@@ -2125,9 +2126,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
nvme_config_discard(disk, ns);
nvme_config_write_zeroes(disk, ns);
- if ((id->nsattr & NVME_NS_ATTR_RO) ||
- test_bit(NVME_NS_FORCE_RO, &ns->flags))
- set_disk_ro(disk, true);
+ set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
+ test_bit(NVME_NS_FORCE_RO, &ns->flags));
}
static inline bool nvme_first_scan(struct gendisk *disk)
@@ -2176,17 +2176,18 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
ns->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
+ ret = nvme_configure_metadata(ns, id);
+ if (ret)
+ goto out_unfreeze;
+ nvme_set_chunk_sectors(ns, id);
+ nvme_update_disk_info(ns->disk, ns, id);
+
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
if (ret)
goto out_unfreeze;
}
- ret = nvme_configure_metadata(ns, id);
- if (ret)
- goto out_unfreeze;
- nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->disk, ns, id);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 470cef3abec3..b705988629f2 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
rq->end_io_data = rqd;
- blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
+ blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io);
return 0;
@@ -757,7 +757,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
{
bool write = nvme_is_write((struct nvme_command *)vcmd);
struct nvm_dev *dev = ns->ndev;
- struct gendisk *disk = ns->disk;
struct request *rq;
struct bio *bio = NULL;
__le64 *ppa_list = NULL;
@@ -817,10 +816,10 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma);
}
- bio->bi_disk = disk;
+ bio_set_dev(bio, ns->disk->part0);
}
- blk_execute_rq(q, NULL, rq, 0);
+ blk_execute_rq(NULL, rq, 0);
if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 282b7a4ea9a9..1427c9555cef 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -296,7 +296,7 @@ static bool nvme_available_path(struct nvme_ns_head *head)
blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
- struct nvme_ns_head *head = bio->bi_disk->private_data;
+ struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
struct device *dev = disk_to_dev(head->disk);
struct nvme_ns *ns;
blk_qc_t ret = BLK_QC_T_NONE;
@@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (likely(ns)) {
- bio->bi_disk = ns->disk;
+ bio_set_dev(bio, ns->disk->part0);
bio->bi_opf |= REQ_NVME_MPATH;
trace_block_bio_remap(bio, disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
@@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work)
* Reset disk to the mpath node and resubmit to select a new
* path.
*/
- bio->bi_disk = head->disk;
+ bio_set_dev(bio, head->disk->part0);
submit_bio_noacct(bio);
}
}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6bad4d4dcdf0..808ab5186928 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
}
abort_req->end_io_data = NULL;
- blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
+ blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio);
/*
* The aborted req will be completed on receiving the abort req.
@@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
req->end_io_data = nvmeq;
init_completion(&nvmeq->delete_done);
- blk_execute_rq_nowait(q, NULL, req, false,
+ blk_execute_rq_nowait(NULL, req, false,
opcode == nvme_admin_delete_cq ?
nvme_del_cq_end : nvme_del_queue_end);
return 0;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b7ce4f221d99..f5ef3edeb2fd 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1468,7 +1468,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
if (unlikely(nr))
goto mr_put;
- nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
+ nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
req->mr->sig_attrs, ns->pi_type);
nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 1dfe9a3500e3..c7e3ec561ba0 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -9,13 +9,7 @@
int nvme_revalidate_zones(struct nvme_ns *ns)
{
- struct request_queue *q = ns->queue;
- int ret;
-
- ret = blk_revalidate_disk_zones(ns->disk, NULL);
- if (!ret)
- blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
- return ret;
+ return blk_revalidate_disk_zones(ns->disk, NULL);
}
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
@@ -109,10 +103,11 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- q->limits.zoned = BLK_ZONED_HM;
+ blk_queue_set_zoned(ns->disk, BLK_ZONED_HM);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
+ blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
free_data:
kfree(id);
return status;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 125dde3f410e..bf6e0ac9ad28 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
u16 nvmet_bdev_flush(struct nvmet_req *req)
{
- if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL))
+ if (blkdev_issue_flush(req->ns->bdev))
return NVME_SC_INTERNAL | NVME_SC_DNR;
return 0;
}
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index b9776fc8f08f..cbc88acdd233 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
schedule_work(&req->p.work);
} else {
rq->end_io_data = req;
- blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
+ blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0,
nvmet_passthru_req_done);
}
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index c7eb9a10c680..28c04a4efa66 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -428,23 +428,15 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device)
static int
dasd_state_ready_to_online(struct dasd_device * device)
{
- struct gendisk *disk;
- struct disk_part_iter piter;
- struct block_device *part;
-
device->state = DASD_STATE_ONLINE;
if (device->block) {
dasd_schedule_block_bh(device->block);
if ((device->features & DASD_FEATURE_USERAW)) {
- disk = device->block->gdp;
- kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ kobject_uevent(&disk_to_dev(device->block->gdp)->kobj,
+ KOBJ_CHANGE);
return 0;
}
- disk = device->block->bdev->bd_disk;
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
- while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
- disk_part_iter_exit(&piter);
+ disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
}
return 0;
}
@@ -455,9 +447,6 @@ dasd_state_ready_to_online(struct dasd_device * device)
static int dasd_state_online_to_ready(struct dasd_device *device)
{
int rc;
- struct gendisk *disk;
- struct disk_part_iter piter;
- struct block_device *part;
if (device->discipline->online_to_ready) {
rc = device->discipline->online_to_ready(device);
@@ -466,13 +455,8 @@ static int dasd_state_online_to_ready(struct dasd_device *device)
}
device->state = DASD_STATE_READY;
- if (device->block && !(device->features & DASD_FEATURE_USERAW)) {
- disk = device->block->bdev->bd_disk;
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
- while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
- disk_part_iter_exit(&piter);
- }
+ if (device->block && !(device->features & DASD_FEATURE_USERAW))
+ disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
return 0;
}
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 299e77ec2c41..da33cb4cba28 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -879,17 +879,13 @@ dcssblk_submit_bio(struct bio *bio)
blk_queue_split(&bio);
bytes_done = 0;
- dev_info = bio->bi_disk->private_data;
+ dev_info = bio->bi_bdev->bd_disk->private_data;
if (dev_info == NULL)
goto fail;
if ((bio->bi_iter.bi_sector & 7) != 0 ||
(bio->bi_iter.bi_size & 4095) != 0)
/* Request is not page-aligned. */
goto fail;
- if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) {
- /* Request beyond end of DCSS segment. */
- goto fail;
- }
/* verify data transfer direction */
if (dev_info->is_shared) {
switch (dev_info->segment_type) {
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index c2536f7767b3..d1ed39162943 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -184,7 +184,7 @@ static unsigned long xpram_highest_page_index(void)
*/
static blk_qc_t xpram_submit_bio(struct bio *bio)
{
- xpram_device_t *xdev = bio->bi_disk->private_data;
+ xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
struct bvec_iter iter;
unsigned int index;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f11f51e2465f..c00f06e9ecb0 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
req->timeout = 10 * HZ;
rq->retries = 5;
- blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
+ blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done);
}
/**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b3f14f05340a..4d2280658559 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
/*
* head injection *required* here otherwise quiesce won't work
*/
- blk_execute_rq(req->q, NULL, req, 1);
+ blk_execute_rq(NULL, req, 1);
/*
* Some devices (USB mass-storage in particular) may transfer
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index cf07b7f93579..03adb39293c2 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -665,12 +665,28 @@ static int sd_zbc_init_disk(struct scsi_disk *sdkp)
return 0;
}
-void sd_zbc_release_disk(struct scsi_disk *sdkp)
+static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp)
{
+ /* Serialize against revalidate zones */
+ mutex_lock(&sdkp->rev_mutex);
+
kvfree(sdkp->zones_wp_offset);
sdkp->zones_wp_offset = NULL;
kfree(sdkp->zone_wp_update_buf);
sdkp->zone_wp_update_buf = NULL;
+
+ sdkp->nr_zones = 0;
+ sdkp->rev_nr_zones = 0;
+ sdkp->zone_blocks = 0;
+ sdkp->rev_zone_blocks = 0;
+
+ mutex_unlock(&sdkp->rev_mutex);
+}
+
+void sd_zbc_release_disk(struct scsi_disk *sdkp)
+{
+ if (sd_is_zoned(sdkp))
+ sd_zbc_clear_zone_info(sdkp);
}
static void sd_zbc_revalidate_zones_cb(struct gendisk *disk)
@@ -769,6 +785,21 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
*/
return 0;
+ /* READ16/WRITE16 is mandatory for ZBC disks */
+ sdkp->device->use_16_for_rw = 1;
+ sdkp->device->use_10_for_rw = 0;
+
+ if (!blk_queue_is_zoned(q)) {
+ /*
+ * This can happen for a host aware disk with partitions.
+ * The block device zone information was already cleared
+ * by blk_queue_set_zoned(). Only clear the scsi disk zone
+ * information and exit early.
+ */
+ sd_zbc_clear_zone_info(sdkp);
+ return 0;
+ }
+
/* Check zoned block device characteristics (unconstrained reads) */
ret = sd_zbc_check_zoned_characteristics(sdkp, buf);
if (ret)
@@ -789,9 +820,13 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
blk_queue_max_active_zones(q, 0);
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
- /* READ16/WRITE16 is mandatory for ZBC disks */
- sdkp->device->use_16_for_rw = 1;
- sdkp->device->use_10_for_rw = 0;
+ /*
+ * Per ZBC and ZAC specifications, writes in sequential write required
+ * zones of host-managed devices must be aligned to the device physical
+ * block size.
+ */
+ if (blk_queue_zoned_model(q) == BLK_ZONED_HM)
+ blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
sdkp->rev_nr_zones = nr_zones;
sdkp->rev_zone_blocks = zone_blocks;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index bfa8d77322d7..4383d93110f8 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -829,8 +829,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
srp->rq->timeout = timeout;
kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
- blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
- srp->rq, at_head, sg_rq_end_io);
+ blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io);
return 0;
}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 43f7624508a9..841ad2fc369a 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
rq->retries = retries;
req->end_io_data = SRpnt;
- blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
+ blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end);
return 0;
}
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index b0cb5b95e892..cce455929778 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -241,6 +241,7 @@ struct target_core_file_cmd {
unsigned long len;
struct se_cmd *cmd;
struct kiocb iocb;
+ struct bio_vec bvecs[];
};
static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
@@ -268,29 +269,22 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
struct target_core_file_cmd *aio_cmd;
struct iov_iter iter = {};
struct scatterlist *sg;
- struct bio_vec *bvec;
ssize_t len = 0;
int ret = 0, i;
- aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL);
+ aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL);
if (!aio_cmd)
return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
- bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL);
- if (!bvec) {
- kfree(aio_cmd);
- return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
- }
-
for_each_sg(sgl, sg, sgl_nents, i) {
- bvec[i].bv_page = sg_page(sg);
- bvec[i].bv_len = sg->length;
- bvec[i].bv_offset = sg->offset;
+ aio_cmd->bvecs[i].bv_page = sg_page(sg);
+ aio_cmd->bvecs[i].bv_len = sg->length;
+ aio_cmd->bvecs[i].bv_offset = sg->offset;
len += sg->length;
}
- iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len);
+ iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len);
aio_cmd->cmd = cmd;
aio_cmd->len = len;
@@ -307,8 +301,6 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
else
ret = call_read_iter(file, &aio_cmd->iocb, &iter);
- kfree(bvec);
-
if (ret != -EIOCBQUEUED)
cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 7994f27e4527..33770e5808ce 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1000,8 +1000,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
req->timeout = PS_TIMEOUT_OTHER;
scsi_req(req)->retries = PS_RETRY;
- blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
- (cmd->sam_task_attr == TCM_HEAD_TAG),
+ blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG),
pscsi_req_done);
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 235b5042672e..ec26179c8062 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -126,7 +126,6 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
}
-EXPORT_SYMBOL(truncate_bdev_range);
static void set_init_blocksize(struct block_device *bdev)
{
@@ -424,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
- nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
if (!nr_pages) {
bool polled = false;
@@ -489,9 +488,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
int nr_pages;
- nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
- if (!nr_pages)
+ if (!iov_iter_count(iter))
return 0;
+
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
@@ -688,7 +688,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- error = blkdev_issue_flush(bdev, GFP_KERNEL);
+ error = blkdev_issue_flush(bdev);
if (error == -EOPNOTSUPP)
error = 0;
@@ -1808,13 +1808,11 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return error;
/*
- * Invalidate again; if someone wandered in and dirtied a page,
- * the caller will be given -EBUSY. The third argument is
- * inclusive, so the rounding here is safe.
+ * Invalidate the page cache again; if someone wandered in and dirtied
+ * a page, we just discard it - userspace has no way of knowing whether
+ * the write happened before or after discard completing...
*/
- return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
+ return truncate_bdev_range(bdev, file->f_mode, start, end);
}
const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 6ff44e53814c..113cb85c1fd4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
* btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i = 0;
@@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
+ pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk);
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
@@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_disk);
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5394641541f7..8ec34ecb6d68 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1104,8 +1104,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && !last->bi_status &&
- last->bi_disk == stripe->dev->bdev->bd_disk &&
- last->bi_partno == stripe->dev->bdev->bd_partno) {
+ last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
return 0;
@@ -1356,9 +1355,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
- stripe->dev->bdev &&
- bio->bi_disk == stripe->dev->bdev->bd_disk &&
- bio->bi_partno == stripe->dev->bdev->bd_partno) {
+ stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 310fce00fcda..582df11d298a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1728,7 +1728,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_disk);
+ WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8fab44394f5..bc3b33efddc5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
- dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 9a5cf153da89..d0eb0c8d6269 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1275,8 +1275,8 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
return;
ordered->physical = physical;
- ordered->disk = bio->bi_disk;
- ordered->partno = bio->bi_partno;
+ ordered->disk = bio->bi_bdev->bd_disk;
+ ordered->partno = bio->bi_bdev->bd_partno;
btrfs_put_ordered_extent(ordered);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d53fa92a1ab6..aa1083ecd623 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -426,6 +426,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
unsigned long flags;
bio->bi_private = dio;
+ /* don't account direct I/O as memory stall */
+ bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
@@ -434,7 +436,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- dio->bio_disk = bio->bi_disk;
+ dio->bio_disk = bio->bi_bdev->bd_disk;
if (sdio->submit_io) {
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a92478eabfa4..183ffdf4d43c 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
const struct file_operations exfat_file_operations = {
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0a14a7c87bf8..6e8208acfc62 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
if (sbi->s_fc_bytes == 0) {
@@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
out:
iput(inode);
if (!ret)
- blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(sb->s_bdev);
return 0;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 113bfb023a4a..027a7d7037a0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
if (needs_barrier) {
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b215c564bc31..20f2fcb799f5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1583,7 +1583,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+ blkdev_issue_flush(sb->s_bdev);
skip_zeroout:
ext4_lock_group(sb, group);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a6f9875aa34..fb5985102c1d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5709,7 +5709,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1ee966a63df9..b9721c8f116c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -49,27 +49,6 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
- unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
-}
-
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
-{
- if (noio) {
- /* No failure on bio allocation */
- return __f2fs_bio_alloc(GFP_NOIO, npages);
- }
-
- if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
- return NULL;
- }
-
- return __f2fs_bio_alloc(GFP_KERNEL, npages);
-}
-
static bool __is_cp_guaranteed(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -398,22 +377,12 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
-static bool __same_bdev(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio)
-{
- struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL);
- return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
-}
-
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
struct bio *bio;
- bio = f2fs_bio_alloc(sbi, npages, true);
+ bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
f2fs_target_device(sbi, fio->new_blkaddr, bio);
if (is_read_io(fio->op)) {
@@ -711,7 +680,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
return false;
if (last_blkaddr + 1 != cur_blkaddr)
return false;
- return __same_bdev(sbi, cur_blkaddr, bio);
+ return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL);
}
static bool io_type_is_mergeable(struct f2fs_bio_info *io,
@@ -999,8 +968,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
struct bio_post_read_ctx *ctx;
unsigned int post_read_steps = 0;
- bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
- for_write);
+ bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
+ min_t(int, nr_pages, BIO_MAX_PAGES),
+ &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2860003a09ed..506c801880f3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -43,7 +43,6 @@ enum {
FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
FAULT_PAGE_GET,
- FAULT_ALLOC_BIO,
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
@@ -3482,7 +3481,6 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 440634dfaa56..993004f06a77 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -563,17 +563,7 @@ do_sync:
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
struct block_device *bdev)
{
- struct bio *bio;
- int ret;
-
- bio = f2fs_bio_alloc(sbi, 0, false);
- if (!bio)
- return -ENOMEM;
-
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
- bio_set_dev(bio, bdev);
- ret = submit_bio_wait(bio);
- bio_put(bio);
+ int ret = blkdev_issue_flush(bdev);
trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
test_opt(sbi, FLUSH_MERGE), ret);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 30d5abef4361..4acfa7d36731 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -46,7 +46,6 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_KVMALLOC] = "kvmalloc",
[FAULT_PAGE_ALLOC] = "page alloc",
[FAULT_PAGE_GET] = "page get",
- [FAULT_ALLOC_BIO] = "alloc bio",
[FAULT_ALLOC_NID] = "alloc nid",
[FAULT_ORPHAN] = "orphan",
[FAULT_BLOCK] = "no more block",
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..5fee74f1ad61 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index e3da9e96b835..ca464328b79c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
}
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(inode->i_sb->s_bdev);
inode_unlock(inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 807119ae5adf..b9e3db3f855f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -239,7 +239,7 @@ out:
mutex_unlock(&sbi->vh_mutex);
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(sb->s_bdev);
return error;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e4c258a2cefc..e2c4991833b8 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -279,11 +279,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
orig_count = iov_iter_count(dio->submit.iter);
iov_iter_truncate(dio->submit.iter, length);
- nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
- if (nr_pages <= 0) {
- ret = nr_pages;
+ if (!iov_iter_count(dio->submit.iter))
goto out;
- }
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
@@ -299,6 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
do {
size_t n;
if (dio->error) {
@@ -339,7 +337,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
dio->size += n;
copied += n;
- nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
+ BIO_MAX_PAGES);
iomap_dio_submit_bio(dio, iomap, bio, pos);
pos += n;
} while (nr_pages);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 472932b9e6bc..63b526d44886 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b121d7d434c6..3cc4ab2ba7f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -825,7 +825,7 @@ start_journal_io:
if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
/* Done it all: now write the commit record asynchronously. */
if (jbd2_has_feature_async_commit(journal)) {
@@ -932,7 +932,7 @@ start_journal_io:
stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
- blkdev_issue_flush(journal->j_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_dev);
}
if (err)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index dc0694fcfcd1..69f18fe20923 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal)
err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
- err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL);
+ err2 = blkdev_issue_flush(journal->j_fs_dev);
if (!err)
err = err2;
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 79721571e014..abf7674fb437 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
err = __generic_file_fsync(file, start, end, datasync);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3be6836074ae..1a96ce28efb0 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -123,11 +123,6 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
npg = min(npg, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, npg);
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (npg /= 2))
- bio = bio_alloc(GFP_NOIO, npg);
- }
-
if (bio) {
bio->bi_iter.bi_sector = disk_sector;
bio_set_dev(bio, bdev);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index a07c39c94bbd..1058659a8d31 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -254,7 +254,7 @@ again:
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- blk_execute_rq(rq->q, NULL, rq, 1);
+ blk_execute_rq(NULL, rq, 1);
if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
req->result);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1a8729eded8b..1e75417bfe6e 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -386,10 +386,6 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
struct bio *bio;
bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (bio == NULL) {
- while (!bio && (nr_vecs >>= 1))
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- }
if (likely(bio)) {
bio_set_dev(bio, nilfs->ns_bdev);
bio->bi_iter.bi_sector =
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index b55cdeb4d169..987c8ab02aee 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs)
*/
smp_wmb();
- err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(nilfs->ns_bdev);
if (err != -EIO)
err = 0;
return err;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 85979e2214b3..df6d709d2ae3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
needs_barrier = true;
err = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!err)
err = ret;
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 0b641ae694f1..1db0254bc38b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
barrier_done = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(inode->i_sb->s_bdev);
inode_unlock(inode);
if (barrier_done < 0)
return barrier_done;
diff --git a/fs/splice.c b/fs/splice.c
index b06846f1e6ee..5dbce4dcc1a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -662,12 +662,14 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* build the vector */
left = sd.total_len;
- for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
+ for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
- if (this_len > left)
- this_len = left;
+ /* zero-length bvecs are not supported, skip them */
+ if (!this_len)
+ continue;
+ this_len = min(this_len, left);
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
@@ -680,6 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
array[n].bv_len = this_len;
array[n].bv_offset = buf->offset;
left -= this_len;
+ n++;
}
iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
diff --git a/fs/super.c b/fs/super.c
index 2c6cdea2ab2d..5a1f384ffc74 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -865,7 +865,8 @@ int reconfigure_super(struct fs_context *fc)
if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
- if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+ if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
+ bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 21b1d034aca3..586d42342a79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -343,7 +343,7 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+ blkdev_issue_flush(buftarg->bt_bdev);
}
STATIC void
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 0e7ab0bc00ae..a29653c0196b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
ret = file_write_and_wait_range(file, start, end);
if (!ret)
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (ret)
zonefs_io_error(inode, true);
@@ -678,7 +678,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+ bio = bio_alloc(GFP_NOFS, nr_pages);
if (!bio)
return -ENOMEM;
@@ -1581,12 +1581,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
/*
- * The block size is set to the device physical sector size to ensure
- * that write operations on 512e devices (512B logical block and 4KB
- * physical block) are always aligned to the device physical blocks,
- * as mandated by the ZBC/ZAC specifications.
+ * The block size is set to the device zone write granularity to ensure
+ * that write operations are always aligned according to the device
+ * interface constraints.
*/
- sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
+ sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
sbi->s_uid = GLOBAL_ROOT_UID;
sbi->s_gid = GLOBAL_ROOT_GID;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index de62911473bb..5b468f2242ff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -10,6 +10,7 @@
#include <linux/ioprio.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#include <linux/blk_types.h>
+#include <linux/uio.h>
#define BIO_DEBUG
@@ -328,7 +329,6 @@ struct bio_integrity_payload {
struct bvec_iter bip_iter;
- unsigned short bip_slab; /* slab the bip came from */
unsigned short bip_vcnt; /* # of integrity bio_vecs */
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
unsigned short bip_flags; /* control flags */
@@ -406,7 +406,9 @@ extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);
extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);
-extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
+struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
+ struct bio_set *bs);
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
@@ -414,16 +416,11 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
extern struct bio_set fs_bio_set;
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
{
return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
}
-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
extern blk_qc_t submit_bio(struct bio *);
extern void bio_endio(struct bio *);
@@ -441,6 +438,18 @@ static inline void bio_wouldblock_error(struct bio *bio)
bio_endio(bio);
}
+/*
+ * Calculate number of bvec segments that should be allocated to fit data
+ * pointed by @iter. If @iter is backed by bvec it's going to be reused
+ * instead of allocating a new one.
+ */
+static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return 0;
+ return iov_iter_npages(iter, max_segs);
+}
+
struct request_queue;
extern int submit_bio_wait(struct bio *bio);
@@ -480,29 +489,26 @@ static inline void zero_fill_bio(struct bio *bio)
zero_fill_bio_iter(bio, bio->bi_iter);
}
-extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
-extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
-extern unsigned int bvec_nr_vecs(unsigned short idx);
extern const char *bio_devname(struct bio *bio, char *buffer);
-#define bio_set_dev(bio, bdev) \
-do { \
- if ((bio)->bi_disk != (bdev)->bd_disk) \
- bio_clear_flag(bio, BIO_THROTTLED);\
- (bio)->bi_disk = (bdev)->bd_disk; \
- (bio)->bi_partno = (bdev)->bd_partno; \
- bio_associate_blkg(bio); \
+#define bio_set_dev(bio, bdev) \
+do { \
+ bio_clear_flag(bio, BIO_REMAPPED); \
+ if ((bio)->bi_bdev != (bdev)) \
+ bio_clear_flag(bio, BIO_THROTTLED); \
+ (bio)->bi_bdev = (bdev); \
+ bio_associate_blkg(bio); \
} while (0)
#define bio_copy_dev(dst, src) \
do { \
- (dst)->bi_disk = (src)->bi_disk; \
- (dst)->bi_partno = (src)->bi_partno; \
+ bio_clear_flag(dst, BIO_REMAPPED); \
+ (dst)->bi_bdev = (src)->bi_bdev; \
bio_clone_blkg_association(dst, src); \
} while (0)
#define bio_dev(bio) \
- disk_devt((bio)->bi_disk)
+ disk_devt((bio)->bi_bdev->bd_disk)
#ifdef CONFIG_BLK_CGROUP
void bio_associate_blkg(struct bio *bio);
@@ -705,6 +711,7 @@ struct bio_set {
mempool_t bvec_integrity_pool;
#endif
+ unsigned int back_pad;
/*
* Deadlock avoidance for stacking block drivers: see comments in
* bio_alloc_bioset() for details
@@ -715,12 +722,6 @@ struct bio_set {
struct workqueue_struct *rescue_workqueue;
};
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
-
static inline bool bioset_initialized(struct bio_set *bs)
{
return bs->bio_slab != NULL;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d705b174d346..aabbf6830ffc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,10 +140,6 @@ struct blk_mq_hw_ctx {
* shared across request queues.
*/
atomic_t nr_active;
- /**
- * @elevator_queued: Number of queued requests on hctx.
- */
- atomic_t elevator_queued;
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
@@ -602,8 +598,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
rq->bio = rq->biotail = bio;
rq->ioprio = bio_prio(bio);
- if (bio->bi_disk)
- rq->rq_disk = bio->bi_disk;
+ if (bio->bi_bdev)
+ rq->rq_disk = bio->bi_bdev->bd_disk;
}
blk_qc_t blk_mq_submit_bio(struct bio *bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 866f74261b3b..db026b6ec15a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -222,16 +222,15 @@ static inline void bio_issue_init(struct bio_issue *issue,
*/
struct bio {
struct bio *bi_next; /* request queue link */
- struct gendisk *bi_disk;
+ struct block_device *bi_bdev;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.
*/
- unsigned short bi_flags; /* status, etc and bvec pool number */
+ unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
unsigned short bi_write_hint;
blk_status_t bi_status;
- u8 bi_partno;
atomic_t __bi_remaining;
struct bvec_iter bi_iter;
@@ -304,36 +303,10 @@ enum {
* of this bio. */
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
BIO_TRACKED, /* set if bio goes through the rq_qos path */
+ BIO_REMAPPED,
BIO_FLAG_LAST
};
-/* See BVEC_POOL_OFFSET below before adding new flags */
-
-/*
- * We support 6 different bvec pools, the last one is magic in that it
- * is backed by a mempool.
- */
-#define BVEC_POOL_NR 6
-#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
-
-/*
- * Top 3 bits of bio flags indicate the pool the bvecs came from. We add
- * 1 to the actual index so that 0 indicates that there are no bvecs to be
- * freed.
- */
-#define BVEC_POOL_BITS (3)
-#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
-#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
-#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
-# error "BVEC_POOL_BITS is too small"
-#endif
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * only BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS BVEC_POOL_OFFSET
-
typedef __u32 __bitwise blk_mq_req_flags_t;
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..9149f4a5adb3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -337,6 +337,7 @@ struct queue_limits {
unsigned int max_zone_append_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
+ unsigned int zone_write_granularity;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -948,9 +949,8 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
struct rq_map_data *, const struct iov_iter *,
gfp_t);
-extern void blk_execute_rq(struct request_queue *, struct gendisk *,
- struct request *, int);
-extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq(struct gendisk *, struct request *, int);
+extern void blk_execute_rq_nowait(struct gendisk *,
struct request *, int, rq_end_io_fn *);
/* Helper to convert REQ_OP_XXX to its string format XXX */
@@ -1161,6 +1161,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
+void blk_queue_zone_write_granularity(struct request_queue *q,
+ unsigned int size);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
void blk_queue_update_readahead(struct request_queue *q);
@@ -1289,7 +1291,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
!list_empty(&plug->cb_list));
}
-int blkdev_issue_flush(struct block_device *, gfp_t);
+int blkdev_issue_flush(struct block_device *bdev);
long nr_blockdev_pages(void);
#else /* CONFIG_BLOCK */
struct blk_plug {
@@ -1317,7 +1319,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
return false;
}
-static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+static inline int blkdev_issue_flush(struct block_device *bdev)
{
return 0;
}
@@ -1474,6 +1476,18 @@ static inline int bdev_io_opt(struct block_device *bdev)
return queue_io_opt(bdev_get_queue(bdev));
}
+static inline unsigned int
+queue_zone_write_granularity(const struct request_queue *q)
+{
+ return q->limits.zone_write_granularity;
+}
+
+static inline unsigned int
+bdev_zone_write_granularity(struct block_device *bdev)
+{
+ return queue_zone_write_granularity(bdev_get_queue(bdev));
+}
+
static inline int queue_alignment_offset(const struct request_queue *q)
{
if (q->limits.misaligned)
@@ -1954,21 +1968,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time);
-unsigned long part_start_io_acct(struct gendisk *disk,
- struct block_device **part, struct bio *bio);
-void part_end_io_acct(struct block_device *part, struct bio *bio,
- unsigned long start_time);
-
-/**
- * bio_start_io_acct - start I/O accounting for bio based drivers
- * @bio: bio to start account for
- *
- * Returns the start time that should be passed back to bio_end_io_acct().
- */
-static inline unsigned long bio_start_io_acct(struct bio *bio)
-{
- return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio));
-}
+unsigned long bio_start_io_acct(struct bio *bio);
+void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+ struct block_device *orig_bdev);
/**
* bio_end_io_acct - end I/O accounting for bio based drivers
@@ -1977,7 +1979,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio)
*/
static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
{
- return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
+ return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
}
int bdev_read_only(struct block_device *bdev);
@@ -2012,21 +2014,16 @@ void bdev_add(struct block_device *bdev, dev_t dev);
struct block_device *I_BDEV(struct inode *inode);
struct block_device *bdgrab(struct block_device *bdev);
void bdput(struct block_device *);
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+ loff_t lend);
#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
-int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
- loff_t lend);
int sync_blockdev(struct block_device *bdev);
#else
static inline void invalidate_bdev(struct block_device *bdev)
{
}
-static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
- loff_t lstart, loff_t lend)
-{
- return 0;
-}
static inline int sync_blockdev(struct block_device *bdev)
{
return 0;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index bacc40a0bdf3..1fe8e105b83b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
/* Supports zoned block devices sequential write constraint */
#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE (1U << 1)
#endif /* CONFIG_BLOCK */
#endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 809aaa32d53c..f364619092cc 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -32,6 +32,7 @@ extern struct class block_class;
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/workqueue.h>
+#include <linux/xarray.h>
#define PARTITION_META_INFO_VOLNAMELTH 64
/*
@@ -116,13 +117,6 @@ enum {
DISK_EVENT_FLAG_UEVENT = 1 << 1,
};
-struct disk_part_tbl {
- struct rcu_head rcu_head;
- int len;
- struct block_device __rcu *last_lookup;
- struct block_device __rcu *part[];
-};
-
struct disk_events;
struct badblocks;
@@ -148,12 +142,7 @@ struct gendisk {
unsigned short events; /* supported events */
unsigned short event_flags; /* flags related to event processing */
- /* Array of pointers to partitions indexed by partno.
- * Protected with matching bdev lock but stat and other
- * non-critical accesses use RCU. Always access through
- * helpers.
- */
- struct disk_part_tbl __rcu *part_tbl;
+ struct xarray part_tbl;
struct block_device *part0;
const struct block_device_operations *fops;
@@ -163,6 +152,7 @@ struct gendisk {
int flags;
unsigned long state;
#define GD_NEED_PART_SCAN 0
+#define GD_READ_ONLY 1
struct kobject *slave_dir;
struct timer_rand_state *random;
@@ -212,10 +202,11 @@ static inline dev_t disk_devt(struct gendisk *disk)
return MKDEV(disk->major, disk->first_minor);
}
+void disk_uevent(struct gendisk *disk, enum kobject_action action);
+
/*
* Smarter partition iterator without context limits.
*/
-#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */
#define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */
#define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */
#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */
@@ -223,7 +214,7 @@ static inline dev_t disk_devt(struct gendisk *disk)
struct disk_part_iter {
struct gendisk *disk;
struct block_device *part;
- int idx;
+ unsigned long idx;
unsigned int flags;
};
@@ -231,7 +222,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter,
struct gendisk *disk, unsigned int flags);
struct block_device *disk_part_iter_next(struct disk_part_iter *piter);
extern void disk_part_iter_exit(struct disk_part_iter *piter);
-extern bool disk_has_partitions(struct gendisk *disk);
/* block/genhd.c */
extern void device_add_disk(struct device *parent, struct gendisk *disk,
@@ -249,11 +239,12 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
extern void del_gendisk(struct gendisk *gp);
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
-extern void set_disk_ro(struct gendisk *disk, int flag);
+void set_disk_ro(struct gendisk *disk, bool read_only);
static inline int get_disk_ro(struct gendisk *disk)
{
- return disk->part0->bd_read_only;
+ return disk->part0->bd_read_only ||
+ test_bit(GD_READ_ONLY, &disk->state);
}
extern void disk_block_events(struct gendisk *disk);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 596bc2f4d9b0..3f1f7ae0fbe9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -468,7 +468,6 @@ extern int free_swap_and_cache(swp_entry_t);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
-extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
extern int __swap_count(swp_entry_t entry);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb0fe4c66b84..9e9ee4945043 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
@@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore,
static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
+ 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
+ 0);
}
static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
@@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
sector_t from)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f0b2ccb1bb01..d8ca336ff9cd 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -72,8 +72,6 @@
__start.bi_bvec_done = skip; \
__start.bi_idx = 0; \
for_each_bvec(__v, i->bvec, __bi, __start) { \
- if (!__v.bv_len) \
- continue; \
(void)(STEP); \
} \
}
@@ -1069,6 +1067,21 @@ static void pipe_advance(struct iov_iter *i, size_t size)
pipe_truncate(i);
}
+static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
+{
+ struct bvec_iter bi;
+
+ bi.bi_size = i->count;
+ bi.bi_bvec_done = i->iov_offset;
+ bi.bi_idx = 0;
+ bvec_iter_advance(i->bvec, &bi, size);
+
+ i->bvec += bi.bi_idx;
+ i->nr_segs -= bi.bi_idx;
+ i->count = bi.bi_size;
+ i->iov_offset = bi.bi_bvec_done;
+}
+
void iov_iter_advance(struct iov_iter *i, size_t size)
{
if (unlikely(iov_iter_is_pipe(i))) {
@@ -1079,6 +1092,10 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
i->count -= size;
return;
}
+ if (iov_iter_is_bvec(i)) {
+ iov_iter_bvec_advance(i, size);
+ return;
+ }
iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
diff --git a/mm/page_io.c b/mm/page_io.c
index 9bca17ecc4df..92f7941c6d01 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,25 +26,6 @@
#include <linux/uio.h>
#include <linux/sched/task.h>
-static struct bio *get_swap_bio(gfp_t gfp_flags,
- struct page *page, bio_end_io_t end_io)
-{
- struct bio *bio;
-
- bio = bio_alloc(gfp_flags, 1);
- if (bio) {
- struct block_device *bdev;
-
- bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
- bio->bi_end_io = end_io;
-
- bio_add_page(bio, page, thp_size(page), 0);
- }
- return bio;
-}
-
void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- bio = get_swap_bio(GFP_NOIO, page, end_write_func);
- if (bio == NULL) {
- set_page_dirty(page);
- unlock_page(page);
- return -ENOMEM;
- }
+ bio = bio_alloc(GFP_NOIO, 1);
+ bio_set_dev(bio, sis->bdev);
+ bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio->bi_end_io = end_write_func;
+ bio_add_page(bio, page, thp_size(page), 0);
+
bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
@@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous)
}
ret = 0;
- bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
- }
- disk = bio->bi_disk;
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio_set_dev(bio, sis->bdev);
+ bio->bi_opf = REQ_OP_READ;
+ bio->bi_iter.bi_sector = swap_page_sector(page);
+ bio->bi_end_io = end_swap_bio_read;
+ bio_add_page(bio, page, thp_size(page), 0);
+
+ disk = bio->bi_bdev->bd_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (synchronous) {
bio->bi_opf |= REQ_HIPRI;
get_task_struct(current);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9fffc5af29d1..21a98cb8d646 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,7 +47,6 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -1850,12 +1849,13 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct block_device *bdev;
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
return 0;
- return map_swap_entry(swp_entry(type, offset), &bdev);
+ se = offset_to_swap_extent(si, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -2282,36 +2282,6 @@ static void drain_mmlist(void)
}
/*
- * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset for the specified swap entry.
- * Note that the type of this function is sector_t, but it returns page offset
- * into the bdev, not sector offset.
- */
-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
-{
- struct swap_info_struct *sis;
- struct swap_extent *se;
- pgoff_t offset;
-
- sis = swp_swap_info(entry);
- *bdev = sis->bdev;
-
- offset = swp_offset(entry);
- se = offset_to_swap_extent(sis, offset);
- return se->start_block + (offset - se->start_page);
-}
-
-/*
- * Returns the page offset into bdev for the specified page's swap entry.
- */
-sector_t map_swap_page(struct page *page, struct block_device **bdev)
-{
- swp_entry_t entry;
- entry.val = page_private(page);
- return map_swap_entry(entry, bdev);
-}
-
-/*
* Free all of a swapdev's extent information
*/
static void destroy_swap_extents(struct swap_info_struct *sis)