From b4fd63f42647110c963d4bfcd526ac48f5a5faff Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 20 Apr 2020 09:24:54 -0700 Subject: Revert "scsi: core: run queue if SCSI device queue isn't ready and queue is idle" This reverts commit 7e70aa789d4a0c89dbfbd2c8a974a4df717475ec. Now that we have the patches ("blk-mq: In blk_mq_dispatch_rq_list() "no budget" is a reason to kick") and ("blk-mq: Rerun dispatching in the case of budget contention") we should no longer need the fix in the SCSI code. Revert it, resolving conflicts with other patches that have touched this code. With this revert (and the two new patches) I can run the script that was in commit 7e70aa789d4a ("scsi: core: run queue if SCSI device queue isn't ready and queue is idle") in a loop with no failure. If I do this revert without the two new patches I can easily get a failure. Signed-off-by: Douglas Anderson Reviewed-by: Ming Lei Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/scsi/scsi_lib.c') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 47835c4b4ee0..ea18f618dc66 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1610,12 +1610,7 @@ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - if (scsi_dev_queue_ready(q, sdev)) - return true; - - if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) - blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); - return false; + return scsi_dev_queue_ready(q, sdev); } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3 From 0475bd6c65976c390e3805a1e5f10fc30ca8def2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:23 +0200 Subject: scsi: merge scsi_init_sgtable into scsi_init_io scsi_init_io is the only caller of scsi_init_sgtable. Merge the two function to make upcoming changes a little easier. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'drivers/scsi/scsi_lib.c') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ea18f618dc66..4e42acbb3f32 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -978,30 +978,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } -static blk_status_t scsi_init_sgtable(struct request *req, - struct scsi_data_buffer *sdb) -{ - int count; - - /* - * If sg table allocation fails, requeue request later. - */ - if (unlikely(sg_alloc_table_chained(&sdb->table, - blk_rq_nr_phys_segments(req), sdb->table.sgl, - SCSI_INLINE_SG_CNT))) - return BLK_STS_RESOURCE; - - /* - * Next, walk the list, and fill in the addresses and sizes of - * each segment. - */ - count = blk_rq_map_sg(req->q, req, sdb->table.sgl); - BUG_ON(count > sdb->table.nents); - sdb->table.nents = count; - sdb->length = blk_rq_payload_bytes(req); - return BLK_STS_OK; -} - /* * Function: scsi_init_io() * @@ -1017,17 +993,31 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; blk_status_t ret; + int count; if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) return BLK_STS_IOERR; - ret = scsi_init_sgtable(rq, &cmd->sdb); - if (ret) - return ret; + /* + * If sg table allocation fails, requeue request later. 
+ */ + if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, + blk_rq_nr_phys_segments(rq), cmd->sdb.table.sgl, + SCSI_INLINE_SG_CNT))) + return BLK_STS_RESOURCE; + + /* + * Next, walk the list, and fill in the addresses and sizes of + * each segment. + */ + count = blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl); + BUG_ON(count > cmd->sdb.table.nents); + cmd->sdb.table.nents = count; + cmd->sdb.length = blk_rq_payload_bytes(rq); if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; - int ivecs, count; + int ivecs; if (WARN_ON_ONCE(!prot_sdb)) { /* -- cgit v1.2.3 From cc97923a5bccc776851c242b61015faf288d5c22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:24 +0200 Subject: block: move dma drain handling to scsi Don't burden the common block code with with specifics of the libata DMA draining mechanism. Instead move most of the code to the scsi midlayer. That also means the nr_phys_segments adjustments in the blk-mq fast path can go away entirely, given that SCSI never looks at nr_phys_segments after mapping the request to a scatterlist. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 14 -------------- block/blk-mq.c | 11 ----------- block/blk-settings.c | 37 ------------------------------------- drivers/ata/libata-scsi.c | 28 ++++++++++------------------ drivers/scsi/scsi_lib.c | 39 ++++++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 7 ------- include/linux/libata.h | 2 ++ include/scsi/scsi_device.h | 3 +++ include/scsi/scsi_host.h | 7 +++++++ 9 files changed, 56 insertions(+), 92 deletions(-) (limited to 'drivers/scsi/scsi_lib.c') diff --git a/block/blk-merge.c b/block/blk-merge.c index ee618cdb141e..25f5a5e00ee6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -539,20 +539,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, rq->extra_len += pad_len; } - if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if (op_is_write(req_op(rq))) - memset(q->dma_drain_buffer, 0, q->dma_drain_size); - - sg_unmark_end(*last_sg); - *last_sg = sg_next(*last_sg); - sg_set_page(*last_sg, virt_to_page(q->dma_drain_buffer), - q->dma_drain_size, - ((unsigned long)q->dma_drain_buffer) & - (PAGE_SIZE - 1)); - nsegs++; - rq->extra_len += q->dma_drain_size; - } - if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq.c b/block/blk-mq.c index cf95e8e0881a..2c105cb2a75b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -667,15 +667,6 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * Make sure space for the drain appears. We know we can do - * this because max_hw_segments has been adjusted to be one - * fewer than the device can handle. 
- */ - rq->nr_phys_segments++; - } - #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); @@ -695,8 +686,6 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; } } diff --git a/block/blk-settings.c b/block/blk-settings.c index 14397b4c4b53..2ab1967b9716 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -651,43 +651,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) } EXPORT_SYMBOL(blk_queue_update_dma_pad); -/** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * @q: the request queue for the device - * @dma_drain_needed: fn which returns non-zero if drain is necessary - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for appending - * the drain buffer. If you call blk_queue_max_segments() after calling - * this routine, you must set the limit to one fewer than your device - * can support otherwise there won't be room for the drain buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size) -{ - if (queue_max_segments(q) < 2) - return -EINVAL; - /* make room for appending the drain */ - blk_queue_max_segments(q, queue_max_segments(q) - 1); - q->dma_drain_needed = dma_drain_needed; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - /** * blk_queue_segment_boundary - set boundary rules for segment merging * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 36e588d88b95..feb13b8f93d7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1017,16 +1017,11 @@ void ata_scsi_sdev_config(struct scsi_device *sdev) * RETURNS: * 1 if ; otherwise, 0. 
*/ -static int atapi_drain_needed(struct request *rq) +bool ata_scsi_dma_need_drain(struct request *rq) { - if (likely(!blk_rq_is_passthrough(rq))) - return 0; - - if (!blk_rq_bytes(rq) || op_is_write(req_op(rq))) - return 0; - return atapi_cmd_type(scsi_req(rq)->cmd[0]) == ATAPI_MISC; } +EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain); int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) { @@ -1039,21 +1034,21 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) blk_queue_max_hw_sectors(q, dev->max_sectors); if (dev->class == ATA_DEV_ATAPI) { - void *buf; - sdev->sector_size = ATA_SECT_SIZE; /* set DMA padding */ blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); - /* configure draining */ - buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); - if (!buf) { + /* make room for appending the drain */ + blk_queue_max_segments(q, queue_max_segments(q) - 1); + + sdev->dma_drain_len = ATAPI_MAX_DRAIN; + sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, + q->bounce_gfp | GFP_KERNEL); + if (!sdev->dma_drain_buf) { ata_dev_err(dev, "drain buffer allocation failed\n"); return -ENOMEM; } - - blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN); } else { sdev->sector_size = ata_id_logical_sector_size(dev->id); sdev->manage_start_stop = 1; @@ -1135,7 +1130,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_slave_config); void ata_scsi_slave_destroy(struct scsi_device *sdev) { struct ata_port *ap = ata_shost_to_port(sdev->host); - struct request_queue *q = sdev->request_queue; unsigned long flags; struct ata_device *dev; @@ -1152,9 +1146,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev) } spin_unlock_irqrestore(ap->lock, flags); - kfree(q->dma_drain_buffer); - q->dma_drain_buffer = NULL; - q->dma_drain_size = 0; + kfree(sdev->dma_drain_buf); } EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 4e42acbb3f32..88cac92fc153 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -978,6 +978,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } +static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, + struct request *rq) +{ + return sdev->dma_drain_len && blk_rq_is_passthrough(rq) && + !op_is_write(req_op(rq)) && + sdev->host->hostt->dma_need_drain(rq); +} + /* * Function: scsi_init_io() * @@ -991,26 +999,47 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) */ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { + struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; + unsigned short nr_segs = blk_rq_nr_phys_segments(rq); + struct scatterlist *last_sg = NULL; blk_status_t ret; + bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq); int count; - if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) + if (WARN_ON_ONCE(!nr_segs)) return BLK_STS_IOERR; + /* + * Make sure there is space for the drain. The driver must adjust + * max_hw_segments to be prepared for this. + */ + if (need_drain) + nr_segs++; + /* * If sg table allocation fails, requeue request later. */ - if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, - blk_rq_nr_phys_segments(rq), cmd->sdb.table.sgl, - SCSI_INLINE_SG_CNT))) + if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs, + cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT))) return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. 
*/ - count = blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl); + count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + + if (need_drain) { + sg_unmark_end(last_sg); + last_sg = sg_next(last_sg); + sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); + sg_mark_end(last_sg); + + rq->extra_len += sdev->dma_drain_len; + count++; + } + BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 496dc9491026..8e4726bce498 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); struct bio_vec; -typedef int (dma_drain_needed_fn)(struct request *); enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ @@ -397,7 +396,6 @@ struct request_queue { struct rq_qos *rq_qos; make_request_fn *make_request_fn; - dma_drain_needed_fn *dma_drain_needed; const struct blk_mq_ops *mq_ops; @@ -467,8 +465,6 @@ struct request_queue { */ unsigned long nr_requests; /* Max # of requests */ - unsigned int dma_drain_size; - void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; @@ -1097,9 +1093,6 @@ extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -extern int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_dma_alignment(struct request_queue *, int); diff --git a/include/linux/libata.h b/include/linux/libata.h index cffa4714bfa8..af832852e620 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1092,6 +1092,7 @@ extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, #define ATA_SCSI_COMPAT_IOCTL /* empty */ #endif extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd); +bool ata_scsi_dma_need_drain(struct request *rq); extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, unsigned int cmd, void __user *arg); extern bool ata_link_online(struct ata_link *link); @@ -1387,6 +1388,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ ATA_SCSI_COMPAT_IOCTL \ .queuecommand = ata_scsi_queuecmd, \ + .dma_need_drain = ata_scsi_dma_need_drain, \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index c3cba2aaf934..bc5909033d13 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -229,6 +229,9 @@ struct scsi_device { struct scsi_device_handler *handler; void *handler_data; + size_t dma_drain_len; + void *dma_drain_buf; + unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 822e8cda8d9b..46ef8cccc982 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -270,6 +270,13 @@ struct scsi_host_template { */ int (* map_queues)(struct Scsi_Host *shost); + /* + * Check if scatterlists need to be padded for DMA 
draining. + * + * Status: OPTIONAL + */ + bool (* dma_need_drain)(struct request *rq); + /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by -- cgit v1.2.3 From bdf8710d69f82ee6fd41b0166300c3306898b3c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:25 +0200 Subject: block: move dma_pad handling from blk_rq_map_sg into the callers There are only two callers of blk_rq_map_sg/__blk_rq_map_sg that set the dma_pad value in the queue. Move the handling into those callers instead of burdening the common code, and move the ->extra_len field from struct request to struct scsi_cmnd. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-merge.c | 8 -------- block/blk-mq.c | 1 - drivers/ata/libata-scsi.c | 2 +- drivers/ide/ide-io.c | 7 +++++-- drivers/scsi/scsi_lib.c | 10 +++++++++- include/linux/blkdev.h | 2 -- include/scsi/scsi_cmnd.h | 1 + 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/scsi/scsi_lib.c') diff --git a/block/blk-core.c b/block/blk-core.c index 7e4a1da0715e..311596d5dbc4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1638,7 +1638,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; - rq->extra_len = rq_src->extra_len; return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index 25f5a5e00ee6..c49eb3bdd0be 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -531,14 +531,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); - if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { - unsigned int pad_len = - (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - - (*last_sg)->length += pad_len; - rq->extra_len += pad_len; - } - if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c105cb2a75b..71d0894ce1c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -318,7 +318,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->nr_integrity_segments = 0; #endif /* tag was already set */ - rq->extra_len = 0; WRITE_ONCE(rq->deadline, 0); rq->timeout = 0; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index feb13b8f93d7..435781a16875 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -649,7 +649,7 @@ static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc) { struct scsi_cmnd *scmd = qc->scsicmd; - qc->extrabytes = scmd->request->extra_len; + qc->extrabytes = scmd->extra_len; qc->nbytes = scsi_bufflen(scmd) + qc->extrabytes; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index b137f27a34d5..c31f1d2b3b07 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -233,10 +233,13 @@ static ide_startstop_t do_special(ide_drive_t *drive) void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct scatterlist *sg = hwif->sg_table; + struct scatterlist *sg = hwif->sg_table, *last_sg = NULL; struct request *rq = cmd->rq; - cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); + cmd->sg_nents = __blk_rq_map_sg(drive->queue, rq, sg, &last_sg); + if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & rq->q->dma_pad_mask)) + last_sg->length += + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; } EXPORT_SYMBOL_GPL(ide_map_sg); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 
88cac92fc153..0a73230a8f16 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1030,13 +1030,21 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { + unsigned int pad_len = + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; + + last_sg->length += pad_len; + cmd->extra_len += pad_len; + } + if (need_drain) { sg_unmark_end(last_sg); last_sg = sg_next(last_sg); sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); sg_mark_end(last_sg); - rq->extra_len += sdev->dma_drain_len; + cmd->extra_len += sdev->dma_drain_len; count++; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8e4726bce498..f00bd4042295 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,8 +224,6 @@ struct request { unsigned short write_hint; unsigned short ioprio; - unsigned int extra_len; /* length of alignment and padding */ - enum mq_rq_state state; refcount_t ref; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 80ac89e47b47..f93c0b800790 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -142,6 +142,7 @@ struct scsi_cmnd { unsigned long state; /* Command completion state */ unsigned char tag; /* SCSI-II queued command tag */ + unsigned int extra_len; /* length of alignment and padding */ }; /* -- cgit v1.2.3 From 0512a75b98f847c2e9a4b664013424e603e202f7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 May 2020 17:55:47 +0900 Subject: block: Introduce REQ_OP_ZONE_APPEND Define REQ_OP_ZONE_APPEND to append-write sectors to a zone of a zoned block device. This is a no-merge write operation. A zone append write BIO must: * Target a zoned block device * Have a sector position indicating the start sector of the target zone * The target zone must be a sequential write zone * The BIO must not cross a zone boundary * The BIO size must not be split to ensure that a single range of LBAs is written with a single command. Implement these checks in generic_make_request_checks() using the helper function blk_check_zone_append(). To avoid write append BIO splitting, introduce the new max_zone_append_sectors queue limit attribute and ensure that a BIO size is always lower than this limit. Export this new limit through sysfs and check these limits in bio_full(). Also when a LLDD can't dispatch a request to a specific zone, it will return BLK_STS_ZONE_RESOURCE indicating this request needs to be delayed, e.g. because the zone it will be dispatched to is still write-locked. If this happens set the request aside in a local list to continue trying dispatching requests such as READ requests or a WRITE/ZONE_APPEND requests targetting other zones. This way we can still keep a high queue depth without starving other requests even if one request can't be served due to zone write-locking. Finally, make sure that the bio sector position indicates the actual write position as indicated by the device on completion. Signed-off-by: Keith Busch [ jth: added zone-append specific add_page and merge_page helpers ] Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bio.c | 62 ++++++++++++++++++++++++++++++++++++++++++++--- block/blk-core.c | 52 +++++++++++++++++++++++++++++++++++++++ block/blk-mq.c | 27 +++++++++++++++++++++ block/blk-settings.c | 31 ++++++++++++++++++++++++ block/blk-sysfs.c | 13 ++++++++++ drivers/scsi/scsi_lib.c | 1 + include/linux/blk_types.h | 14 +++++++++++ include/linux/blkdev.h | 11 +++++++++ 8 files changed, 207 insertions(+), 4 deletions(-) (limited to 'drivers/scsi/scsi_lib.c') diff --git a/block/bio.c b/block/bio.c index aad0a6dad4f9..3aa3c4ce2e5e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1025,6 +1025,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } +static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct request_queue *q = bio->bi_disk->queue; + unsigned int max_append_sectors = queue_max_zone_append_sectors(q); + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; + size_t offset; + + if (WARN_ON_ONCE(!max_append_sectors)) + return 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + bool same_page = false; + + len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_add_hw_page(q, bio, page, len, offset, + max_append_sectors, &same_page) != len) + return -EINVAL; + if (same_page) + put_page(page); + offset = 0; + } + + iov_iter_advance(iter, size); + return 0; +} + /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to @@ -1054,10 +1098,16 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return -EINVAL; do { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; + ret = __bio_iov_append_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (is_bvec) @@ -1460,6 +1510,10 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); + /* Zone append commands cannot be split */ + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return NULL; + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 409e1a6b73b0..cf5b2163edfe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -135,6 +135,7 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), + REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); + if (req_op(rq) == 
REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); @@ -887,6 +899,41 @@ out: return ret; } +/* + * Check write append to a zoned block device. + */ +static inline blk_status_t blk_check_zone_append(struct request_queue *q, + struct bio *bio) +{ + sector_t pos = bio->bi_iter.bi_sector; + int nr_sectors = bio_sectors(bio); + + /* Only applicable to zoned block devices */ + if (!blk_queue_is_zoned(q)) + return BLK_STS_NOTSUPP; + + /* The bio sector must point to the start of a sequential zone */ + if (pos & (blk_queue_zone_sectors(q) - 1) || + !blk_queue_zone_is_seq(q, pos)) + return BLK_STS_IOERR; + + /* + * Not allowed to cross zone boundaries. Otherwise, the BIO will be + * split and could result in non-contiguous sectors being written in + * different zones. + */ + if (nr_sectors > q->limits.chunk_sectors) + return BLK_STS_IOERR; + + /* Make sure the BIO is small enough and will not get split */ + if (nr_sectors > q->limits.max_zone_append_sectors) + return BLK_STS_IOERR; + + bio->bi_opf |= REQ_NOMERGE; + + return BLK_STS_OK; +} + static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { @@ -959,6 +1006,11 @@ generic_make_request_checks(struct bio *bio) if (!q->limits.max_write_same_sectors) goto not_supported; break; + case REQ_OP_ZONE_APPEND: + status = blk_check_zone_append(q, bio); + if (status != BLK_STS_OK) + goto end_io; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: diff --git a/block/blk-mq.c b/block/blk-mq.c index d82cefb0474f..9ee695bdf873 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1183,6 +1183,19 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1195,6 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; bool no_budget_avail = false; + LIST_HEAD(zone_list); if (list_empty(list)) return false; @@ -1256,6 +1270,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_handle_dev_resource(rq, list); break; + } else if (ret == BLK_STS_ZONE_RESOURCE) { + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. 
+ */ + blk_mq_handle_zone_resource(rq, &zone_list); + if (list_empty(list)) + break; + continue; } if (unlikely(ret != BLK_STS_OK)) { @@ -1267,6 +1291,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, queued++; } while (!list_empty(list)); + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); + hctx->dispatched[queued_to_index(queued)]++; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 2ab1967b9716..9a2c23cd9700 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; + lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_zone_append_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -221,6 +223,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); +/** + * blk_queue_max_zone_append_sectors - set max sectors for a single zone append + * @q: the request queue for the device + * @max_zone_append_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors) +{ + unsigned int max_sectors; + + if (WARN_ON(!blk_queue_is_zoned(q))) + return; + + max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); + + /* + * Signal eventual driver bugs resulting in the max_zone_append sectors limit + * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, + * or the max_hw_sectors limit not set. 
+ */ + WARN_ON(!max_sectors); + + q->limits.max_zone_append_sectors = max_sectors; +} +EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); + t->max_zone_append_sectors = min(t->max_zone_append_sectors, + b->max_zone_append_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index fca9b158f4a0..02643e149d5e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) +{ + unsigned long long max_sectors = q->limits.max_zone_append_sectors; + + return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = { .show = queue_write_zeroes_max_show, }; +static struct queue_sysfs_entry queue_zone_append_max_entry = { + .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, + .show = queue_zone_append_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = 0644 }, .show = queue_show_nonrot, @@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = { &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, + &queue_zone_append_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0a73230a8f16..82ad0244b3d0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1706,6 +1706,7 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: + case BLK_STS_ZONE_RESOURCE: if (atomic_read(&sdev->device_busy) || scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 90895d594e64..b90dca1fa430 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -63,6 +63,18 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) +/* + * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone + * related resources are unavailable, but the driver can guarantee the queue + * will be rerun in the future once the resources become available again. + * + * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references + * a zone specific resource and IO to a different zone on the same device could + * still be served. Examples of that are zones that are write-locked, but a read + * to the same zone could be served. 
+ */ +#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -296,6 +308,8 @@ enum req_opf { REQ_OP_ZONE_CLOSE = 11, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = 12, + /* write data at the current zone write pointer */ + REQ_OP_ZONE_APPEND = 13, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d736acf7f564..5647c78bb876 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,6 +332,7 @@ struct queue_limits { unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -750,6 +751,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1084,6 +1088,8 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q, extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); +extern void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); @@ -1301,6 +1307,11 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } +static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +{ + return q->limits.max_zone_append_sectors; +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; -- cgit v1.2.3
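
As a minimal sketch (not taken from the patches above) of how a submitter might use the new REQ_OP_ZONE_APPEND operation and the completion convention the last commit describes: it assumes a zoned block device and the bio API as it existed around these patches, the names my_ctx, zone_append_end_io() and submit_zone_append() are invented for illustration, and error handling is abbreviated.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

struct my_ctx {				/* hypothetical per-I/O context */
	struct completion done;
	sector_t written_sector;
	blk_status_t status;
};

static void zone_append_end_io(struct bio *bio)
{
	struct my_ctx *ctx = bio->bi_private;

	ctx->status = bio->bi_status;
	/*
	 * On success the block layer has replaced bi_sector with the LBA
	 * the device actually wrote to (rq->__sector in req_bio_endio()).
	 */
	ctx->written_sector = bio->bi_iter.bi_sector;
	complete(&ctx->done);
	bio_put(bio);
}

static int submit_zone_append(struct block_device *bdev, sector_t zone_start,
			      struct page *page, unsigned int len,
			      struct my_ctx *ctx)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	if (!bio)
		return -ENOMEM;

	bio_set_dev(bio, bdev);
	/* Must point at the start of a sequential zone; the device picks the LBA. */
	bio->bi_iter.bi_sector = zone_start;
	bio->bi_opf = REQ_OP_ZONE_APPEND;
	bio->bi_end_io = zone_append_end_io;
	bio->bi_private = ctx;

	/*
	 * Zone append BIOs are never split, so the payload has to fit within
	 * queue_max_zone_append_sectors(bdev_get_queue(bdev)) << 9 bytes.
	 */
	if (bio_add_page(bio, page, len, 0) != len) {
		bio_put(bio);
		return -EIO;
	}

	init_completion(&ctx->done);
	submit_bio(bio);
	return 0;
}

The caller then waits with wait_for_completion(&ctx->done) and, if ctx->status is BLK_STS_OK, finds the sector the data landed at in ctx->written_sector; reading it back in the completion handler is the only way to learn the write position, since the device rather than the submitter chooses the LBA within the zone.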