Diffstat (limited to 'drivers')
-rw-r--r--  drivers/block/loop.c | 106
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 2
-rw-r--r--  drivers/block/null_blk/main.c | 177
-rw-r--r--  drivers/block/null_blk/null_blk.h | 6
-rw-r--r--  drivers/block/null_blk/zoned.c | 20
-rw-r--r--  drivers/block/rnbd/rnbd-clt.c | 2
-rw-r--r--  drivers/block/sunvdc.c | 2
-rw-r--r--  drivers/block/ublk_drv.c | 115
-rw-r--r--  drivers/block/virtio_blk.c | 2
-rw-r--r--  drivers/block/xen-blkfront.c | 2
-rw-r--r--  drivers/md/dm-integrity.c | 12
-rw-r--r--  drivers/md/dm-table.c | 7
-rw-r--r--  drivers/md/md-bitmap.c | 14
-rw-r--r--  drivers/md/md-cluster.c | 18
-rw-r--r--  drivers/md/md-cluster.h | 6
-rw-r--r--  drivers/md/md-linear.c | 15
-rw-r--r--  drivers/md/md.c | 356
-rw-r--r--  drivers/md/md.h | 62
-rw-r--r--  drivers/md/raid0.c | 18
-rw-r--r--  drivers/md/raid1-10.c | 6
-rw-r--r--  drivers/md/raid1.c | 56
-rw-r--r--  drivers/md/raid10.c | 66
-rw-r--r--  drivers/md/raid5.c | 91
-rw-r--r--  drivers/memstick/core/ms_block.c | 2
-rw-r--r--  drivers/memstick/core/mspro_block.c | 4
-rw-r--r--  drivers/mmc/core/queue.c | 2
-rw-r--r--  drivers/mmc/host/cqhci-crypto.c | 8
-rw-r--r--  drivers/mmc/host/sdhci-msm.c | 3
-rw-r--r--  drivers/mtd/ubi/block.c | 2
-rw-r--r--  drivers/nvdimm/badrange.c | 2
-rw-r--r--  drivers/nvdimm/nd.h | 2
-rw-r--r--  drivers/nvdimm/pfn_devs.c | 7
-rw-r--r--  drivers/nvdimm/pmem.c | 2
-rw-r--r--  drivers/nvme/common/Kconfig | 1
-rw-r--r--  drivers/nvme/common/auth.c | 337
-rw-r--r--  drivers/nvme/common/keyring.c | 65
-rw-r--r--  drivers/nvme/host/Kconfig | 2
-rw-r--r--  drivers/nvme/host/apple.c | 2
-rw-r--r--  drivers/nvme/host/auth.c | 115
-rw-r--r--  drivers/nvme/host/core.c | 3
-rw-r--r--  drivers/nvme/host/fabrics.c | 34
-rw-r--r--  drivers/nvme/host/fabrics.h | 3
-rw-r--r--  drivers/nvme/host/fc.c | 6
-rw-r--r--  drivers/nvme/host/multipath.c | 138
-rw-r--r--  drivers/nvme/host/nvme.h | 22
-rw-r--r--  drivers/nvme/host/pci.c | 5
-rw-r--r--  drivers/nvme/host/rdma.c | 3
-rw-r--r--  drivers/nvme/host/sysfs.c | 24
-rw-r--r--  drivers/nvme/host/tcp.c | 67
-rw-r--r--  drivers/nvme/host/zns.c | 10
-rw-r--r--  drivers/nvme/target/auth.c | 72
-rw-r--r--  drivers/nvme/target/core.c | 9
-rw-r--r--  drivers/nvme/target/debugfs.c | 27
-rw-r--r--  drivers/nvme/target/fabrics-cmd-auth.c | 60
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 25
-rw-r--r--  drivers/nvme/target/fc.c | 14
-rw-r--r--  drivers/nvme/target/loop.c | 2
-rw-r--r--  drivers/nvme/target/nvmet.h | 40
-rw-r--r--  drivers/nvme/target/pci-epf.c | 12
-rw-r--r--  drivers/nvme/target/tcp.c | 32
-rw-r--r--  drivers/scsi/scsi_lib.c | 2
-rw-r--r--  drivers/target/target_core_iblock.c | 12
-rw-r--r--  drivers/ufs/core/ufshcd-crypto.c | 7
-rw-r--r--  drivers/ufs/host/ufs-exynos.c | 3
-rw-r--r--  drivers/ufs/host/ufs-qcom.c | 3
65 files changed, 1718 insertions, 634 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c05fe27a96b6..674527d770dc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -45,8 +45,6 @@ enum {
Lo_deleting,
};
-struct loop_func_table;
-
struct loop_device {
int lo_number;
loff_t lo_offset;
@@ -54,7 +52,8 @@ struct loop_device {
int lo_flags;
char lo_file_name[LO_NAME_SIZE];
- struct file * lo_backing_file;
+ struct file *lo_backing_file;
+ unsigned int lo_min_dio_size;
struct block_device *lo_device;
gfp_t old_gfp_mask;
@@ -169,29 +168,14 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
* of backing device, and the logical block size of loop is bigger than that of
* the backing device.
*/
-static bool lo_bdev_can_use_dio(struct loop_device *lo,
- struct block_device *backing_bdev)
-{
- unsigned int sb_bsize = bdev_logical_block_size(backing_bdev);
-
- if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
- return false;
- if (lo->lo_offset & (sb_bsize - 1))
- return false;
- return true;
-}
-
static bool lo_can_use_dio(struct loop_device *lo)
{
- struct inode *inode = lo->lo_backing_file->f_mapping->host;
-
if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT))
return false;
-
- if (S_ISBLK(inode->i_mode))
- return lo_bdev_can_use_dio(lo, I_BDEV(inode));
- if (inode->i_sb->s_bdev)
- return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev);
+ if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size)
+ return false;
+ if (lo->lo_offset & (lo->lo_min_dio_size - 1))
+ return false;
return true;
}
@@ -205,20 +189,12 @@ static bool lo_can_use_dio(struct loop_device *lo)
*/
static inline void loop_update_dio(struct loop_device *lo)
{
- bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO;
-
lockdep_assert_held(&lo->lo_mutex);
WARN_ON_ONCE(lo->lo_state == Lo_bound &&
lo->lo_queue->mq_freeze_depth == 0);
- if (lo->lo_backing_file->f_flags & O_DIRECT)
- lo->lo_flags |= LO_FLAGS_DIRECT_IO;
if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo))
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
-
- /* flush dirty pages before starting to issue direct I/O */
- if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use)
- vfs_fsync(lo->lo_backing_file, 0);
}
/**
@@ -541,6 +517,28 @@ static void loop_reread_partitions(struct loop_device *lo)
__func__, lo->lo_number, lo->lo_file_name, rc);
}
+static unsigned int loop_query_min_dio_size(struct loop_device *lo)
+{
+ struct file *file = lo->lo_backing_file;
+ struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
+ struct kstat st;
+
+ /*
+ * Use the minimal dio alignment of the file system if provided.
+ */
+ if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN))
+ return st.dio_offset_align;
+
+ /*
+ * In a perfect world this wouldn't be needed, but as of Linux 6.13 only
+ * a handful of file systems support the STATX_DIOALIGN flag.
+ */
+ if (sb_bdev)
+ return bdev_logical_block_size(sb_bdev);
+ return SECTOR_SIZE;
+}
+
static inline int is_loop_device(struct file *file)
{
struct inode *i = file->f_mapping->host;
@@ -573,6 +571,17 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
return 0;
}
+static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
+{
+ lo->lo_backing_file = file;
+ lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
+ mapping_set_gfp_mask(file->f_mapping,
+ lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
+ if (lo->lo_backing_file->f_flags & O_DIRECT)
+ lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+ lo->lo_min_dio_size = loop_query_min_dio_size(lo);
+}
+
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
@@ -622,14 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
goto out_err;
+ /*
+ * We might switch to direct I/O mode for the loop device, write back
+ * all dirty data in the page cache now so that the individual I/O
+ * operations don't have to do that.
+ */
+ vfs_fsync(file, 0);
+
/* and ... switch */
disk_force_media_change(lo->lo_disk);
memflags = blk_mq_freeze_queue(lo->lo_queue);
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
- lo->lo_backing_file = file;
- lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
- mapping_set_gfp_mask(file->f_mapping,
- lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+ loop_assign_backing_file(lo, file);
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue, memflags);
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
@@ -971,12 +984,11 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
}
-static unsigned int loop_default_blocksize(struct loop_device *lo,
- struct block_device *backing_bdev)
+static unsigned int loop_default_blocksize(struct loop_device *lo)
{
- /* In case of direct I/O, match underlying block size */
- if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev)
- return bdev_logical_block_size(backing_bdev);
+ /* In case of direct I/O, match underlying minimum I/O size */
+ if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+ return lo->lo_min_dio_size;
return SECTOR_SIZE;
}
@@ -994,7 +1006,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
backing_bdev = inode->i_sb->s_bdev;
if (!bsize)
- bsize = loop_default_blocksize(lo, backing_bdev);
+ bsize = loop_default_blocksize(lo);
loop_get_discard_config(lo, &granularity, &max_discard_sectors);
@@ -1019,7 +1031,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
const struct loop_config *config)
{
struct file *file = fget(config->fd);
- struct address_space *mapping;
struct queue_limits lim;
int error;
loff_t size;
@@ -1055,8 +1066,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
if (error)
goto out_unlock;
- mapping = file->f_mapping;
-
if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) {
error = -EINVAL;
goto out_unlock;
@@ -1088,9 +1097,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
lo->lo_device = bdev;
- lo->lo_backing_file = file;
- lo->old_gfp_mask = mapping_gfp_mask(mapping);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+ loop_assign_backing_file(lo, file);
lim = queue_limits_start_update(lo->lo_queue);
loop_update_limits(lo, &lim, config->block_size);
@@ -1099,6 +1106,13 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
if (error)
goto out_unlock;
+ /*
+ * We might switch to direct I/O mode for the loop device, write back
+ * all dirty data in the page cache now so that the individual I/O
+ * operations don't have to do that.
+ */
+ vfs_fsync(file, 0);
+
loop_update_dio(lo);
loop_sysfs_init(lo);
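
For reference, the new loop_query_min_dio_size() helper prefers the filesystem-reported direct-I/O alignment and only falls back to the backing block device's logical block size. The following minimal userspace sketch (added here for illustration, not part of the patch; it assumes Linux 6.1+ and C library headers that expose the stx_dio_* fields) shows what the driver will see for a given backing file:

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

#ifndef STATX_DIOALIGN
#define STATX_DIOALIGN 0x00002000U	/* missing from older uapi headers */
#endif

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <backing-file>\n", argv[0]);
		return EXIT_FAILURE;
	}

	/* Ask only for the direct-I/O alignment information. */
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
		perror("statx");
		return EXIT_FAILURE;
	}

	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio offset align: %u, dio memory align: %u\n",
		       stx.stx_dio_offset_align, stx.stx_dio_mem_align);
	else
		printf("no STATX_DIOALIGN reported; loop falls back to the "
		       "backing bdev logical block size (or SECTOR_SIZE)\n");
	return EXIT_SUCCESS;
}
```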
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 95361099a2dc..0d619df03fa9 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2056,7 +2056,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
unsigned int nents;
/* Map the scatter list for DMA access */
- nents = blk_rq_map_sg(hctx->queue, rq, command->sg);
+ nents = blk_rq_map_sg(rq, command->sg);
nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
prefetch(&port->flags);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index da1ecbf988b8..3bb9cee0a9b5 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -473,6 +473,8 @@ NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
NULLB_DEVICE_ATTR(fua, bool, NULL);
NULLB_DEVICE_ATTR(rotational, bool, NULL);
+NULLB_DEVICE_ATTR(badblocks_once, bool, NULL);
+NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -559,14 +561,14 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item,
goto out;
/* enable badblocks */
cmpxchg(&t_dev->badblocks.shift, -1, 0);
- if (buf[0] == '+')
- ret = badblocks_set(&t_dev->badblocks, start,
- end - start + 1, 1);
- else
- ret = badblocks_clear(&t_dev->badblocks, start,
- end - start + 1);
- if (ret == 0)
+ if (buf[0] == '+') {
+ if (badblocks_set(&t_dev->badblocks, start,
+ end - start + 1, 1))
+ ret = count;
+ } else if (badblocks_clear(&t_dev->badblocks, start,
+ end - start + 1)) {
ret = count;
+ }
out:
kfree(orig);
return ret;
@@ -592,41 +594,43 @@ static ssize_t nullb_device_zone_offline_store(struct config_item *item,
CONFIGFS_ATTR_WO(nullb_device_, zone_offline);
static struct configfs_attribute *nullb_device_attrs[] = {
- &nullb_device_attr_size,
+ &nullb_device_attr_badblocks,
+ &nullb_device_attr_badblocks_once,
+ &nullb_device_attr_badblocks_partial_io,
+ &nullb_device_attr_blocking,
+ &nullb_device_attr_blocksize,
+ &nullb_device_attr_cache_size,
&nullb_device_attr_completion_nsec,
- &nullb_device_attr_submit_queues,
- &nullb_device_attr_poll_queues,
+ &nullb_device_attr_discard,
+ &nullb_device_attr_fua,
&nullb_device_attr_home_node,
- &nullb_device_attr_queue_mode,
- &nullb_device_attr_blocksize,
- &nullb_device_attr_max_sectors,
- &nullb_device_attr_irqmode,
&nullb_device_attr_hw_queue_depth,
&nullb_device_attr_index,
- &nullb_device_attr_blocking,
- &nullb_device_attr_use_per_node_hctx,
- &nullb_device_attr_power,
- &nullb_device_attr_memory_backed,
- &nullb_device_attr_discard,
+ &nullb_device_attr_irqmode,
+ &nullb_device_attr_max_sectors,
&nullb_device_attr_mbps,
- &nullb_device_attr_cache_size,
- &nullb_device_attr_badblocks,
- &nullb_device_attr_zoned,
- &nullb_device_attr_zone_size,
+ &nullb_device_attr_memory_backed,
+ &nullb_device_attr_no_sched,
+ &nullb_device_attr_poll_queues,
+ &nullb_device_attr_power,
+ &nullb_device_attr_queue_mode,
+ &nullb_device_attr_rotational,
+ &nullb_device_attr_shared_tag_bitmap,
+ &nullb_device_attr_shared_tags,
+ &nullb_device_attr_size,
+ &nullb_device_attr_submit_queues,
+ &nullb_device_attr_use_per_node_hctx,
+ &nullb_device_attr_virt_boundary,
+ &nullb_device_attr_zone_append_max_sectors,
&nullb_device_attr_zone_capacity,
- &nullb_device_attr_zone_nr_conv,
- &nullb_device_attr_zone_max_open,
+ &nullb_device_attr_zone_full,
&nullb_device_attr_zone_max_active,
- &nullb_device_attr_zone_append_max_sectors,
- &nullb_device_attr_zone_readonly,
+ &nullb_device_attr_zone_max_open,
+ &nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_offline,
- &nullb_device_attr_zone_full,
- &nullb_device_attr_virt_boundary,
- &nullb_device_attr_no_sched,
- &nullb_device_attr_shared_tags,
- &nullb_device_attr_shared_tag_bitmap,
- &nullb_device_attr_fua,
- &nullb_device_attr_rotational,
+ &nullb_device_attr_zone_readonly,
+ &nullb_device_attr_zone_size,
+ &nullb_device_attr_zoned,
NULL,
};
@@ -704,16 +708,28 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
- return snprintf(page, PAGE_SIZE,
- "badblocks,blocking,blocksize,cache_size,fua,"
- "completion_nsec,discard,home_node,hw_queue_depth,"
- "irqmode,max_sectors,mbps,memory_backed,no_sched,"
- "poll_queues,power,queue_mode,shared_tag_bitmap,"
- "shared_tags,size,submit_queues,use_per_node_hctx,"
- "virt_boundary,zoned,zone_capacity,zone_max_active,"
- "zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
- "zone_size,zone_append_max_sectors,zone_full,"
- "rotational\n");
+
+ struct configfs_attribute **entry;
+ char delimiter = ',';
+ size_t left = PAGE_SIZE;
+ size_t written = 0;
+ int ret;
+
+ for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) {
+ if (!*(entry + 1))
+ delimiter = '\n';
+ ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name,
+ delimiter);
+ if (ret >= left) {
+ WARN_ONCE(1, "Too many null_blk features to print\n");
+ memzero_explicit(page, PAGE_SIZE);
+ return -ENOBUFS;
+ }
+ left -= ret;
+ written += ret;
+ }
+
+ return written;
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -1249,25 +1265,37 @@ static int null_transfer(struct nullb *nullb, struct page *page,
return err;
}
-static blk_status_t null_handle_rq(struct nullb_cmd *cmd)
+/*
+ * Transfer data for the given request. The transfer size is capped with the
+ * nr_sectors argument.
+ */
+static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
+ sector_t nr_sectors)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
int err = 0;
unsigned int len;
sector_t sector = blk_rq_pos(rq);
+ unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
+ unsigned int transferred_bytes = 0;
struct req_iterator iter;
struct bio_vec bvec;
spin_lock_irq(&nullb->lock);
rq_for_each_segment(bvec, rq, iter) {
len = bvec.bv_len;
+ if (transferred_bytes + len > max_bytes)
+ len = max_bytes - transferred_bytes;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(req_op(rq)), sector,
rq->cmd_flags & REQ_FUA);
if (err)
break;
sector += len >> SECTOR_SHIFT;
+ transferred_bytes += len;
+ if (transferred_bytes >= max_bytes)
+ break;
}
spin_unlock_irq(&nullb->lock);
@@ -1295,31 +1323,51 @@ static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
return sts;
}
-static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
- sector_t sector,
- sector_t nr_sectors)
+/*
+ * Check if the command should fail due to the badblocks. If so, return
+ * BLK_STS_IOERR and set the number of partial I/O sectors to be written or
+ * read, which may be less than the requested number of sectors.
+ *
+ * @cmd: The command to handle.
+ * @sector: The start sector for I/O.
+ * @nr_sectors: Specifies number of sectors to write or read, and returns the
+ * number of sectors to be written or read.
+ */
+blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector,
+ unsigned int *nr_sectors)
{
struct badblocks *bb = &cmd->nq->dev->badblocks;
- sector_t first_bad;
- int bad_sectors;
+ struct nullb_device *dev = cmd->nq->dev;
+ unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT;
+ sector_t first_bad, bad_sectors;
+ unsigned int partial_io_sectors = 0;
- if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
- return BLK_STS_IOERR;
+ if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors))
+ return BLK_STS_OK;
- return BLK_STS_OK;
+ if (cmd->nq->dev->badblocks_once)
+ badblocks_clear(bb, first_bad, bad_sectors);
+
+ if (cmd->nq->dev->badblocks_partial_io) {
+ if (!IS_ALIGNED(first_bad, block_sectors))
+ first_bad = ALIGN_DOWN(first_bad, block_sectors);
+ if (sector < first_bad)
+ partial_io_sectors = first_bad - sector;
+ }
+ *nr_sectors = partial_io_sectors;
+
+ return BLK_STS_IOERR;
}
-static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
- enum req_op op,
- sector_t sector,
- sector_t nr_sectors)
+blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op,
+ sector_t sector, sector_t nr_sectors)
{
struct nullb_device *dev = cmd->nq->dev;
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
- return null_handle_rq(cmd);
+ return null_handle_data_transfer(cmd, nr_sectors);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@ -1366,18 +1414,19 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors)
{
struct nullb_device *dev = cmd->nq->dev;
+ blk_status_t badblocks_ret = BLK_STS_OK;
blk_status_t ret;
- if (dev->badblocks.shift != -1) {
- ret = null_handle_badblocks(cmd, sector, nr_sectors);
+ if (dev->badblocks.shift != -1)
+ badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors);
+
+ if (dev->memory_backed && nr_sectors) {
+ ret = null_handle_memory_backed(cmd, op, sector, nr_sectors);
if (ret != BLK_STS_OK)
return ret;
}
- if (dev->memory_backed)
- return null_handle_memory_backed(cmd, op, sector, nr_sectors);
-
- return BLK_STS_OK;
+ return badblocks_ret;
}
static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
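
The badblocks_partial_io path above only transfers the sectors in front of the first bad block, with that bad sector first rounded down to a device-block boundary. A standalone sketch of the same calculation (illustrative only, not taken from the patch):

```c
#include <stdio.h>

/* Round down to a block boundary; equivalent to the kernel's ALIGN_DOWN here. */
#define ALIGN_DOWN(x, a)	((x) - ((x) % (a)))

static unsigned int partial_io_sectors(unsigned long long sector,
				       unsigned long long first_bad,
				       unsigned int block_sectors)
{
	first_bad = ALIGN_DOWN(first_bad, block_sectors);
	return sector < first_bad ? first_bad - sector : 0;
}

int main(void)
{
	/* 4 KiB blocks on a 512-byte-sector device => 8 sectors per block */
	printf("%u\n", partial_io_sectors(0, 13, 8));	/* 8: one full good block */
	printf("%u\n", partial_io_sectors(8, 13, 8));	/* 0: I/O starts in the bad block */
	return 0;
}
```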
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 6f9fe6171087..7bb6128dbaaf 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -63,6 +63,8 @@ struct nullb_device {
unsigned long flags; /* device flags */
unsigned int curr_cache;
struct badblocks badblocks;
+ bool badblocks_once;
+ bool badblocks_partial_io;
unsigned int nr_zones;
unsigned int nr_zones_imp_open;
@@ -131,6 +133,10 @@ blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector,
sector_t nr_sectors);
blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors);
+blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector,
+ unsigned int *nr_sectors);
+blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op,
+ sector_t sector, sector_t nr_sectors);
#ifdef CONFIG_BLK_DEV_ZONED
int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 0d5f9bf95229..4e5728f45989 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -353,6 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
struct nullb_device *dev = cmd->nq->dev;
unsigned int zno = null_zone_no(dev, sector);
struct nullb_zone *zone = &dev->zones[zno];
+ blk_status_t badblocks_ret = BLK_STS_OK;
blk_status_t ret;
trace_nullb_zone_op(cmd, zno, zone->cond);
@@ -412,9 +413,20 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
zone->cond = BLK_ZONE_COND_IMP_OPEN;
}
- ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
- if (ret != BLK_STS_OK)
- goto unlock_zone;
+ if (dev->badblocks.shift != -1) {
+ badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors);
+ if (badblocks_ret != BLK_STS_OK && !nr_sectors) {
+ ret = badblocks_ret;
+ goto unlock_zone;
+ }
+ }
+
+ if (dev->memory_backed) {
+ ret = null_handle_memory_backed(cmd, REQ_OP_WRITE, sector,
+ nr_sectors);
+ if (ret != BLK_STS_OK)
+ goto unlock_zone;
+ }
zone->wp += nr_sectors;
if (zone->wp == zone->start + zone->capacity) {
@@ -429,7 +441,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
zone->cond = BLK_ZONE_COND_FULL;
}
- ret = BLK_STS_OK;
+ ret = badblocks_ret;
unlock_zone:
null_unlock_zone(dev, zone);
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 82467ecde7ec..15627417f12e 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1010,7 +1010,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
* See queue limits.
*/
if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES))
- sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl);
+ sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl);
if (sg_cnt == 0)
sg_mark_end(&iu->sgt.sgl[0]);
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 282f81616a78..2b33fb5b949b 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -485,7 +485,7 @@ static int __send_request(struct request *req)
}
sg_init_table(sg, port->ring_cookies);
- nsg = blk_rq_map_sg(req->q, req, sg);
+ nsg = blk_rq_map_sg(req, sg);
len = 0;
for (i = 0; i < nsg; i++)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6d7b26ab434f..c060da409ed8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -73,11 +73,10 @@
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL \
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
- UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
+ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
+ UBLK_PARAM_TYPE_DMA_ALIGN)
struct ublk_rq_data {
- struct llist_node node;
-
struct kref ref;
};
@@ -144,8 +143,6 @@ struct ublk_queue {
struct task_struct *ubq_daemon;
char *io_cmd_buf;
- struct llist_head io_cmds;
-
unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz;
bool force_abort;
@@ -494,15 +491,17 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
static DEFINE_MUTEX(ublk_ctl_mutex);
+
+#define UBLK_MAX_UBLKS UBLK_MINORS
+
/*
- * Max ublk devices allowed to add
+ * Max unprivileged ublk devices allowed to add
*
* It can be extended to one per-user limit in future or even controlled
* by cgroup.
*/
-#define UBLK_MAX_UBLKS UBLK_MINORS
-static unsigned int ublks_max = 64;
-static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
+static unsigned int unprivileged_ublks_max = 64;
+static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
static struct miscdevice ublk_misc;
@@ -573,6 +572,16 @@ static int ublk_validate_params(const struct ublk_device *ub)
else if (ublk_dev_is_zoned(ub))
return -EINVAL;
+ if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
+ const struct ublk_param_dma_align *p = &ub->params.dma;
+
+ if (p->alignment >= PAGE_SIZE)
+ return -EINVAL;
+
+ if (!is_power_of_2(p->alignment + 1))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -1100,7 +1109,7 @@ static void ublk_complete_rq(struct kref *ref)
}
/*
- * Since __ublk_rq_task_work always fails requests immediately during
+ * Since ublk_rq_task_work_cb always fails requests immediately during
* exiting, __ublk_fail_req() is only called from abort context during
* exiting. So lock is unnecessary.
*
@@ -1146,11 +1155,14 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
-static inline void __ublk_rq_task_work(struct request *req,
- unsigned issue_flags)
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- int tag = req->tag;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+ int tag = pdu->tag;
+ struct request *req = blk_mq_tag_to_rq(
+ ubq->dev->tag_set.tags[ubq->q_id], tag);
struct ublk_io *io = &ubq->ios[tag];
unsigned int mapped_bytes;
@@ -1225,34 +1237,11 @@ static inline void __ublk_rq_task_work(struct request *req,
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
-static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
- unsigned issue_flags)
-{
- struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
- struct ublk_rq_data *data, *tmp;
-
- io_cmds = llist_reverse_order(io_cmds);
- llist_for_each_entry_safe(data, tmp, io_cmds, node)
- __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
-}
-
-static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
-{
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- struct ublk_queue *ubq = pdu->ubq;
-
- ublk_forward_io_cmds(ubq, issue_flags);
-}
-
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ struct ublk_io *io = &ubq->ios[rq->tag];
- if (llist_add(&data->node, &ubq->io_cmds)) {
- struct ublk_io *io = &ubq->ios[rq->tag];
-
- io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
- }
+ io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
}
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
@@ -1445,7 +1434,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
struct request *rq;
/*
- * Either we fail the request or ublk_rq_task_work_fn
+ * Either we fail the request or ublk_rq_task_work_cb
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
@@ -1911,10 +1900,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return -EIOCBQUEUED;
out:
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
__func__, cmd_op, tag, ret, io->flags);
- return -EIOCBQUEUED;
+ return ret;
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
@@ -1970,7 +1958,10 @@ static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
- ublk_ch_uring_cmd_local(cmd, issue_flags);
+ int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
+
+ if (ret != -EIOCBQUEUED)
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
}
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -2235,7 +2226,8 @@ static int ublk_add_chdev(struct ublk_device *ub)
if (ret)
goto fail;
- ublks_added++;
+ if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
+ unprivileged_ublks_added++;
return 0;
fail:
put_device(dev);
@@ -2264,11 +2256,16 @@ static int ublk_add_tag_set(struct ublk_device *ub)
static void ublk_remove(struct ublk_device *ub)
{
+ bool unprivileged;
+
ublk_stop_dev(ub);
cancel_work_sync(&ub->nosrv_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
+ unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
ublk_put_device(ub);
- ublks_added--;
+
+ if (unprivileged)
+ unprivileged_ublks_added--;
}
static struct ublk_device *ublk_get_device_from_id(int idx)
@@ -2343,6 +2340,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
lim.features |= BLK_FEAT_ROTATIONAL;
+ if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
+ lim.dma_alignment = ub->params.dma.alignment;
+
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
@@ -2530,7 +2530,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
return ret;
ret = -EACCES;
- if (ublks_added >= ublks_max)
+ if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
+ unprivileged_ublks_added >= unprivileged_ublks_max)
goto out_unlock;
ret = -ENOMEM;
@@ -3101,10 +3102,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
if (ub)
ublk_put_device(ub);
out:
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
- return -EIOCBQUEUED;
+ return ret;
}
static const struct file_operations ublk_ctl_fops = {
@@ -3168,23 +3168,26 @@ static void __exit ublk_exit(void)
module_init(ublk_init);
module_exit(ublk_exit);
-static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
+static int ublk_set_max_unprivileged_ublks(const char *buf,
+ const struct kernel_param *kp)
{
return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
}
-static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
+static int ublk_get_max_unprivileged_ublks(char *buf,
+ const struct kernel_param *kp)
{
- return sysfs_emit(buf, "%u\n", ublks_max);
+ return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
}
-static const struct kernel_param_ops ublk_max_ublks_ops = {
- .set = ublk_set_max_ublks,
- .get = ublk_get_max_ublks,
+static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
+ .set = ublk_set_max_unprivileged_ublks,
+ .get = ublk_get_max_unprivileged_ublks,
};
-module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
-MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
+module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
+ &unprivileged_ublks_max, 0644);
+MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add (default: 64)");
MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_DESCRIPTION("Userspace block device");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 91cde76a4b3e..7cffea01d868 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -226,7 +226,7 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
if (unlikely(err))
return -ENOMEM;
- return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
+ return blk_rq_map_sg(req, vbr->sg_table.sgl);
}
static void virtblk_cleanup_cmd(struct request *req)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index edcd08a9dcef..5babe575c288 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -751,7 +751,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
id = blkif_ring_get_request(rinfo, req, &final_ring_req);
ring_req = &rinfo->shadow[id].req;
- num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
+ num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg);
num_grant = 0;
/* Calculate the number of grant used */
for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index c45464b6576a..c8c1a00e7d80 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4811,23 +4811,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
if (r) {
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recalc_bios, 1);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
}
ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0ef5203387b2..453803f1edf5 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1081,15 +1081,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
__alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->io_bs, pool_size))
- goto out_free_pools;
init_bs:
if (bioset_init(&pools->bs, pool_size, front_pad, 0))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->bs, pool_size))
- goto out_free_pools;
t->mempools = pools;
return 0;
@@ -1250,6 +1244,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
profile->max_dun_bytes_supported = UINT_MAX;
memset(profile->modes_supported, 0xFF,
sizeof(profile->modes_supported));
+ profile->key_types_supported = ~0;
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 23c09d22fcdb..44ec9b17cfd3 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -29,8 +29,10 @@
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>
+
#include "md.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
@@ -426,8 +428,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
- unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
- PAGE_SHIFT;
+ unsigned long num_pages = bitmap->storage.file_pages;
+ unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
@@ -436,7 +438,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
/* we compare length (page numbers), not page offset. */
- if ((pg_index - store->sb_index) == store->file_pages - 1) {
+ if ((pg_index - store->sb_index) == num_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -942,7 +944,7 @@ out:
bmname(bitmap), err);
goto out_no_sb;
}
- bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+ bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev);
goto re_read;
}
@@ -2021,7 +2023,7 @@ static void md_bitmap_free(void *data)
sysfs_put(bitmap->sysfs_can_clear);
if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
- bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+ bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev))
md_cluster_stop(bitmap->mddev);
/* Shouldn't be needed - but just in case.... */
@@ -2229,7 +2231,7 @@ static int bitmap_load(struct mddev *mddev)
mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
- md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
+ mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
/* Clear out old bitmap info first: Either there is none, or we
* are resuming after someone else has possibly changed things,
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6595f89becdb..94221d964d4f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
struct dlm_lock_resource *bm_lockres;
char str[64];
- if (i == md_cluster_ops->slot_number(mddev))
+ if (i == slot_number(mddev))
continue;
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
@@ -1216,7 +1216,7 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
- int current_slot = md_cluster_ops->slot_number(mddev);
+ int current_slot = slot_number(mddev);
int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
struct md_bitmap_stats stats;
@@ -1612,7 +1612,14 @@ out:
return err;
}
-static const struct md_cluster_operations cluster_ops = {
+static struct md_cluster_operations cluster_ops = {
+ .head = {
+ .type = MD_CLUSTER,
+ .id = ID_CLUSTER,
+ .name = "cluster",
+ .owner = THIS_MODULE,
+ },
+
.join = join,
.leave = leave,
.slot_number = slot_number,
@@ -1642,13 +1649,12 @@ static int __init cluster_init(void)
{
pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
pr_info("Registering Cluster MD functions\n");
- register_md_cluster_operations(&cluster_ops, THIS_MODULE);
- return 0;
+ return register_md_submodule(&cluster_ops.head);
}
static void cluster_exit(void)
{
- unregister_md_cluster_operations();
+ unregister_md_submodule(&cluster_ops.head);
}
module_init(cluster_init);
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 470bf18ffde5..8fb06d853173 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -10,6 +10,8 @@ struct mddev;
struct md_rdev;
struct md_cluster_operations {
+ struct md_submodule_head head;
+
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
@@ -35,4 +37,8 @@ struct md_cluster_operations {
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
};
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
+extern void md_reload_sb(struct mddev *mddev, int raid_disk);
+
#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 369aed044b40..5d9b08115375 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -5,7 +5,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -320,9 +319,13 @@ static void linear_quiesce(struct mddev *mddev, int state)
}
static struct md_personality linear_personality = {
- .name = "linear",
- .level = LEVEL_LINEAR,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_LINEAR,
+ .name = "linear",
+ .owner = THIS_MODULE,
+ },
+
.make_request = linear_make_request,
.run = linear_run,
.free = linear_free,
@@ -335,12 +338,12 @@ static struct md_personality linear_personality = {
static int __init linear_init(void)
{
- return register_md_personality(&linear_personality);
+ return register_md_submodule(&linear_personality.head);
}
static void linear_exit(void)
{
- unregister_md_personality(&linear_personality);
+ unregister_md_submodule(&linear_personality.head);
}
module_init(linear_init);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 30b3dbbce2d2..438e71e45c16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = {
[ACTION_IDLE] = "idle",
};
-/* pers_list is a list of registered personalities protected by pers_lock. */
-static LIST_HEAD(pers_list);
-static DEFINE_SPINLOCK(pers_lock);
+static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
-const struct md_cluster_operations *md_cluster_ops;
-EXPORT_SYMBOL(md_cluster_ops);
-static struct module *md_cluster_mod;
-
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -629,6 +623,12 @@ static void __mddev_put(struct mddev *mddev)
queue_work(md_misc_wq, &mddev->del_work);
}
+static void mddev_put_locked(struct mddev *mddev)
+{
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
@@ -888,16 +888,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
-static struct md_personality *find_pers(int level, char *clevel)
+static struct md_personality *get_pers(int level, char *clevel)
{
- struct md_personality *pers;
- list_for_each_entry(pers, &pers_list, list) {
- if (level != LEVEL_NONE && pers->level == level)
- return pers;
- if (strcmp(pers->name, clevel)==0)
- return pers;
+ struct md_personality *ret = NULL;
+ struct md_submodule_head *head;
+ unsigned long i;
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_PERSONALITY)
+ continue;
+ if ((level != LEVEL_NONE && head->id == level) ||
+ !strcmp(head->name, clevel)) {
+ if (try_module_get(head->owner))
+ ret = (void *)head;
+ break;
+ }
}
- return NULL;
+ xa_unlock(&md_submodule);
+
+ if (!ret) {
+ if (level != LEVEL_NONE)
+ pr_warn("md: personality for level %d is not loaded!\n",
+ level);
+ else
+ pr_warn("md: personality for level %s is not loaded!\n",
+ clevel);
+ }
+
+ return ret;
+}
+
+static void put_pers(struct md_personality *pers)
+{
+ module_put(pers->head.owner);
}
/* return the offset of the super block in 512byte sectors */
@@ -1180,7 +1204,7 @@ int md_check_no_bitmap(struct mddev *mddev)
if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
return 0;
pr_warn("%s: bitmaps are not supported for %s\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
@@ -1748,7 +1772,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (badblocks_set(&rdev->badblocks, sector, count, 1))
+ if (!badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -2359,19 +2383,6 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* shouldn't register */
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
- (mddev->level != 1 && mddev->level != 10 &&
- bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
- /*
- * No need to handle the failure of bioset_integrity_create,
- * because the function is called by md_run() -> pers->run(),
- * md_run calls bioset_exit -> bioset_integrity_free in case
- * of failure case.
- */
- pr_err("md: failed to create integrity pool for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
return 0;
}
EXPORT_SYMBOL(md_integrity_register);
@@ -2639,11 +2650,11 @@ repeat:
force_change = 1;
if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
nospares = 1;
- ret = md_cluster_ops->metadata_update_start(mddev);
+ ret = mddev->cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
- md_cluster_ops->metadata_update_cancel(mddev);
+ mddev->cluster_ops->metadata_update_cancel(mddev);
bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_CLEAN));
@@ -2783,7 +2794,7 @@ rewrite:
/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
if (mddev_is_clustered(mddev) && ret == 0)
- md_cluster_ops->metadata_update_finish(mddev);
+ mddev->cluster_ops->metadata_update_finish(mddev);
if (mddev->in_sync != sync_req ||
!bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
@@ -2942,7 +2953,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
else {
err = 0;
if (mddev_is_clustered(mddev))
- err = md_cluster_ops->remove_disk(mddev, rdev);
+ err = mddev->cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev);
@@ -3052,7 +3063,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* by this node eventually
*/
if (!mddev_is_clustered(rdev->mddev) ||
- (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+ (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
clear_bit(Faulty, &rdev->flags);
err = add_bound_rdev(rdev);
}
@@ -3860,7 +3871,7 @@ level_show(struct mddev *mddev, char *page)
spin_lock(&mddev->lock);
p = mddev->pers;
if (p)
- ret = sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->head.name);
else if (mddev->clevel[0])
ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
@@ -3917,7 +3928,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
if (!mddev->pers->quiesce) {
pr_warn("md: %s: %s does not support online personality change\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
goto out_unlock;
}
@@ -3931,24 +3942,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (request_module("md-%s", clevel) != 0)
request_module("md-level-%s", clevel);
- spin_lock(&pers_lock);
- pers = find_pers(level, clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- pr_warn("md: personality %s not loaded\n", clevel);
+ pers = get_pers(level, clevel);
+ if (!pers) {
rv = -EINVAL;
goto out_unlock;
}
- spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
- module_put(pers->owner);
+ put_pers(pers);
rv = len;
goto out_unlock;
}
if (!pers->takeover) {
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
rv = -EINVAL;
@@ -3969,7 +3976,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s would not accept array\n",
mdname(mddev), clevel);
rv = PTR_ERR(priv);
@@ -3984,7 +3991,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
@@ -4026,7 +4033,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->to_remove = &md_redundancy_group;
}
- module_put(oldpers->owner);
+ put_pers(oldpers);
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
@@ -5584,7 +5591,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
{
- if (mddev->pers == NULL || (mddev->pers->level != 1))
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
return sprintf(page, "n/a\n");
else
return sprintf(page, "%d\n", mddev->serialize_policy);
@@ -5610,7 +5617,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
- if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
pr_err("md: serialize_policy is only effective for raid1\n");
err = -EINVAL;
goto unlock;
@@ -6096,30 +6103,21 @@ int md_run(struct mddev *mddev)
goto exit_sync_set;
}
- spin_lock(&pers_lock);
- pers = find_pers(mddev->level, mddev->clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- if (mddev->level != LEVEL_NONE)
- pr_warn("md: personality for level %d is not loaded!\n",
- mddev->level);
- else
- pr_warn("md: personality for level %s is not loaded!\n",
- mddev->clevel);
+ pers = get_pers(mddev->level, mddev->clevel);
+ if (!pers) {
err = -EINVAL;
goto abort;
}
- spin_unlock(&pers_lock);
- if (mddev->level != pers->level) {
- mddev->level = pers->level;
- mddev->new_level = pers->level;
+ if (mddev->level != pers->head.id) {
+ mddev->level = pers->head.id;
+ mddev->new_level = pers->head.id;
}
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- module_put(pers->owner);
+ put_pers(pers);
err = -EINVAL;
goto abort;
}
@@ -6246,7 +6244,7 @@ bitmap_abort:
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
- module_put(pers->owner);
+ put_pers(pers);
mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
@@ -6467,7 +6465,7 @@ static void __md_stop(struct mddev *mddev)
mddev->private = NULL;
if (pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
- module_put(pers->owner);
+ put_pers(pers);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
bioset_exit(&mddev->bio_set);
@@ -6983,7 +6981,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
set_bit(Candidate, &rdev->flags);
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk(mddev, rdev);
+ err = mddev->cluster_ops->add_new_disk(mddev, rdev);
if (err) {
export_rdev(rdev, mddev);
return err;
@@ -7000,14 +6998,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
if (!err) {
- err = md_cluster_ops->new_disk_ack(mddev,
- err == 0);
+ err = mddev->cluster_ops->new_disk_ack(
+ mddev, err == 0);
if (err)
md_kick_rdev_from_array(rdev);
}
} else {
if (err)
- md_cluster_ops->add_new_disk_cancel(mddev);
+ mddev->cluster_ops->add_new_disk_cancel(mddev);
else
err = add_bound_rdev(rdev);
}
@@ -7087,10 +7085,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
goto busy;
kick_rdev:
- if (mddev_is_clustered(mddev)) {
- if (md_cluster_ops->remove_disk(mddev, rdev))
- goto busy;
- }
+ if (mddev_is_clustered(mddev) &&
+ mddev->cluster_ops->remove_disk(mddev, rdev))
+ goto busy;
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -7393,7 +7390,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
if (mddev_is_clustered(mddev))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
@@ -7441,6 +7438,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
return rv;
}
+static int get_cluster_ops(struct mddev *mddev)
+{
+ xa_lock(&md_submodule);
+ mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
+ if (mddev->cluster_ops &&
+ !try_module_get(mddev->cluster_ops->head.owner))
+ mddev->cluster_ops = NULL;
+ xa_unlock(&md_submodule);
+
+ return mddev->cluster_ops == NULL ? -ENOENT : 0;
+}
+
+static void put_cluster_ops(struct mddev *mddev)
+{
+ if (!mddev->cluster_ops)
+ return;
+
+ mddev->cluster_ops->leave(mddev);
+ module_put(mddev->cluster_ops->head.owner);
+ mddev->cluster_ops = NULL;
+}
+
/*
* update_array_info is used to change the configuration of an
* on-line array.
@@ -7549,16 +7568,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
- if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
+ if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
rv = -EPERM;
- md_cluster_ops->unlock_all_bitmaps(mddev);
+ mddev->cluster_ops->unlock_all_bitmaps(mddev);
goto err;
}
mddev->bitmap_info.nodes = 0;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
mddev->bitmap_ops->destroy(mddev);
@@ -7842,7 +7860,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
case CLUSTERED_DISK_NACK:
if (mddev_is_clustered(mddev))
- md_cluster_ops->new_disk_ack(mddev, false);
+ mddev->cluster_ops->new_disk_ack(mddev, false);
else
err = -EINVAL;
goto unlock;
@@ -8124,7 +8142,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
+ if (mddev->pers->head.id == ID_RAID0 ||
+ mddev->pers->head.id == ID_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
@@ -8162,14 +8181,17 @@ static void status_unused(struct seq_file *seq)
static void status_personalities(struct seq_file *seq)
{
- struct md_personality *pers;
+ struct md_submodule_head *head;
+ unsigned long i;
seq_puts(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
- spin_unlock(&pers_lock);
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head)
+ if (head->type == MD_PERSONALITY)
+ seq_printf(seq, "[%s] ", head->name);
+ xa_unlock(&md_submodule);
+
seq_puts(seq, "\n");
}
@@ -8392,7 +8414,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
- seq_printf(seq, " %s", mddev->pers->name);
+ seq_printf(seq, " %s", mddev->pers->head.name);
} else {
seq_printf(seq, "inactive");
}
@@ -8461,9 +8483,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
status_unused(seq);
- if (atomic_dec_and_test(&mddev->active))
- __mddev_put(mddev);
-
+ mddev_put_locked(mddev);
return 0;
}
@@ -8514,67 +8534,34 @@ static const struct proc_ops mdstat_proc_ops = {
.proc_poll = mdstat_poll,
};
-int register_md_personality(struct md_personality *p)
-{
- pr_debug("md: %s personality registered for level %d\n",
- p->name, p->level);
- spin_lock(&pers_lock);
- list_add_tail(&p->list, &pers_list);
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(register_md_personality);
-
-int unregister_md_personality(struct md_personality *p)
+int register_md_submodule(struct md_submodule_head *msh)
{
- pr_debug("md: %s personality unregistered\n", p->name);
- spin_lock(&pers_lock);
- list_del_init(&p->list);
- spin_unlock(&pers_lock);
- return 0;
+ return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
}
-EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL_GPL(register_md_submodule);
-int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module)
+void unregister_md_submodule(struct md_submodule_head *msh)
{
- int ret = 0;
- spin_lock(&pers_lock);
- if (md_cluster_ops != NULL)
- ret = -EALREADY;
- else {
- md_cluster_ops = ops;
- md_cluster_mod = module;
- }
- spin_unlock(&pers_lock);
- return ret;
+ xa_erase(&md_submodule, msh->id);
}
-EXPORT_SYMBOL(register_md_cluster_operations);
-
-int unregister_md_cluster_operations(void)
-{
- spin_lock(&pers_lock);
- md_cluster_ops = NULL;
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(unregister_md_cluster_operations);
+EXPORT_SYMBOL_GPL(unregister_md_submodule);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
- int ret;
- if (!md_cluster_ops)
+ int ret = get_cluster_ops(mddev);
+
+ if (ret) {
request_module("md-cluster");
- spin_lock(&pers_lock);
+ ret = get_cluster_ops(mddev);
+ }
+
/* ensure module won't be unloaded */
- if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ if (ret) {
pr_warn("can't find md-cluster module or get its reference.\n");
- spin_unlock(&pers_lock);
- return -ENOENT;
+ return ret;
}
- spin_unlock(&pers_lock);
- ret = md_cluster_ops->join(mddev, nodes);
+ ret = mddev->cluster_ops->join(mddev, nodes);
if (!ret)
mddev->safemode_delay = 0;
return ret;
@@ -8582,10 +8569,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
void md_cluster_stop(struct mddev *mddev)
{
- if (!md_cluster_ops)
- return;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
}
static int is_mddev_idle(struct mddev *mddev, int init)
@@ -8978,7 +8962,7 @@ void md_do_sync(struct md_thread *thread)
}
if (mddev_is_clustered(mddev)) {
- ret = md_cluster_ops->resync_start(mddev);
+ ret = mddev->cluster_ops->resync_start(mddev);
if (ret)
goto skip;
@@ -9005,7 +8989,7 @@ void md_do_sync(struct md_thread *thread)
*
*/
if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start_notify(mddev);
+ mddev->cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9460,6 +9444,13 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
return true;
}
+ /* Check if resync is in progress. */
+ if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
/*
* Remove any failed drives, then add spares if possible. Spares are
* also removed and re-added, to allow the personality to fail the
@@ -9476,13 +9467,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
return true;
}
- /* Check if recovery is in progress. */
- if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- return true;
- }
-
/* Delay to choose resync/check/repair in md_do_sync(). */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
return true;
@@ -9789,7 +9773,7 @@ void md_reap_sync_thread(struct mddev *mddev)
* call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
* clustered raid */
if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
- md_cluster_ops->resync_finish(mddev);
+ mddev->cluster_ops->resync_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -9797,13 +9781,13 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
/*
- * We call md_cluster_ops->update_size here because sync_size could
+ * We call mddev->cluster_ops->update_size here because sync_size could
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
* so it is time to update size across cluster.
*/
if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9841,12 +9825,11 @@ EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management */
-/* Returns 1 on success, 0 on failure */
-int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+/* Returns true on success, false on failure */
+bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
struct mddev *mddev = rdev->mddev;
- int rv;
/*
* Recording new badblocks for faulty rdev will force unnecessary
@@ -9856,50 +9839,51 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
* avoid it.
*/
if (test_bit(Faulty, &rdev->flags))
- return 1;
+ return true;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
- if (rv == 0) {
- /* Make sure they get written out promptly */
- if (test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
- sysfs_notify_dirent_safe(rdev->sysfs_state);
- set_mask_bits(&mddev->sb_flags, 0,
- BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
- md_wakeup_thread(rdev->mddev->thread);
- return 1;
- } else
- return 0;
+
+ if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
+ return false;
+
+ /* Make sure they get written out promptly */
+ if (test_bit(ExternalBbl, &rdev->flags))
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ set_mask_bits(&mddev->sb_flags, 0,
+ BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
+ md_wakeup_thread(rdev->mddev->thread);
+ return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
- int rv;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_clear(&rdev->badblocks, s, sectors);
- if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
+
+ if (!badblocks_clear(&rdev->badblocks, s, sectors))
+ return;
+
+ if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
- return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int need_delay = 0;
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -9911,8 +9895,8 @@ static int md_notify_reboot(struct notifier_block *this,
mddev_unlock(mddev);
}
need_delay = 1;
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
@@ -10029,7 +10013,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE) &&
- !md_cluster_ops->resync_status_get(mddev)) {
+ !mddev->cluster_ops->resync_status_get(mddev)) {
/*
* -1 to make raid1_add_disk() set conf->fullsync
* to 1. This could avoid skipping sync when the
@@ -10245,7 +10229,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@@ -10266,7 +10250,7 @@ static __exit void md_exit(void)
remove_proc_entry("mdstat", NULL);
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -10278,8 +10262,8 @@ static __exit void md_exit(void)
* the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
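The register_md_submodule()/unregister_md_submodule() pair introduced above replaces the old pers_list and md_cluster_ops globals with a single xarray keyed by enum md_submodule_id. Below is a minimal userspace sketch of that keyed-registry pattern; it uses a plain table as a stand-in for the kernel xarray and every name in it is hypothetical, so it only illustrates the idea, not the actual md implementation.

/* Illustrative stand-in for the xarray-backed submodule registry.
 * All names here are hypothetical; the kernel code uses xa_insert()/
 * xa_erase() on the md_submodule xarray keyed by msh->id.
 */
#include <errno.h>
#include <stdio.h>

enum sub_id { ID_RAID0 = 0, ID_RAID1 = 1, ID_RAID10 = 10, ID_MAX = 16 };

struct submodule_head {
	enum sub_id id;
	const char *name;
};

static struct submodule_head *registry[ID_MAX];

static int register_submodule(struct submodule_head *h)
{
	if (h->id >= ID_MAX)
		return -EINVAL;
	if (registry[h->id])	/* xa_insert() also fails on an occupied slot */
		return -EBUSY;
	registry[h->id] = h;
	return 0;
}

static void unregister_submodule(struct submodule_head *h)
{
	registry[h->id] = NULL;	/* mirrors xa_erase(&md_submodule, msh->id) */
}

int main(void)
{
	struct submodule_head raid1 = { .id = ID_RAID1, .name = "raid1" };

	if (!register_submodule(&raid1))
		printf("registered %s at id %d\n", raid1.name, raid1.id);
	unregister_submodule(&raid1);
	return 0;
}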
diff --git a/drivers/md/md.h b/drivers/md/md.h
index def808064ad8..1cf00a04bcdd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -18,11 +18,37 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/raid/md_u.h>
#include <trace/events/block.h>
-#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
+enum md_submodule_type {
+ MD_PERSONALITY = 0,
+ MD_CLUSTER,
+ MD_BITMAP, /* TODO */
+};
+
+enum md_submodule_id {
+ ID_LINEAR = LEVEL_LINEAR,
+ ID_RAID0 = 0,
+ ID_RAID1 = 1,
+ ID_RAID4 = 4,
+ ID_RAID5 = 5,
+ ID_RAID6 = 6,
+ ID_RAID10 = 10,
+ ID_CLUSTER,
+ ID_BITMAP, /* TODO */
+ ID_LLBITMAP, /* TODO */
+};
+
+struct md_submodule_head {
+ enum md_submodule_type type;
+ enum md_submodule_id id;
+ const char *name;
+ struct module *owner;
+};
+
/*
* These flags should really be called "NO_RETRY" rather than
* "FAILFAST" because they don't make any promise about time lapse,
@@ -266,8 +292,8 @@ enum flag_bits {
Nonrot, /* non-rotational device (SSD) */
};
-static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
@@ -284,16 +310,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
int sectors)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
}
-extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
-extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
+extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
+extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
struct md_cluster_info;
+struct md_cluster_operations;
/**
* enum mddev_flags - md device flags.
@@ -576,6 +603,7 @@ struct mddev {
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
+ struct md_cluster_operations *cluster_ops;
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */
@@ -699,10 +727,8 @@ static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
struct md_personality
{
- char *name;
- int level;
- struct list_head list;
- struct module *owner;
+ struct md_submodule_head head;
+
bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
/*
* start up works that do NOT require md_thread. tasks that
@@ -843,13 +869,9 @@ static inline void safe_put_page(struct page *p)
if (p) put_page(p);
}
-extern int register_md_personality(struct md_personality *p);
-extern int unregister_md_personality(struct md_personality *p);
-extern int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module);
-extern int unregister_md_cluster_operations(void);
-extern int md_setup_cluster(struct mddev *mddev, int nodes);
-extern void md_cluster_stop(struct mddev *mddev);
+int register_md_submodule(struct md_submodule_head *msh);
+void unregister_md_submodule(struct md_submodule_head *msh);
+
extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread),
struct mddev *mddev,
@@ -906,7 +928,6 @@ extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
-extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
extern void mddev_destroy_serial_pool(struct mddev *mddev,
@@ -928,7 +949,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
-extern const struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 70bcc3cdf2cd..d8f639f4ae12 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -809,9 +809,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce)
static struct md_personality raid0_personality=
{
- .name = "raid0",
- .level = 0,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID0,
+ .name = "raid0",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid0_make_request,
.run = raid0_run,
.free = raid0_free,
@@ -822,14 +826,14 @@ static struct md_personality raid0_personality=
.error_handler = raid0_error,
};
-static int __init raid0_init (void)
+static int __init raid0_init(void)
{
- return register_md_personality (&raid0_personality);
+ return register_md_submodule(&raid0_personality.head);
}
-static void raid0_exit (void)
+static void __exit raid0_exit(void)
{
- unregister_md_personality (&raid0_personality);
+ unregister_md_submodule(&raid0_personality.head);
}
module_init(raid0_init);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 4378d3250bd7..c7efd8aab675 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
sector_t this_sector, int *len)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
/* no bad block overlap */
if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
@@ -287,8 +287,8 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
return true;
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, READ, this_sector,
- this_sector + len))
+ mddev->cluster_ops->area_resyncing(mddev, READ, this_sector,
+ this_sector + len))
return true;
return false;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 10ea3af40991..0efc03cea24e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -36,6 +36,7 @@
#include "md.h"
#include "raid1.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
@@ -45,6 +46,7 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
+static void raid1_free(struct mddev *mddev, void *priv);
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
@@ -1315,8 +1317,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
int rdisk, error;
bool r1bio_existed = !!r1_bio;
@@ -1404,7 +1404,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_end_io = raid1_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1467,7 +1466,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))) {
DEFINE_WAIT(w);
@@ -1478,7 +1477,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))
break;
@@ -1537,7 +1536,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
@@ -1653,8 +1652,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_opf = bio_op(bio) |
- (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC));
if (test_bit(FailFast, &rdev->flags) &&
!test_bit(WriteMostly, &rdev->flags) &&
conf->raid_disks - mddev->degraded > 1)
@@ -2486,7 +2483,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
}
-static int narrow_write_error(struct r1bio *r1_bio, int i)
+static bool narrow_write_error(struct r1bio *r1_bio, int i)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -2507,10 +2504,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r1_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2886,7 +2883,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
/* may need to read from here */
sector_t first_bad = MaxSector;
- int bad_sectors;
+ sector_t bad_sectors;
if (is_badblock(rdev, sector_nr, good_sectors,
&first_bad, &bad_sectors)) {
@@ -3038,9 +3035,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
- conf->cluster_sync_low,
- conf->cluster_sync_high);
+ mddev->cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
}
/* For a user-requested sync, we read all readable devices and do a
@@ -3256,8 +3253,11 @@ static int raid1_run(struct mddev *mddev)
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
- if (ret)
+ if (ret) {
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return ret;
+ }
}
mddev->degraded = 0;
@@ -3271,6 +3271,8 @@ static int raid1_run(struct mddev *mddev)
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(mddev, &conf->thread);
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return -EINVAL;
}
@@ -3491,9 +3493,13 @@ static void *raid1_takeover(struct mddev *mddev)
static struct md_personality raid1_personality =
{
- .name = "raid1",
- .level = 1,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID1,
+ .name = "raid1",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid1_make_request,
.run = raid1_run,
.free = raid1_free,
@@ -3510,18 +3516,18 @@ static struct md_personality raid1_personality =
.takeover = raid1_takeover,
};
-static int __init raid_init(void)
+static int __init raid1_init(void)
{
- return register_md_personality(&raid1_personality);
+ return register_md_submodule(&raid1_personality.head);
}
-static void raid_exit(void)
+static void __exit raid1_exit(void)
{
- unregister_md_personality(&raid1_personality);
+ unregister_md_submodule(&raid1_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid1_init);
+module_exit(raid1_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 15b9ae5bf84d..846c5f29486e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -747,7 +748,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
for (slot = 0; slot < conf->copies ; slot++) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
sector_t dev_sector;
unsigned int pending;
bool nonrot;
@@ -1146,8 +1147,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
struct md_rdev *rdev;
char b[BDEVNAME_SIZE];
@@ -1228,7 +1227,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_end_io = raid10_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1247,10 +1245,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
struct bio *bio, bool replacement,
int n_copy)
{
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
- const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
- const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
unsigned long flags;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
@@ -1269,7 +1263,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_opf = op | do_sync | do_fua | do_atomic;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@@ -1355,9 +1348,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int error;
if ((mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector,
- bio_end_sector(bio)))) {
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)))) {
DEFINE_WAIT(w);
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
@@ -1367,7 +1360,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio)))
break;
schedule();
@@ -1438,7 +1431,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, dev_sector, max_sectors,
@@ -1631,11 +1624,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return -EAGAIN;
- if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
+ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio);
return 0;
}
- wait_barrier(conf, false);
/*
* Check reshape again to avoid reshape happens after checking
@@ -2786,7 +2778,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
}
-static int narrow_write_error(struct r10bio *r10_bio, int i)
+static bool narrow_write_error(struct r10bio *r10_bio, int i)
{
struct bio *bio = r10_bio->master_bio;
struct mddev *mddev = r10_bio->mddev;
@@ -2807,10 +2799,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r10_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -3413,7 +3405,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t from_addr, to_addr;
struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
if (!rdev ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -3609,7 +3601,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
sector_t first_bad, sector;
- int bad_sectors;
+ sector_t bad_sectors;
struct md_rdev *rdev;
if (r10_bio->devs[i].repl_bio)
@@ -3716,7 +3708,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -3749,7 +3741,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -4541,7 +4533,7 @@ static int raid10_start_reshape(struct mddev *mddev)
if (ret)
goto abort;
- ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
@@ -4832,7 +4824,7 @@ read_more:
conf->cluster_sync_low = sb_reshape_pos;
}
- md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -4977,7 +4969,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev)
struct r10conf *conf = mddev->private;
sector_t lo, hi;
- md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+ mddev->cluster_ops->resync_info_get(mddev, &lo, &hi);
if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
|| mddev->reshape_position == MaxSector)
conf->reshape_progress = mddev->reshape_position;
@@ -5123,9 +5115,13 @@ static void raid10_finish_reshape(struct mddev *mddev)
static struct md_personality raid10_personality =
{
- .name = "raid10",
- .level = 10,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID10,
+ .name = "raid10",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid10_make_request,
.run = raid10_run,
.free = raid10_free,
@@ -5145,18 +5141,18 @@ static struct md_personality raid10_personality =
.update_reshape_pos = raid10_update_reshape_pos,
};
-static int __init raid_init(void)
+static int __init raid10_init(void)
{
- return register_md_personality(&raid10_personality);
+ return register_md_submodule(&raid10_personality.head);
}
-static void raid_exit(void)
+static void __exit raid10_exit(void)
{
- unregister_md_personality(&raid10_personality);
+ unregister_md_submodule(&raid10_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid10_init);
+module_exit(raid10_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5c79429acc64..6389383166c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5858,6 +5858,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
struct r5conf *conf, sector_t logical_sector)
{
sector_t reshape_progress, reshape_safe;
+
+ if (likely(conf->reshape_progress == MaxSector))
+ return LOC_NO_RESHAPE;
/*
* Spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
@@ -5935,22 +5938,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
const int rw = bio_data_dir(bi);
enum stripe_result ret;
struct stripe_head *sh;
+ enum reshape_loc loc;
sector_t new_sector;
int previous = 0, flags = 0;
int seq, dd_idx;
seq = read_seqcount_begin(&conf->gen_lock);
-
- if (unlikely(conf->reshape_progress != MaxSector)) {
- enum reshape_loc loc = get_reshape_loc(mddev, conf,
- logical_sector);
- if (loc == LOC_INSIDE_RESHAPE) {
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
- }
- if (loc == LOC_AHEAD_OF_RESHAPE)
- previous = 1;
+ loc = get_reshape_loc(mddev, conf, logical_sector);
+ if (loc == LOC_INSIDE_RESHAPE) {
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
+ if (loc == LOC_AHEAD_OF_RESHAPE)
+ previous = 1;
new_sector = raid5_compute_sector(conf, logical_sector, previous,
&dd_idx, NULL);
@@ -6127,7 +6127,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
- (conf->reshape_progress != MaxSector) &&
get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
bio_wouldblock_error(bi);
if (rw == WRITE)
@@ -8954,9 +8953,13 @@ static void raid5_prepare_suspend(struct mddev *mddev)
static struct md_personality raid6_personality =
{
- .name = "raid6",
- .level = 6,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID6,
+ .name = "raid6",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -8980,9 +8983,13 @@ static struct md_personality raid6_personality =
};
static struct md_personality raid5_personality =
{
- .name = "raid5",
- .level = 5,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID5,
+ .name = "raid5",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9007,9 +9014,13 @@ static struct md_personality raid5_personality =
static struct md_personality raid4_personality =
{
- .name = "raid4",
- .level = 4,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID4,
+ .name = "raid4",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9045,21 +9056,39 @@ static int __init raid5_init(void)
"md/raid5:prepare",
raid456_cpu_up_prepare,
raid456_cpu_dead);
- if (ret) {
- destroy_workqueue(raid5_wq);
- return ret;
- }
- register_md_personality(&raid6_personality);
- register_md_personality(&raid5_personality);
- register_md_personality(&raid4_personality);
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = register_md_submodule(&raid6_personality.head);
+ if (ret)
+ goto err_cpuhp_remove;
+
+ ret = register_md_submodule(&raid5_personality.head);
+ if (ret)
+ goto err_unregister_raid6;
+
+ ret = register_md_submodule(&raid4_personality.head);
+ if (ret)
+ goto err_unregister_raid5;
+
return 0;
+
+err_unregister_raid5:
+ unregister_md_submodule(&raid5_personality.head);
+err_unregister_raid6:
+ unregister_md_submodule(&raid6_personality.head);
+err_cpuhp_remove:
+ cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
+err_destroy_wq:
+ destroy_workqueue(raid5_wq);
+ return ret;
}
-static void raid5_exit(void)
+static void __exit raid5_exit(void)
{
- unregister_md_personality(&raid6_personality);
- unregister_md_personality(&raid5_personality);
- unregister_md_personality(&raid4_personality);
+ unregister_md_submodule(&raid6_personality.head);
+ unregister_md_submodule(&raid5_personality.head);
+ unregister_md_submodule(&raid4_personality.head);
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
destroy_workqueue(raid5_wq);
}
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 5b617c1f6789..f4398383ae06 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1904,7 +1904,7 @@ static void msb_io_work(struct work_struct *work)
/* process the request */
dbg_verbose("IO: processing new request");
- blk_rq_map_sg(msb->queue, req, sg);
+ blk_rq_map_sg(req, sg);
lba = blk_rq_pos(req);
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 634d343b6bdb..c9853d887d28 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -627,9 +627,7 @@ static int mspro_block_issue_req(struct memstick_dev *card)
while (true) {
msb->current_page = 0;
msb->current_seg = 0;
- msb->seg_count = blk_rq_map_sg(msb->block_req->q,
- msb->block_req,
- msb->req_sg);
+ msb->seg_count = blk_rq_map_sg(msb->block_req, msb->req_sg);
if (!msb->seg_count) {
unsigned int bytes = blk_rq_cur_bytes(msb->block_req);
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index ab662f502fe7..3ba62f825b84 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -523,5 +523,5 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq)
{
struct request *req = mmc_queue_req_to_req(mqrq);
- return blk_rq_map_sg(mq->queue, req, mqrq->sg);
+ return blk_rq_map_sg(req, mqrq->sg);
}
diff --git a/drivers/mmc/host/cqhci-crypto.c b/drivers/mmc/host/cqhci-crypto.c
index cb8044093402..5a467098a0d6 100644
--- a/drivers/mmc/host/cqhci-crypto.c
+++ b/drivers/mmc/host/cqhci-crypto.c
@@ -84,11 +84,11 @@ static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile,
if (ccap_array[cap_idx].algorithm_id == CQHCI_CRYPTO_ALG_AES_XTS) {
/* In XTS mode, the blk_crypto_key's size is already doubled */
- memcpy(cfg.crypto_key, key->raw, key->size/2);
+ memcpy(cfg.crypto_key, key->bytes, key->size/2);
memcpy(cfg.crypto_key + CQHCI_CRYPTO_KEY_MAX_SIZE/2,
- key->raw + key->size/2, key->size/2);
+ key->bytes + key->size/2, key->size/2);
} else {
- memcpy(cfg.crypto_key, key->raw, key->size);
+ memcpy(cfg.crypto_key, key->bytes, key->size);
}
cqhci_crypto_program_key(cq_host, &cfg, slot);
@@ -204,6 +204,8 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
/* Unfortunately, CQHCI crypto only supports 32 DUN bits. */
profile->max_dun_bytes_supported = 4;
+ profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
+
/*
* Cache all the crypto capabilities and advertise the supported crypto
* modes and data unit sizes to the block layer.
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index e3d39311fdc7..3c383bce4928 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -1895,6 +1895,7 @@ static int sdhci_msm_ice_init(struct sdhci_msm_host *msm_host,
profile->ll_ops = sdhci_msm_crypto_ops;
profile->max_dun_bytes_supported = 4;
+ profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
profile->dev = dev;
/*
@@ -1968,7 +1969,7 @@ static int sdhci_msm_ice_keyslot_program(struct blk_crypto_profile *profile,
return qcom_ice_program_key(msm_host->ice,
QCOM_ICE_CRYPTO_ALG_AES_XTS,
QCOM_ICE_CRYPTO_KEY_SIZE_256,
- key->raw,
+ key->bytes,
key->crypto_cfg.data_unit_size / 512,
slot);
}
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 2836905f0152..39cc0a6a4d37 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -199,7 +199,7 @@ static blk_status_t ubiblock_read(struct request *req)
* and ubi_read_sg() will check that limit.
*/
ubi_sgl_init(&pdu->usgl);
- blk_rq_map_sg(req->q, req, pdu->usgl.sg);
+ blk_rq_map_sg(req, pdu->usgl.sg);
while (bytes_left) {
/*
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
index a002ea6fdd84..ee478ccde7c6 100644
--- a/drivers/nvdimm/badrange.c
+++ b/drivers/nvdimm/badrange.c
@@ -167,7 +167,7 @@ static void set_badblock(struct badblocks *bb, sector_t s, int num)
dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
- if (badblocks_set(bb, s, num, 1))
+ if (!badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
}
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 5ca06e9a2d29..cc5c8f3f81e8 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -673,7 +673,7 @@ static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
{
if (bb->count) {
sector_t first_bad;
- int num_bad;
+ sector_t num_bad;
return !!badblocks_check(bb, sector, len / 512, &first_bad,
&num_bad);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index cfdfe0eaa512..8f3e816e805d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -367,9 +367,10 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
struct nd_namespace_common *ndns = nd_pfn->ndns;
void *zero_page = page_address(ZERO_PAGE(0));
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
- int num_bad, meta_num, rc, bb_present;
+ int meta_num, rc, bb_present;
sector_t first_bad, meta_start;
struct nd_namespace_io *nsio;
+ sector_t num_bad;
if (nd_pfn->mode != PFN_MODE_PMEM)
return 0;
@@ -394,7 +395,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
bb_present = badblocks_check(&nd_region->bb, meta_start,
meta_num, &first_bad, &num_bad);
if (bb_present) {
- dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n",
+ dev_dbg(&nd_pfn->dev, "meta: %llx badblocks at %llx\n",
num_bad, first_bad);
nsoff = ALIGN_DOWN((nd_region->ndr_start
+ (first_bad << 9)) - nsio->res.start,
@@ -413,7 +414,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
}
if (rc) {
dev_err(&nd_pfn->dev,
- "error clearing %x badblocks at %llx\n",
+ "error clearing %llx badblocks at %llx\n",
num_bad, first_bad);
return rc;
}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d81faa9d89c9..43156e1576c9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -249,7 +249,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
struct badblocks *bb = &pmem->bb;
sector_t first_bad;
- int num_bad;
+ sector_t num_bad;
if (kaddr)
*kaddr = pmem->virt_addr + offset;
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
index 244432e0b73d..da963e4f3f1f 100644
--- a/drivers/nvme/common/Kconfig
+++ b/drivers/nvme/common/Kconfig
@@ -12,3 +12,4 @@ config NVME_AUTH
select CRYPTO_SHA512
select CRYPTO_DH
select CRYPTO_DH_RFC7919_GROUPS
+ select CRYPTO_HKDF
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 9b7126e1a19d..2c092ec8c0a9 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -11,9 +11,12 @@
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
+#include <crypto/hkdf.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
+#define HKDF_MAX_HASHLEN 64
+
static u32 nvme_dhchap_seqnum;
static DEFINE_MUTEX(nvme_dhchap_mutex);
@@ -471,5 +474,339 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
+/**
+ * nvme_auth_generate_psk - Generate a PSK for TLS
+ * @hmac_id: Hash function identifier
+ * @skey: Session key
+ * @skey_len: Length of @skey
+ * @c1: Value of challenge C1
+ * @c2: Value of challenge C2
+ * @hash_len: Hash length of the hash algorithm
+ * @ret_psk: Pointer to the resulting generated PSK
+ * @ret_len: Length of @ret_psk
+ *
+ * Generate a PSK for TLS as specified in NVMe base specification, section
+ * 8.13.5.9: Generated PSK for TLS
+ *
+ * The generated PSK for TLS shall be computed applying the HMAC function
+ * using the hash function H( ) selected by the HashID parameter in the
+ * DH-HMAC-CHAP_Challenge message with the session key KS as key to the
+ * concatenation of the two challenges C1 and C2 (i.e., generated
+ * PSK = HMAC(KS, C1 || C2)).
+ *
+ * Returns 0 on success with a valid generated PSK pointer in @ret_psk and
+ * the length of @ret_psk in @ret_len, or a negative error number otherwise.
+ */
+int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len,
+ u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len)
+{
+ struct crypto_shash *tfm;
+ SHASH_DESC_ON_STACK(shash, tfm);
+ u8 *psk;
+ const char *hmac_name;
+ int ret, psk_len;
+
+ if (!c1 || !c2)
+ return -EINVAL;
+
+ hmac_name = nvme_auth_hmac_name(hmac_id);
+ if (!hmac_name) {
+ pr_warn("%s: invalid hash algorithm %d\n",
+ __func__, hmac_id);
+ return -EINVAL;
+ }
+
+ tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ psk_len = crypto_shash_digestsize(tfm);
+ psk = kzalloc(psk_len, GFP_KERNEL);
+ if (!psk) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ shash->tfm = tfm;
+ ret = crypto_shash_setkey(tfm, skey, skey_len);
+ if (ret)
+ goto out_free_psk;
+
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out_free_psk;
+
+ ret = crypto_shash_update(shash, c1, hash_len);
+ if (ret)
+ goto out_free_psk;
+
+ ret = crypto_shash_update(shash, c2, hash_len);
+ if (ret)
+ goto out_free_psk;
+
+ ret = crypto_shash_final(shash, psk);
+ if (!ret) {
+ *ret_psk = psk;
+ *ret_len = psk_len;
+ }
+
+out_free_psk:
+ if (ret)
+ kfree_sensitive(psk);
+out_free_tfm:
+ crypto_free_shash(tfm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_psk);
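As the kernel-doc above states, the generated PSK is simply HMAC(KS, C1 || C2) with the hash selected by the negotiated HashID. The sketch below reproduces that computation in userspace with OpenSSL's one-shot HMAC() and SHA-256 over dummy buffers; it is only an illustration, not how the kernel computes it (the kernel path uses the crypto_shash calls shown above).

/* Userspace sketch of "generated PSK = HMAC(KS, C1 || C2)" with SHA-256. */
#include <stdio.h>
#include <string.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

int main(void)
{
	unsigned char ks[32] = { 0x01 };	/* session key KS (dummy) */
	unsigned char c1[32] = { 0x02 };	/* challenge C1 (dummy) */
	unsigned char c2[32] = { 0x03 };	/* challenge C2 (dummy) */
	unsigned char msg[sizeof(c1) + sizeof(c2)];
	unsigned char psk[EVP_MAX_MD_SIZE];
	unsigned int psk_len = 0;

	memcpy(msg, c1, sizeof(c1));		/* C1 || C2 */
	memcpy(msg + sizeof(c1), c2, sizeof(c2));

	if (!HMAC(EVP_sha256(), ks, sizeof(ks), msg, sizeof(msg), psk, &psk_len))
		return 1;

	for (unsigned int i = 0; i < psk_len; i++)
		printf("%02x", psk[i]);
	printf("\n");
	return 0;
}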
+
+/**
+ * nvme_auth_generate_digest - Generate TLS PSK digest
+ * @hmac_id: Hash function identifier
+ * @psk: Generated input PSK
+ * @psk_len: Length of @psk
+ * @subsysnqn: NQN of the subsystem
+ * @hostnqn: NQN of the host
+ * @ret_digest: Pointer to the returned digest
+ *
+ * Generate a TLS PSK digest as specified in TP8018 Section 3.6.1.3:
+ * TLS PSK and PSK identity Derivation
+ *
+ * The PSK digest shall be computed by encoding in Base64 (refer to RFC
+ * 4648) the result of the application of the HMAC function using the hash
+ * function specified in item 4 above (ie the hash function of the cipher
+ * suite associated with the PSK identity) with the PSK as HMAC key to the
+ * concatenation of:
+ * - the NQN of the host (i.e., NQNh) not including the null terminator;
+ * - a space character;
+ * - the NQN of the NVM subsystem (i.e., NQNc) not including the null
+ * terminator;
+ * - a space character; and
+ * - the seventeen ASCII characters "NVMe-over-Fabrics"
+ * (i.e., <PSK digest> = Base64(HMAC(PSK, NQNh || " " || NQNc || " " ||
+ * "NVMe-over-Fabrics"))).
+ * The length of the PSK digest depends on the hash function used to compute
+ * it as follows:
+ * - If the SHA-256 hash function is used, the resulting PSK digest is 44
+ * characters long; or
+ * - If the SHA-384 hash function is used, the resulting PSK digest is 64
+ * characters long.
+ *
+ * Returns 0 on success with a valid digest pointer in @ret_digest, or a
+ * negative error number on failure.
+ */
+int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len,
+ char *subsysnqn, char *hostnqn, u8 **ret_digest)
+{
+ struct crypto_shash *tfm;
+ SHASH_DESC_ON_STACK(shash, tfm);
+ u8 *digest, *enc;
+ const char *hmac_name;
+ size_t digest_len, hmac_len;
+ int ret;
+
+ if (WARN_ON(!subsysnqn || !hostnqn))
+ return -EINVAL;
+
+ hmac_name = nvme_auth_hmac_name(hmac_id);
+ if (!hmac_name) {
+ pr_warn("%s: invalid hash algorithm %d\n",
+ __func__, hmac_id);
+ return -EINVAL;
+ }
+
+ switch (nvme_auth_hmac_hash_len(hmac_id)) {
+ case 32:
+ hmac_len = 44;
+ break;
+ case 48:
+ hmac_len = 64;
+ break;
+ default:
+ pr_warn("%s: invalid hash algorithm '%s'\n",
+ __func__, hmac_name);
+ return -EINVAL;
+ }
+
+ enc = kzalloc(hmac_len + 1, GFP_KERNEL);
+ if (!enc)
+ return -ENOMEM;
+
+ tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out_free_enc;
+ }
+
+ digest_len = crypto_shash_digestsize(tfm);
+ digest = kzalloc(digest_len, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ shash->tfm = tfm;
+ ret = crypto_shash_setkey(tfm, psk, psk_len);
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn));
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_update(shash, " ", 1);
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn));
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18);
+ if (ret)
+ goto out_free_digest;
+
+ ret = crypto_shash_final(shash, digest);
+ if (ret)
+ goto out_free_digest;
+
+ ret = base64_encode(digest, digest_len, enc);
+ if (ret < hmac_len) {
+ ret = -ENOKEY;
+ goto out_free_digest;
+ }
+ *ret_digest = enc;
+ ret = 0;
+
+out_free_digest:
+ kfree_sensitive(digest);
+out_free_tfm:
+ crypto_free_shash(tfm);
+out_free_enc:
+ if (ret)
+ kfree_sensitive(enc);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_digest);
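As a quick check of the 44/64 character figures used in the switch above: Base64 emits 4 output characters per 3 input bytes, so a 32-byte SHA-256 HMAC encodes to 4 * ceil(32 / 3) = 44 characters (including one '=' pad) and a 48-byte SHA-384 HMAC to 4 * ceil(48 / 3) = 64 characters with no padding.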
+
+/**
+ * nvme_auth_derive_tls_psk - Derive TLS PSK
+ * @hmac_id: Hash function identifier
+ * @psk: generated input PSK
+ * @psk_len: size of @psk
+ * @psk_digest: TLS PSK digest
+ * @ret_psk: Pointer to the resulting TLS PSK
+ *
+ * Derive a TLS PSK as specified in TP8018 Section 3.6.1.3:
+ * TLS PSK and PSK identity Derivation
+ *
+ * The TLS PSK shall be derived as follows from an input PSK
+ * (i.e., either a retained PSK or a generated PSK) and a PSK
+ * identity using the HKDF-Extract and HKDF-Expand-Label operations
+ * (refer to RFC 5869 and RFC 8446) where the hash function is the
+ * one specified by the hash specifier of the PSK identity:
+ * 1. PRK = HKDF-Extract(0, Input PSK); and
+ * 2. TLS PSK = HKDF-Expand-Label(PRK, "nvme-tls-psk", PskIdentityContext, L),
+ * where PskIdentityContext is the hash identifier indicated in
+ * the PSK identity concatenated to a space character and to the
+ * Base64 PSK digest (i.e., "<hash> <PSK digest>") and L is the
+ * output size in bytes of the hash function (i.e., 32 for SHA-256
+ * and 48 for SHA-384).
+ *
+ * Returns 0 on success with a valid psk pointer in @ret_psk or a negative
+ * error number otherwise.
+ */
+int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
+ u8 *psk_digest, u8 **ret_psk)
+{
+ struct crypto_shash *hmac_tfm;
+ const char *hmac_name;
+ const char *psk_prefix = "tls13 nvme-tls-psk";
+ static const char default_salt[HKDF_MAX_HASHLEN];
+ size_t info_len, prk_len;
+ char *info;
+ unsigned char *prk, *tls_key;
+ int ret;
+
+ hmac_name = nvme_auth_hmac_name(hmac_id);
+ if (!hmac_name) {
+ pr_warn("%s: invalid hash algorithm %d\n",
+ __func__, hmac_id);
+ return -EINVAL;
+ }
+ if (hmac_id == NVME_AUTH_HASH_SHA512) {
+ pr_warn("%s: unsupported hash algorithm %s\n",
+ __func__, hmac_name);
+ return -EINVAL;
+ }
+
+ hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(hmac_tfm))
+ return PTR_ERR(hmac_tfm);
+
+ prk_len = crypto_shash_digestsize(hmac_tfm);
+ prk = kzalloc(prk_len, GFP_KERNEL);
+ if (!prk) {
+ ret = -ENOMEM;
+ goto out_free_shash;
+ }
+
+ if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) {
+ ret = -EINVAL;
+ goto out_free_prk;
+ }
+ ret = hkdf_extract(hmac_tfm, psk, psk_len,
+ default_salt, prk_len, prk);
+ if (ret)
+ goto out_free_prk;
+
+ ret = crypto_shash_setkey(hmac_tfm, prk, prk_len);
+ if (ret)
+ goto out_free_prk;
+
+ /*
+	 * 2 additional bytes for the length field from HKDF-Expand-Label,
+	 * 2 additional bytes for the HMAC ID, and one byte for the space
+ * separator.
+ */
+ info_len = strlen(psk_digest) + strlen(psk_prefix) + 5;
+ info = kzalloc(info_len + 1, GFP_KERNEL);
+ if (!info) {
+ ret = -ENOMEM;
+ goto out_free_prk;
+ }
+
+ put_unaligned_be16(psk_len, info);
+ memcpy(info + 2, psk_prefix, strlen(psk_prefix));
+ sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest);
+
+ tls_key = kzalloc(psk_len, GFP_KERNEL);
+ if (!tls_key) {
+ ret = -ENOMEM;
+ goto out_free_info;
+ }
+ ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len);
+ if (ret) {
+ kfree(tls_key);
+ goto out_free_info;
+ }
+ *ret_psk = tls_key;
+
+out_free_info:
+ kfree(info);
+out_free_prk:
+ kfree(prk);
+out_free_shash:
+ crypto_free_shash(hmac_tfm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk);
+
MODULE_DESCRIPTION("NVMe Authentication framework");
MODULE_LICENSE("GPL v2");
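For reference, the HKDF-Expand-Label info buffer assembled in nvme_auth_derive_tls_psk() above is a 2-byte big-endian output length, the label "tls13 nvme-tls-psk", and the "<hmac_id> <Base64 PSK digest>" context. The self-contained sketch below rebuilds that layout in userspace so the byte layout is easy to inspect; the digest is a dummy string and the hash-identifier value is an assumption, so this is not kernel code.

/* Sketch of the "info" layout used for HKDF-Expand-Label above:
 *   2-byte big-endian output length, the label "tls13 nvme-tls-psk",
 *   then "<hmac_id> <Base64 PSK digest>" separated by a space.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *psk_prefix = "tls13 nvme-tls-psk";
	const char *psk_digest = "dGhpcyBpcyBub3QgYSByZWFsIGRpZ2VzdA==";	/* dummy */
	uint16_t psk_len = 32;		/* SHA-256 output size */
	int hmac_id = 1;		/* NVME_AUTH_HASH_SHA256 (assumed value) */
	size_t info_len = strlen(psk_digest) + strlen(psk_prefix) + 5;
	unsigned char *info = calloc(1, info_len + 1);

	if (!info)
		return 1;

	info[0] = psk_len >> 8;		/* put_unaligned_be16(psk_len, info) */
	info[1] = psk_len & 0xff;
	memcpy(info + 2, psk_prefix, strlen(psk_prefix));
	sprintf((char *)info + 2 + strlen(psk_prefix), "%02d %s",
		hmac_id, psk_digest);

	for (size_t i = 0; i < info_len; i++)
		printf("%02x ", info[i]);
	printf("\n");
	free(info);
	return 0;
}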
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index ed5167f942d8..32d16c53133b 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -5,7 +5,6 @@
#include <linux/module.h>
#include <linux/seq_file.h>
-#include <linux/key.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/nvme.h>
@@ -124,6 +123,70 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
return key_ref_to_ptr(keyref);
}
+/**
+ * nvme_tls_psk_refresh - Refresh TLS PSK
+ * @keyring: Keyring holding the TLS PSK
+ * @hostnqn: Host NQN to use
+ * @subnqn: Subsystem NQN to use
+ * @hmac_id: Hash function identifier
+ * @data: TLS PSK key material
+ * @data_len: Length of @data
+ * @digest: TLS PSK digest
+ *
+ * Refresh a generated version 1 TLS PSK with the identity generated
+ * from @hmac_id, @hostnqn, @subnqn, and @digest in the keyring given
+ * by @keyring.
+ *
+ * Returns the updated key on success or an error pointer otherwise.
+ */
+struct key *nvme_tls_psk_refresh(struct key *keyring,
+ const char *hostnqn, const char *subnqn, u8 hmac_id,
+ u8 *data, size_t data_len, const char *digest)
+{
+ key_perm_t keyperm =
+ KEY_POS_SEARCH | KEY_POS_VIEW | KEY_POS_READ |
+ KEY_POS_WRITE | KEY_POS_LINK | KEY_POS_SETATTR |
+ KEY_USR_SEARCH | KEY_USR_VIEW | KEY_USR_READ;
+ char *identity;
+ key_ref_t keyref;
+ key_serial_t keyring_id;
+ struct key *key;
+
+ if (!hostnqn || !subnqn || !data || !data_len)
+ return ERR_PTR(-EINVAL);
+
+ identity = kasprintf(GFP_KERNEL, "NVMe1G%02d %s %s %s",
+ hmac_id, hostnqn, subnqn, digest);
+ if (!identity)
+ return ERR_PTR(-ENOMEM);
+
+ if (!keyring)
+ keyring = nvme_keyring;
+ keyring_id = key_serial(keyring);
+ pr_debug("keyring %x refresh tls psk '%s'\n",
+ keyring_id, identity);
+ keyref = key_create_or_update(make_key_ref(keyring, true),
+ "psk", identity, data, data_len,
+ keyperm, KEY_ALLOC_NOT_IN_QUOTA |
+ KEY_ALLOC_BUILT_IN |
+ KEY_ALLOC_BYPASS_RESTRICTION);
+ if (IS_ERR(keyref)) {
+ pr_debug("refresh tls psk '%s' failed, error %ld\n",
+ identity, PTR_ERR(keyref));
+ kfree(identity);
+ return ERR_PTR(-ENOKEY);
+ }
+ kfree(identity);
+ /*
+ * Set the default timeout to 1 hour
+ * as suggested in TP8018.
+ */
+ key = key_ref_to_ptr(keyref);
+ key_set_timeout(key, 3600);
+ return key;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_psk_refresh);
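The identity built by kasprintf() above follows the TP8018 form for generated (version 1) PSKs, "NVMe1G<hh> <hostnqn> <subnqn> <digest>". A tiny sketch with placeholder NQNs and digest shows what such an identity string ends up looking like:

/* Builds an example generated-PSK identity in the form used above.
 * The NQNs and digest are placeholders, not values from a real fabric.
 */
#include <stdio.h>

int main(void)
{
	int hmac_id = 1;	/* SHA-256 (assumed NVME_AUTH_HASH_SHA256 value) */
	const char *hostnqn =
		"nqn.2014-08.org.nvmexpress:uuid:0a59a3bb-0000-0000-0000-000000000001";
	const char *subnqn = "nqn.2024-01.io.example:subsys1";
	const char *digest = "dGhpcyBpcyBub3QgYSByZWFsIGRpZ2VzdA==";

	printf("NVMe1G%02d %s %s %s\n", hmac_id, hostnqn, subnqn, digest);
	return 0;
}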
+
/*
* NVMe PSK priority list
*
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 486afe598184..10e453b2436e 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -109,7 +109,7 @@ config NVME_HOST_AUTH
bool "NVMe over Fabrics In-Band Authentication in host side"
depends on NVME_CORE
select NVME_AUTH
- select NVME_KEYRING if NVME_TCP_TLS
+ select NVME_KEYRING
help
This provides support for NVMe over Fabrics In-Band Authentication in
host side.
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 8971aca41e63..57e863ecde58 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -525,7 +525,7 @@ static blk_status_t apple_nvme_map_data(struct apple_nvme *anv,
if (!iod->sg)
return BLK_STS_RESOURCE;
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
- iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+ iod->nents = blk_rq_map_sg(req, iod->sg);
if (!iod->nents)
goto out_free_sg;
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 5ea0e21709da..6115fef74c1e 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -12,6 +12,7 @@
#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>
+#include <linux/nvme-keyring.h>
#define CHAP_BUF_SIZE 4096
static struct kmem_cache *nvme_chap_buf_cache;
@@ -131,7 +132,13 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
data->auth_type = NVME_AUTH_COMMON_MESSAGES;
data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
data->t_id = cpu_to_le16(chap->transaction);
- data->sc_c = 0; /* No secure channel concatenation */
+ if (ctrl->opts->concat && chap->qid == 0) {
+ if (ctrl->opts->tls_key)
+ data->sc_c = NVME_AUTH_SECP_REPLACETLSPSK;
+ else
+ data->sc_c = NVME_AUTH_SECP_NEWTLSPSK;
+ } else
+ data->sc_c = NVME_AUTH_SECP_NOSC;
data->napd = 1;
data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
data->auth_protocol[0].dhchap.halen = 3;
@@ -311,8 +318,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
data->hl = chap->hash_len;
data->dhvlen = cpu_to_le16(chap->host_key_len);
memcpy(data->rval, chap->response, chap->hash_len);
- if (ctrl->ctrl_key) {
+ if (ctrl->ctrl_key)
chap->bi_directional = true;
+ if (ctrl->ctrl_key || ctrl->opts->concat) {
get_random_bytes(chap->c2, chap->hash_len);
data->cvalid = 1;
memcpy(data->rval + chap->hash_len, chap->c2,
@@ -322,7 +330,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
} else {
memset(chap->c2, 0, chap->hash_len);
}
- chap->s2 = nvme_auth_get_seqnum();
+ if (ctrl->opts->concat)
+ chap->s2 = 0;
+ else
+ chap->s2 = nvme_auth_get_seqnum();
data->seqnum = cpu_to_le32(chap->s2);
if (chap->host_key_len) {
dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
@@ -677,6 +688,92 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
crypto_free_kpp(chap->dh_tfm);
}
+void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl)
+{
+ dev_dbg(ctrl->device, "Wipe generated TLS PSK %08x\n",
+ key_serial(ctrl->opts->tls_key));
+ key_revoke(ctrl->opts->tls_key);
+ key_put(ctrl->opts->tls_key);
+ ctrl->opts->tls_key = NULL;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key);
+
+static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ u8 *psk, *digest, *tls_psk;
+ struct key *tls_key;
+ size_t psk_len;
+ int ret = 0;
+
+ if (!chap->sess_key) {
+ dev_warn(ctrl->device,
+ "%s: qid %d no session key negotiated\n",
+ __func__, chap->qid);
+ return -ENOKEY;
+ }
+
+ if (chap->qid) {
+ dev_warn(ctrl->device,
+ "qid %d: secure concatenation not supported on I/O queues\n",
+ chap->qid);
+ return -EINVAL;
+ }
+ ret = nvme_auth_generate_psk(chap->hash_id, chap->sess_key,
+ chap->sess_key_len,
+ chap->c1, chap->c2,
+ chap->hash_len, &psk, &psk_len);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "%s: qid %d failed to generate PSK, error %d\n",
+ __func__, chap->qid, ret);
+ return ret;
+ }
+ dev_dbg(ctrl->device,
+ "%s: generated psk %*ph\n", __func__, (int)psk_len, psk);
+
+ ret = nvme_auth_generate_digest(chap->hash_id, psk, psk_len,
+ ctrl->opts->subsysnqn,
+ ctrl->opts->host->nqn, &digest);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "%s: qid %d failed to generate digest, error %d\n",
+ __func__, chap->qid, ret);
+ goto out_free_psk;
+	}
+ dev_dbg(ctrl->device, "%s: generated digest %s\n",
+ __func__, digest);
+ ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len,
+ digest, &tls_psk);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "%s: qid %d failed to derive TLS psk, error %d\n",
+ __func__, chap->qid, ret);
+ goto out_free_digest;
+	}
+
+ tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring,
+ ctrl->opts->host->nqn,
+ ctrl->opts->subsysnqn, chap->hash_id,
+ tls_psk, psk_len, digest);
+ if (IS_ERR(tls_key)) {
+ ret = PTR_ERR(tls_key);
+ dev_warn(ctrl->device,
+ "%s: qid %d failed to insert generated key, error %d\n",
+ __func__, chap->qid, ret);
+ tls_key = NULL;
+ }
+ kfree_sensitive(tls_psk);
+ if (ctrl->opts->tls_key)
+ nvme_auth_revoke_tls_key(ctrl);
+ ctrl->opts->tls_key = tls_key;
+out_free_digest:
+ kfree_sensitive(digest);
+out_free_psk:
+ kfree_sensitive(psk);
+ return ret;
+}
+
static void nvme_queue_auth_work(struct work_struct *work)
{
struct nvme_dhchap_queue_context *chap =
@@ -833,6 +930,13 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
if (!ret) {
chap->error = 0;
+ if (ctrl->opts->concat &&
+ (ret = nvme_auth_secure_concat(ctrl, chap))) {
+ dev_warn(ctrl->device,
+ "%s: qid %d failed to enable secure concatenation\n",
+ __func__, chap->qid);
+ chap->error = ret;
+ }
return;
}
@@ -912,6 +1016,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
"qid 0: authentication failed\n");
return;
}
+ /*
+ * Only run authentication on the admin queue for secure concatenation.
+ */
+ if (ctrl->opts->concat)
+ return;
for (q = 1; q < ctrl->queue_count; q++) {
ret = nvme_auth_negotiate(ctrl, q);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8359d0aa0e44..777db89fdaa7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4018,6 +4018,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (!nvme_ns_head_multipath(ns->head))
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
+
+ nvme_mpath_remove_sysfs_link(ns);
+
del_gendisk(ns->disk);
mutex_lock(&ns->ctrl->namespaces_lock);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 432efcbf9e2f..93e9041b9657 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -472,8 +472,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
result = le32_to_cpu(res.u32);
ctrl->cntlid = result & 0xFFFF;
if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) {
- /* Secure concatenation is not implemented */
- if (result & NVME_CONNECT_AUTHREQ_ASCR) {
+ /* Check for secure concatenation */
+ if ((result & NVME_CONNECT_AUTHREQ_ASCR) &&
+ !ctrl->opts->concat) {
dev_warn(ctrl->device,
"qid 0: secure concatenation is not supported\n");
ret = -EOPNOTSUPP;
@@ -550,7 +551,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
/* Secure concatenation is not implemented */
if (result & NVME_CONNECT_AUTHREQ_ASCR) {
dev_warn(ctrl->device,
- "qid 0: secure concatenation is not supported\n");
+ "qid %d: secure concatenation is not supported\n", qid);
ret = -EOPNOTSUPP;
goto out_free_data;
}
@@ -706,6 +707,7 @@ static const match_table_t opt_tokens = {
#endif
#ifdef CONFIG_NVME_TCP_TLS
{ NVMF_OPT_TLS, "tls" },
+ { NVMF_OPT_CONCAT, "concat" },
#endif
{ NVMF_OPT_ERR, NULL }
};
@@ -735,6 +737,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->tls = false;
opts->tls_key = NULL;
opts->keyring = NULL;
+ opts->concat = false;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -1053,6 +1056,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->tls = true;
break;
+ case NVMF_OPT_CONCAT:
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->concat = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -1079,6 +1090,23 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n",
opts->fast_io_fail_tmo, ctrl_loss_tmo);
}
+ if (opts->concat) {
+ if (opts->tls) {
+ pr_err("Secure concatenation over TLS is not supported\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (opts->tls_key) {
+ pr_err("Cannot specify a TLS key for secure concatenation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!opts->dhchap_secret) {
+ pr_err("Need to enable DH-CHAP for secure concatenation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
opts->host = nvmf_host_add(hostnqn, &hostid);
if (IS_ERR(opts->host)) {
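Note the constraints enforced above: concat requires a dhchap_secret and cannot be combined with tls or an explicit tls_key. The sketch below shows roughly how userspace hands such an option string to this parser by writing it to /dev/nvme-fabrics (which is what nvme-cli does under the hood); the NQNs, address, and secret are placeholders and error handling is trimmed.

/* Rough sketch: create a controller with secure concatenation enabled by
 * writing an option string containing "concat" to /dev/nvme-fabrics.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *opts =
		"transport=tcp,traddr=192.0.2.10,trsvcid=4420,"
		"nqn=nqn.2024-01.io.example:subsys1,"
		"hostnqn=nqn.2014-08.org.nvmexpress:uuid:0a59a3bb-0000-0000-0000-000000000001,"
		"dhchap_secret=DHHC-1:00:placeholder-secret:,concat";
	int fd = open("/dev/nvme-fabrics", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, opts, strlen(opts)) < 0)
		perror("write");	/* parser errors, e.g. missing dhchap_secret, surface here */
	close(fd);
	return 0;
}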
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 21d75dc4a3a0..9cf5b020adba 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -66,6 +66,7 @@ enum {
NVMF_OPT_TLS = 1 << 25,
NVMF_OPT_KEYRING = 1 << 26,
NVMF_OPT_TLS_KEY = 1 << 27,
+ NVMF_OPT_CONCAT = 1 << 28,
};
/**
@@ -101,6 +102,7 @@ enum {
* @keyring: Keyring to use for key lookups
* @tls_key: TLS key for encrypted connections (TCP)
* @tls: Start TLS encrypted connections (TCP)
+ * @concat: Enable secure channel concatenation (TCP)
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
@@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
struct key *keyring;
struct key *tls_key;
bool tls;
+ bool concat;
bool disable_sqflow;
bool hdr_digest;
bool data_digest;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b9929a5a7f4e..2257c3c96dd2 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2571,7 +2571,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
if (ret)
return -ENOMEM;
- op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
+ op->nents = blk_rq_map_sg(rq, freq->sg_table.sgl);
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
op->nents, rq_dma_dir(rq));
@@ -2858,7 +2858,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
unsigned int nr_io_queues;
int ret;
- nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()),
+ nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(),
ctrl->lport->ops->max_hw_queues);
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret) {
@@ -2912,7 +2912,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
unsigned int nr_io_queues;
int ret;
- nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()),
+ nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(),
ctrl->lport->ops->max_hw_queues);
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 2a7635565083..6b12ca80aa27 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
kblockd_schedule_work(&head->partition_scan_work);
}
+ nvme_mpath_add_sysfs_link(ns->head);
+
mutex_lock(&head->lock);
if (nvme_path_is_optimized(ns)) {
int node, srcu_idx;
@@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
if (nvme_state_is_live(ns->ana_state) &&
nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
nvme_mpath_set_live(ns);
+ else {
+ /*
+		 * Add a sysfs link from the multipath head gendisk node to the
+		 * path device gendisk node.
+		 * If the path's ana state is live (i.e. either optimized or
+		 * non-optimized) while we allocate the ns, the sysfs link is
+		 * created from nvme_mpath_set_live() and we do not fall
+		 * through to this code path. For any other ana state we call
+		 * nvme_mpath_set_live() only after the ana state transitions
+		 * to live, but we still want to create the sysfs link from the
+		 * head node to the path device irrespective of the path's ana
+		 * state.
+		 * So if we reach here, the path's ana state is not live, but
+		 * we still create the sysfs link to this path from the head
+		 * node, provided the head node has already come alive.
+ */
+ if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
+ nvme_mpath_add_sysfs_link(ns->head);
+ }
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -955,6 +976,45 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
}
DEVICE_ATTR_RO(ana_state);
+static ssize_t queue_depth_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
+ return 0;
+
+ return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
+}
+DEVICE_ATTR_RO(queue_depth);
+
+static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ int node, srcu_idx;
+ nodemask_t numa_nodes;
+ struct nvme_ns *current_ns;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+ struct nvme_ns_head *head = ns->head;
+
+ if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
+ return 0;
+
+ nodes_clear(numa_nodes);
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ for_each_node(node) {
+ current_ns = srcu_dereference(head->current_path[node],
+ &head->srcu);
+ if (ns == current_ns)
+ node_set(node, numa_nodes);
+ }
+ srcu_read_unlock(&head->srcu, srcu_idx);
+
+ return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
+}
+DEVICE_ATTR_RO(numa_nodes);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
@@ -967,6 +1027,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
return -ENXIO; /* just break out of the loop */
}
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
+{
+ struct device *target;
+ int rc, srcu_idx;
+ struct nvme_ns *ns;
+ struct kobject *kobj;
+
+ /*
+	 * Ensure the head disk node has already been added, otherwise we may
+	 * get an invalid kobj for it
+ */
+ if (!test_bit(GD_ADDED, &head->disk->state))
+ return;
+
+ kobj = &disk_to_dev(head->disk)->kobj;
+
+ /*
+	 * Loop through each ns chained on head->list and create the sysfs
+	 * link from the head node to the ns path node
+ */
+ srcu_idx = srcu_read_lock(&head->srcu);
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ /*
+		 * Avoid creating the link if it already exists for the given
+		 * path. When a path's ana state transitions from optimized to
+		 * non-optimized or vice versa, nvme_mpath_set_live() is
+		 * invoked, which in turn calls this function. If the sysfs
+		 * link already exists for the given path and we attempt to
+		 * re-create it, the sysfs code warns about it loudly. So we
+		 * evaluate the NVME_NS_SYSFS_ATTR_LINK flag here to ensure
+		 * that we're not creating a duplicate link.
+		 * test_and_set_bit() is used because it protects against
+		 * multiple nvme paths being added simultaneously.
+ */
+ if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+ continue;
+
+ /*
+		 * Ensure that the ns path disk node has already been added,
+		 * otherwise we may get an invalid kobj name for the target
+ */
+ if (!test_bit(GD_ADDED, &ns->disk->state))
+ continue;
+
+ target = disk_to_dev(ns->disk);
+ /*
+ * Create sysfs link from head gendisk kobject @kobj to the
+ * ns path gendisk kobject @target->kobj.
+ */
+ rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
+ &target->kobj, dev_name(target));
+ if (unlikely(rc)) {
+ dev_err(disk_to_dev(ns->head->disk),
+ "failed to create link to %s\n",
+ dev_name(target));
+ clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+ }
+ }
+
+ srcu_read_unlock(&head->srcu, srcu_idx);
+}
+
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+ struct device *target;
+ struct kobject *kobj;
+
+ if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+ return;
+
+ target = disk_to_dev(ns->disk);
+ kobj = &disk_to_dev(ns->head->disk)->kobj;
+ sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
+ dev_name(target));
+ clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+}
+
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
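As a rough illustration of what the multipath changes above expose, the sketch below walks the head disk's new "multipath" sysfs group and reads each path's queue_depth attribute. The device name and exact sysfs paths are assumptions for illustration, and queue_depth reads back empty unless the subsystem iopolicy is queue-depth, per queue_depth_show() above.

/* Illustration only: list the links in /sys/block/<head>/multipath/ and
 * read each path's queue_depth attribute. Device names are hypothetical.
 */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	const char *head = "nvme0n1";	/* hypothetical multipath head disk */
	char dirpath[256], attr[256], buf[64];
	struct dirent *de;
	DIR *dir;
	FILE *f;

	snprintf(dirpath, sizeof(dirpath), "/sys/block/%s/multipath", head);
	dir = opendir(dirpath);
	if (!dir) {
		perror(dirpath);
		return 1;
	}
	while ((de = readdir(dir))) {
		if (de->d_name[0] == '.')
			continue;
		/* Each entry is a symlink named after a path device. */
		snprintf(attr, sizeof(attr), "/sys/block/%s/queue_depth",
			 de->d_name);
		f = fopen(attr, "r");
		if (!f)
			continue;
		if (fgets(buf, sizeof(buf), f))
			printf("%s: queue_depth %s", de->d_name, buf);
		fclose(f);
	}
	closedir(dir);
	return 0;
}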
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7be92d07430e..51e078642127 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -534,10 +534,11 @@ struct nvme_ns {
struct nvme_ns_head *head;
unsigned long flags;
-#define NVME_NS_REMOVING 0
-#define NVME_NS_ANA_PENDING 2
-#define NVME_NS_FORCE_RO 3
-#define NVME_NS_READY 4
+#define NVME_NS_REMOVING 0
+#define NVME_NS_ANA_PENDING 2
+#define NVME_NS_FORCE_RO 3
+#define NVME_NS_READY 4
+#define NVME_NS_SYSFS_ATTR_LINK 5
struct cdev cdev;
struct device cdev_device;
@@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_attr_groups[];
+extern const struct attribute_group nvme_ns_mpath_attr_group;
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
@@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
@@ -980,6 +984,8 @@ static inline void nvme_trace_bio_complete(struct request *req)
extern bool multipath;
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute dev_attr_queue_depth;
+extern struct device_attribute dev_attr_numa_nodes;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
@@ -1009,6 +1015,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
+static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
+{
+}
+static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+}
static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
return false;
@@ -1147,6 +1159,7 @@ void nvme_auth_stop(struct nvme_ctrl *ctrl);
int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
void nvme_auth_free(struct nvme_ctrl *ctrl);
+void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl);
#else
static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
@@ -1169,6 +1182,7 @@ static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
return -EPROTONOSUPPORT;
}
static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {};
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3ad7f197c808..2883d17ee1eb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -812,7 +812,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (!iod->sgt.sgl)
return BLK_STS_RESOURCE;
sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
+ iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
if (!iod->sgt.orig_nents)
goto out_free_sg;
@@ -953,9 +953,6 @@ out_free_cmd:
return ret;
}
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 86a2891d9bcc..b5a0295b5bf4 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1476,8 +1476,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq,
if (ret)
return -ENOMEM;
- req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
- req->data_sgl.sg_table.sgl);
+ req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl);
*count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
req->data_sgl.nents, rq_dma_dir(rq));
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 3a41b9ab0f13..6d31226f7a4f 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -258,6 +258,8 @@ static struct attribute *nvme_ns_attrs[] = {
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_ana_grpid.attr,
&dev_attr_ana_state.attr,
+ &dev_attr_queue_depth.attr,
+ &dev_attr_numa_nodes.attr,
#endif
&dev_attr_io_passthru_err_log_enabled.attr,
NULL,
@@ -290,6 +292,10 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
return 0;
}
+ if (a == &dev_attr_queue_depth.attr || a == &dev_attr_numa_nodes.attr) {
+ if (nvme_disk_is_ns_head(dev_to_disk(dev)))
+ return 0;
+ }
#endif
return a->mode;
}
@@ -299,8 +305,22 @@ static const struct attribute_group nvme_ns_attr_group = {
.is_visible = nvme_ns_attrs_are_visible,
};
+#ifdef CONFIG_NVME_MULTIPATH
+static struct attribute *nvme_ns_mpath_attrs[] = {
+ NULL,
+};
+
+const struct attribute_group nvme_ns_mpath_attr_group = {
+ .name = "multipath",
+ .attrs = nvme_ns_mpath_attrs,
+};
+#endif
+
const struct attribute_group *nvme_ns_attr_groups[] = {
&nvme_ns_attr_group,
+#ifdef CONFIG_NVME_MULTIPATH
+ &nvme_ns_mpath_attr_group,
+#endif
NULL,
};
@@ -780,10 +800,10 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_tls_key.attr &&
- !ctrl->opts->tls)
+ !ctrl->opts->tls && !ctrl->opts->concat)
return 0;
if (a == &dev_attr_tls_configured_key.attr &&
- !ctrl->opts->tls_key)
+ (!ctrl->opts->tls_key || ctrl->opts->concat))
return 0;
if (a == &dev_attr_tls_keyring.attr &&
!ctrl->opts->keyring)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 327f3f2f5399..26c459f0198d 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -8,7 +8,6 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/key.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
@@ -249,7 +248,7 @@ static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
return 0;
- return ctrl->opts->tls;
+ return ctrl->opts->tls || ctrl->opts->concat;
}
static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
@@ -1790,7 +1789,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
NVME_TCP_ADMIN_CCSZ;
- ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
+ ret = sock_create_kern(current->nsproxy->net_ns,
+ ctrl->addr.ss_family, SOCK_STREAM,
IPPROTO_TCP, &queue->sock);
if (ret) {
dev_err(nctrl->device,
@@ -2060,7 +2060,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
if (nvme_tcp_tls_configured(ctrl)) {
if (ctrl->opts->tls_key)
pskid = key_serial(ctrl->opts->tls_key);
- else {
+ else if (ctrl->opts->tls) {
pskid = nvme_tls_psk_default(ctrl->opts->keyring,
ctrl->opts->host->nqn,
ctrl->opts->subsysnqn);
@@ -2090,9 +2090,25 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
- if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) {
- dev_err(ctrl->device, "no PSK negotiated\n");
- return -ENOKEY;
+ if (nvme_tcp_tls_configured(ctrl)) {
+ if (ctrl->opts->concat) {
+ /*
+ * The generated PSK is stored in the
+ * fabric options
+ */
+ if (!ctrl->opts->tls_key) {
+ dev_err(ctrl->device, "no PSK generated\n");
+ return -ENOKEY;
+ }
+ if (ctrl->tls_pskid &&
+ ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) {
+ dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid);
+ ctrl->tls_pskid = 0;
+ }
+ } else if (!ctrl->tls_pskid) {
+ dev_err(ctrl->device, "no PSK negotiated\n");
+ return -ENOKEY;
+ }
}
for (i = 1; i < ctrl->queue_count; i++) {
@@ -2310,6 +2326,27 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
}
}
+/*
+ * The TLS key is set by secure concatenation after negotiation has been
+ * completed on the admin queue. We need to revoke the key when:
+ * - concatenation is enabled (otherwise it's a static key set by the user)
+ * and
+ * - the generated key is present in ctrl->opts->tls_key (otherwise there's
+ *   nothing to revoke)
+ * and
+ * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS
+ * negotiation has not run).
+ *
+ * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called
+ * twice during secure concatenation, once on a 'normal' connection to run the
+ * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set),
+ * and once after the negotiation (which uses the key, so it _must_ be set).
+ */
+static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl)
+{
+ return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid;
+}
+
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
struct nvmf_ctrl_options *opts = ctrl->opts;
@@ -2319,6 +2356,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (ret)
return ret;
+ if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
+ /* See comments for nvme_tcp_key_revoke_needed() */
+ dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
+ nvme_stop_keep_alive(ctrl);
+ nvme_tcp_teardown_admin_queue(ctrl, false);
+ ret = nvme_tcp_configure_admin_queue(ctrl, false);
+ if (ret)
+ return ret;
+ }
+
if (ctrl->icdoff) {
ret = -EOPNOTSUPP;
dev_err(ctrl->device, "icdoff is not supported!\n");
@@ -2415,6 +2462,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ if (nvme_tcp_key_revoke_needed(ctrl))
+ nvme_auth_revoke_tls_key(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
@@ -2455,6 +2504,8 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
container_of(work, struct nvme_ctrl, reset_work);
int ret;
+ if (nvme_tcp_key_revoke_needed(ctrl))
+ nvme_auth_revoke_tls_key(ctrl);
nvme_stop_ctrl(ctrl);
nvme_tcp_teardown_ctrl(ctrl, false);
@@ -2951,7 +3002,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
- NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
+ NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
.create_ctrl = nvme_tcp_create_ctrl,
};
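The revocation rule spelled out in the nvme_tcp_key_revoke_needed() comment can be condensed into a small truth table; the sketch below is illustrative only and uses simplified stand-in types rather than the kernel's nvme_ctrl and nvmf_ctrl_options structs.

/* Illustration only: the revocation predicate with simplified stand-ins. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct opts { bool concat; bool have_tls_key; };
struct ctrl { struct opts opts; unsigned int tls_pskid; };

static bool key_revoke_needed(const struct ctrl *c)
{
	return c->opts.concat && c->opts.have_tls_key && c->tls_pskid;
}

int main(void)
{
	const struct ctrl cases[] = {
		{ { .concat = false, .have_tls_key = true  }, 0x1234 }, /* static key: keep */
		{ { .concat = true,  .have_tls_key = false }, 0 },      /* first pass, no PSK yet: keep */
		{ { .concat = true,  .have_tls_key = true  }, 0 },      /* PSK generated, TLS not yet up: keep */
		{ { .concat = true,  .have_tls_key = true  }, 0x1234 }, /* generated PSK in use: revoke */
	};

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("case %zu: revoke=%d\n", i, key_revoke_needed(&cases[i]));
	return 0;
}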
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 382949e18c6a..cce4c5b55aa9 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -146,17 +146,16 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
return NULL;
}
-static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl,
- struct nvme_ns_head *head,
+static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct nvme_zone_descriptor *entry,
unsigned int idx, report_zones_cb cb,
void *data)
{
+ struct nvme_ns_head *head = ns->head;
struct blk_zone zone = { };
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
- dev_err(ctrl->device, "invalid zone type %#x\n",
- entry->zt);
+ dev_err(ns->ctrl->device, "invalid zone type %#x\n", entry->zt);
return -EINVAL;
}
@@ -213,8 +212,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
break;
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
- ret = nvme_zone_parse_entry(ns->ctrl, ns->head,
- &report->entries[i],
+ ret = nvme_zone_parse_entry(ns, &report->entries[i],
zone_idx, cb, data);
if (ret)
goto out_free;
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index b47d675232d2..0b0645ac5df4 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -15,6 +15,7 @@
#include <linux/ctype.h>
#include <linux/random.h>
#include <linux/nvme-auth.h>
+#include <linux/nvme-keyring.h>
#include <linux/unaligned.h>
#include "nvmet.h"
@@ -139,7 +140,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
return ret;
}
-u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
{
int ret = 0;
struct nvmet_host_link *p;
@@ -165,6 +166,11 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl)
goto out_unlock;
}
+ if (nvmet_queue_tls_keyid(sq)) {
+ pr_debug("host %s tls enabled\n", ctrl->hostnqn);
+ goto out_unlock;
+ }
+
ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id);
if (ret < 0) {
pr_warn("Failed to setup DH group");
@@ -233,6 +239,9 @@ out_unlock:
void nvmet_auth_sq_free(struct nvmet_sq *sq)
{
cancel_delayed_work(&sq->auth_expired_work);
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+	sq->tls_key = NULL;
+#endif
kfree(sq->dhchap_c1);
sq->dhchap_c1 = NULL;
kfree(sq->dhchap_c2);
@@ -261,6 +270,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
nvme_auth_free_key(ctrl->ctrl_key);
ctrl->ctrl_key = NULL;
}
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ if (ctrl->tls_key) {
+ key_put(ctrl->tls_key);
+ ctrl->tls_key = NULL;
+ }
+#endif
}
bool nvmet_check_auth_status(struct nvmet_req *req)
@@ -542,3 +557,58 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
return ret;
}
+
+void nvmet_auth_insert_psk(struct nvmet_sq *sq)
+{
+ int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id);
+ u8 *psk, *digest, *tls_psk;
+ size_t psk_len;
+ int ret;
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ struct key *tls_key = NULL;
+#endif
+
+ ret = nvme_auth_generate_psk(sq->ctrl->shash_id,
+ sq->dhchap_skey,
+ sq->dhchap_skey_len,
+ sq->dhchap_c1, sq->dhchap_c2,
+ hash_len, &psk, &psk_len);
+ if (ret) {
+ pr_warn("%s: ctrl %d qid %d failed to generate PSK, error %d\n",
+ __func__, sq->ctrl->cntlid, sq->qid, ret);
+ return;
+ }
+ ret = nvme_auth_generate_digest(sq->ctrl->shash_id, psk, psk_len,
+ sq->ctrl->subsysnqn,
+ sq->ctrl->hostnqn, &digest);
+ if (ret) {
+ pr_warn("%s: ctrl %d qid %d failed to generate digest, error %d\n",
+ __func__, sq->ctrl->cntlid, sq->qid, ret);
+ goto out_free_psk;
+ }
+ ret = nvme_auth_derive_tls_psk(sq->ctrl->shash_id, psk, psk_len,
+ digest, &tls_psk);
+ if (ret) {
+ pr_warn("%s: ctrl %d qid %d failed to derive TLS PSK, error %d\n",
+ __func__, sq->ctrl->cntlid, sq->qid, ret);
+ goto out_free_digest;
+ }
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ tls_key = nvme_tls_psk_refresh(NULL, sq->ctrl->hostnqn, sq->ctrl->subsysnqn,
+ sq->ctrl->shash_id, tls_psk, psk_len, digest);
+ if (IS_ERR(tls_key)) {
+ pr_warn("%s: ctrl %d qid %d failed to refresh key, error %ld\n",
+ __func__, sq->ctrl->cntlid, sq->qid, PTR_ERR(tls_key));
+ tls_key = NULL;
+ kfree_sensitive(tls_psk);
+ }
+ if (sq->ctrl->tls_key)
+ key_put(sq->ctrl->tls_key);
+ sq->ctrl->tls_key = tls_key;
+#endif
+
+out_free_digest:
+ kfree_sensitive(digest);
+out_free_psk:
+ kfree_sensitive(psk);
+}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 2e741696f371..71f8d06998d6 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1618,8 +1618,6 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
}
ctrl->cntlid = ret;
- uuid_copy(&ctrl->hostid, args->hostid);
-
/*
* Discovery controllers may use some arbitrary high value
* in order to cleanup stale discovery sessions
@@ -1647,7 +1645,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (args->hostid)
uuid_copy(&ctrl->hostid, args->hostid);
- dhchap_status = nvmet_setup_auth(ctrl);
+ dhchap_status = nvmet_setup_auth(ctrl, args->sq);
if (dhchap_status) {
pr_err("Failed to setup authentication, dhchap status %u\n",
dhchap_status);
@@ -1662,11 +1660,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
args->status = NVME_SC_SUCCESS;
- pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n",
+ pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s%s.\n",
nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
ctrl->pi_support ? " T10-PI is enabled" : "",
- nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");
+ nvmet_has_auth(ctrl, args->sq) ? " with DH-HMAC-CHAP" : "",
+ nvmet_queue_tls_keyid(args->sq) ? ", TLS" : "");
return ctrl;
diff --git a/drivers/nvme/target/debugfs.c b/drivers/nvme/target/debugfs.c
index 220c7391fc19..e4300eb95101 100644
--- a/drivers/nvme/target/debugfs.c
+++ b/drivers/nvme/target/debugfs.c
@@ -132,6 +132,27 @@ static int nvmet_ctrl_host_traddr_show(struct seq_file *m, void *p)
}
NVMET_DEBUGFS_ATTR(nvmet_ctrl_host_traddr);
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+static int nvmet_ctrl_tls_key_show(struct seq_file *m, void *p)
+{
+ struct nvmet_ctrl *ctrl = m->private;
+ key_serial_t keyid = nvmet_queue_tls_keyid(ctrl->sqs[0]);
+
+ seq_printf(m, "%08x\n", keyid);
+ return 0;
+}
+NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_key);
+
+static int nvmet_ctrl_tls_concat_show(struct seq_file *m, void *p)
+{
+ struct nvmet_ctrl *ctrl = m->private;
+
+ seq_printf(m, "%d\n", ctrl->concat);
+ return 0;
+}
+NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_concat);
+#endif
+
int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl)
{
char name[32];
@@ -157,6 +178,12 @@ int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl)
&nvmet_ctrl_state_fops);
debugfs_create_file("host_traddr", S_IRUSR, ctrl->debugfs_dir, ctrl,
&nvmet_ctrl_host_traddr_fops);
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ debugfs_create_file("tls_concat", S_IRUSR, ctrl->debugfs_dir, ctrl,
+ &nvmet_ctrl_tls_concat_fops);
+ debugfs_create_file("tls_key", S_IRUSR, ctrl->debugfs_dir, ctrl,
+ &nvmet_ctrl_tls_key_fops);
+#endif
return 0;
}
diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c
index 2022757f08dc..bf01ec414c55 100644
--- a/drivers/nvme/target/fabrics-cmd-auth.c
+++ b/drivers/nvme/target/fabrics-cmd-auth.c
@@ -43,8 +43,26 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
data->auth_protocol[0].dhchap.halen,
data->auth_protocol[0].dhchap.dhlen);
req->sq->dhchap_tid = le16_to_cpu(data->t_id);
- if (data->sc_c)
- return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ if (data->sc_c != NVME_AUTH_SECP_NOSC) {
+ if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS))
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ /* Secure concatenation can only be enabled on the admin queue */
+ if (req->sq->qid)
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ switch (data->sc_c) {
+ case NVME_AUTH_SECP_NEWTLSPSK:
+ if (nvmet_queue_tls_keyid(req->sq))
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ break;
+ case NVME_AUTH_SECP_REPLACETLSPSK:
+ if (!nvmet_queue_tls_keyid(req->sq))
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ break;
+ default:
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ }
+ ctrl->concat = true;
+ }
if (data->napd != 1)
return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
@@ -103,6 +121,12 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
nvme_auth_dhgroup_name(fallback_dhgid));
ctrl->dh_gid = fallback_dhgid;
}
+ if (ctrl->dh_gid == NVME_AUTH_DHGROUP_NULL && ctrl->concat) {
+ pr_debug("%s: ctrl %d qid %d: NULL DH group invalid "
+ "for secure channel concatenation\n", __func__,
+ ctrl->cntlid, req->sq->qid);
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+ }
pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n",
__func__, ctrl->cntlid, req->sq->qid,
nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid);
@@ -148,12 +172,22 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d)
if (memcmp(data->rval, response, data->hl)) {
pr_info("ctrl %d qid %d host response mismatch\n",
ctrl->cntlid, req->sq->qid);
+ pr_debug("ctrl %d qid %d rval %*ph\n",
+ ctrl->cntlid, req->sq->qid, data->hl, data->rval);
+ pr_debug("ctrl %d qid %d response %*ph\n",
+ ctrl->cntlid, req->sq->qid, data->hl, response);
kfree(response);
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
}
kfree(response);
pr_debug("%s: ctrl %d qid %d host authenticated\n",
__func__, ctrl->cntlid, req->sq->qid);
+ if (!data->cvalid && ctrl->concat) {
+ pr_debug("%s: ctrl %d qid %d invalid challenge\n",
+ __func__, ctrl->cntlid, req->sq->qid);
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ }
+ req->sq->dhchap_s2 = le32_to_cpu(data->seqnum);
if (data->cvalid) {
req->sq->dhchap_c2 = kmemdup(data->rval + data->hl, data->hl,
GFP_KERNEL);
@@ -163,11 +197,23 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d)
pr_debug("%s: ctrl %d qid %d challenge %*ph\n",
__func__, ctrl->cntlid, req->sq->qid, data->hl,
req->sq->dhchap_c2);
- } else {
+ }
+ /*
+ * NVMe Base Spec 2.2 section 8.3.4.5.4: DH-HMAC-CHAP_Reply message
+ * Sequence Number (SEQNUM): [ .. ]
+ * The value 0h is used to indicate that bidirectional authentication
+ * is not performed, but a challenge value C2 is carried in order to
+ * generate a pre-shared key (PSK) for subsequent establishment of a
+ * secure channel.
+ */
+ if (req->sq->dhchap_s2 == 0) {
+ if (ctrl->concat)
+ nvmet_auth_insert_psk(req->sq);
req->sq->authenticated = true;
+ kfree(req->sq->dhchap_c2);
req->sq->dhchap_c2 = NULL;
- }
- req->sq->dhchap_s2 = le32_to_cpu(data->seqnum);
+ } else if (!data->cvalid)
+ req->sq->authenticated = true;
return 0;
}
@@ -246,7 +292,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
pr_debug("%s: ctrl %d qid %d reset negotiation\n",
__func__, ctrl->cntlid, req->sq->qid);
if (!req->sq->qid) {
- dhchap_status = nvmet_setup_auth(ctrl);
+ dhchap_status = nvmet_setup_auth(ctrl, req->sq);
if (dhchap_status) {
pr_err("ctrl %d qid 0 failed to setup re-authentication\n",
ctrl->cntlid);
@@ -303,6 +349,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
}
goto done_kfree;
case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2:
+ if (ctrl->concat)
+ nvmet_auth_insert_psk(req->sq);
req->sq->authenticated = true;
pr_debug("%s: ctrl %d qid %d ctrl authenticated\n",
__func__, ctrl->cntlid, req->sq->qid);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index eb406c90c167..f012bdf89850 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -234,10 +234,26 @@ err:
return ret;
}
-static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl)
+static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
{
+ bool needs_auth = nvmet_has_auth(ctrl, sq);
+ key_serial_t keyid = nvmet_queue_tls_keyid(sq);
+
+ /* Do not authenticate I/O queues for secure concatenation */
+ if (ctrl->concat && sq->qid)
+ needs_auth = false;
+
+ if (keyid)
+ pr_debug("%s: ctrl %d qid %d should %sauthenticate, tls psk %08x\n",
+ __func__, ctrl->cntlid, sq->qid,
+ needs_auth ? "" : "not ", keyid);
+ else
+ pr_debug("%s: ctrl %d qid %d should %sauthenticate%s\n",
+ __func__, ctrl->cntlid, sq->qid,
+ needs_auth ? "" : "not ",
+ ctrl->concat ? ", secure concatenation" : "");
return (u32)ctrl->cntlid |
- (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0);
+ (needs_auth ? NVME_CONNECT_AUTHREQ_ATR : 0);
}
static void nvmet_execute_admin_connect(struct nvmet_req *req)
@@ -247,6 +263,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
struct nvmet_ctrl *ctrl = NULL;
struct nvmet_alloc_ctrl_args args = {
.port = req->port,
+ .sq = req->sq,
.ops = req->ops,
.p2p_client = req->p2p_client,
.kato = le32_to_cpu(c->kato),
@@ -299,7 +316,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
goto out;
}
- args.result = cpu_to_le32(nvmet_connect_result(ctrl));
+ args.result = cpu_to_le32(nvmet_connect_result(ctrl, req->sq));
out:
kfree(d);
complete:
@@ -357,7 +374,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
goto out_ctrl_put;
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
- req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl));
+ req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl, req->sq));
out:
kfree(d);
complete:
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 3ef4beacde32..7318b736d414 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -172,20 +172,6 @@ struct nvmet_fc_tgt_assoc {
struct work_struct del_work;
};
-
-static inline int
-nvmet_fc_iodnum(struct nvmet_fc_ls_iod *iodptr)
-{
- return (iodptr - iodptr->tgtport->iod);
-}
-
-static inline int
-nvmet_fc_fodnum(struct nvmet_fc_fcp_iod *fodptr)
-{
- return (fodptr - fodptr->queue->fod);
-}
-
-
/*
* Association and Connection IDs:
*
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index a9d112d34d4f..a5c41144667c 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -162,7 +162,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
}
iod->req.sg = iod->sg_table.sgl;
- iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
+ iod->req.sg_cnt = blk_rq_map_sg(req, iod->sg_table.sgl);
iod->req.transfer_len = blk_rq_payload_bytes(req);
}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index fcf4f460dc9a..b6db8b74dc4a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -165,6 +165,9 @@ struct nvmet_sq {
u8 *dhchap_skey;
int dhchap_skey_len;
#endif
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ struct key *tls_key;
+#endif
struct completion free_done;
struct completion confirm_done;
};
@@ -289,6 +292,7 @@ struct nvmet_ctrl {
u64 err_counter;
struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
bool pi_support;
+ bool concat;
#ifdef CONFIG_NVME_TARGET_AUTH
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
@@ -298,6 +302,9 @@ struct nvmet_ctrl {
u8 *dh_key;
size_t dh_keysize;
#endif
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ struct key *tls_key;
+#endif
struct nvmet_pr_log_mgr pr_log_mgr;
};
@@ -583,6 +590,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
struct nvmet_alloc_ctrl_args {
struct nvmet_port *port;
+ struct nvmet_sq *sq;
char *subsysnqn;
char *hostnqn;
uuid_t *hostid;
@@ -819,7 +827,7 @@ static inline u8 nvmet_cc_iocqes(u32 cc)
/* Convert a 32-bit number to a 16-bit 0's based number */
static inline __le16 to0based(u32 a)
{
- return cpu_to_le16(max(1U, min(1U << 16, a)) - 1);
+ return cpu_to_le16(clamp(a, 1U, 1U << 16) - 1);
}
static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
@@ -851,6 +859,22 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio)
bio_put(bio);
}
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq)
+{
+ return sq->tls_key ? key_serial(sq->tls_key) : 0;
+}
+static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq)
+{
+ if (sq->tls_key) {
+ key_put(sq->tls_key);
+ sq->tls_key = NULL;
+ }
+}
+#else
+static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) { return 0; }
+static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) {}
+#endif
#ifdef CONFIG_NVME_TARGET_AUTH
u32 nvmet_auth_send_data_len(struct nvmet_req *req);
void nvmet_execute_auth_send(struct nvmet_req *req);
@@ -859,7 +883,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req);
int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
bool set_ctrl);
int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
-u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl);
+u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq);
void nvmet_auth_sq_init(struct nvmet_sq *sq);
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
void nvmet_auth_sq_free(struct nvmet_sq *sq);
@@ -869,16 +893,18 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int hash_len);
int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
unsigned int hash_len);
-static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
{
- return ctrl->host_key != NULL;
+ return ctrl->host_key != NULL && !nvmet_queue_tls_keyid(sq);
}
int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
u8 *buf, int buf_size);
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *buf, int buf_size);
+void nvmet_auth_insert_psk(struct nvmet_sq *sq);
#else
-static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl,
+ struct nvmet_sq *sq)
{
return 0;
}
@@ -891,11 +917,13 @@ static inline bool nvmet_check_auth_status(struct nvmet_req *req)
{
return true;
}
-static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl,
+ struct nvmet_sq *sq)
{
return false;
}
static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
+static inline void nvmet_auth_insert_psk(struct nvmet_sq *sq) {};
#endif
int nvmet_pr_init_ns(struct nvmet_ns *ns);
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index b1e31483f157..b54b3fdbe389 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -1385,7 +1385,6 @@ static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid)
if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
- flush_workqueue(sq->iod_wq);
destroy_workqueue(sq->iod_wq);
sq->iod_wq = NULL;
@@ -2129,8 +2128,15 @@ static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf)
return -ENODEV;
}
- if (epc_features->bar[BAR_0].only_64bit)
- epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ /*
+ * While NVMe PCIe Transport Specification 1.1, section 2.1.10, claims
+ * that the BAR0 type is Implementation Specific, in NVMe 1.1, the type
+ * is required to be 64-bit. Thus, for interoperability, always set the
+ * type to 64-bit. In the rare case that the PCI EPC does not support
+ * configuring BAR0 as 64-bit, the call to pci_epc_set_bar() will fail,
+ * and we will return failure back to the user.
+ */
+ epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
/*
* Calculate the size of the register bar: NVMe registers first with
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4f9cac8a5abe..f2d0c920269b 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -8,7 +8,6 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/key.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
@@ -1080,10 +1079,11 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
&queue->nvme_sq, &nvmet_tcp_ops))) {
- pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
+ pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
- le32_to_cpu(req->cmd->common.dptr.sgl.length));
+ le32_to_cpu(req->cmd->common.dptr.sgl.length),
+ le16_to_cpu(req->cqe->status));
nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
return 0;
@@ -1609,6 +1609,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
/* stop accepting incoming data */
queue->rcv_state = NVMET_TCP_RECV_ERR;
+ nvmet_sq_put_tls_key(&queue->nvme_sq);
nvmet_tcp_uninit_data_in_cmds(queue);
nvmet_sq_destroy(&queue->nvme_sq);
cancel_work_sync(&queue->io_work);
@@ -1794,6 +1795,27 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue)
return 0;
}
+static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue,
+ key_serial_t peerid)
+{
+ struct key *tls_key = nvme_tls_key_lookup(peerid);
+ int status = 0;
+
+ if (IS_ERR(tls_key)) {
+ pr_warn("%s: queue %d failed to lookup key %x\n",
+ __func__, queue->idx, peerid);
+ spin_lock_bh(&queue->state_lock);
+ queue->state = NVMET_TCP_Q_FAILED;
+ spin_unlock_bh(&queue->state_lock);
+ status = PTR_ERR(tls_key);
+ } else {
+ pr_debug("%s: queue %d using TLS PSK %x\n",
+ __func__, queue->idx, peerid);
+ queue->nvme_sq.tls_key = tls_key;
+ }
+ return status;
+}
+
static void nvmet_tcp_tls_handshake_done(void *data, int status,
key_serial_t peerid)
{
@@ -1814,6 +1836,10 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status,
spin_unlock_bh(&queue->state_lock);
cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);
+
+ if (!status)
+ status = nvmet_tcp_tls_key_lookup(queue, peerid);
+
if (status)
nvmet_tcp_schedule_release_queue(queue);
else
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f1cfe0bb89b2..0d29470e86b0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1149,7 +1149,7 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd)
* Next, walk the list, and fill in the addresses and sizes of
* each segment.
*/
- count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);
+ count = __blk_rq_map_sg(rq, cmd->sdb.table.sgl, &last_sg);
if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) {
unsigned int pad_len =
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index c8dc92a7d63e..73564efd11d2 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -167,18 +167,6 @@ static int iblock_configure_device(struct se_device *dev)
break;
}
- if (dev->dev_attrib.pi_prot_type) {
- struct bio_set *bs = &ib_dev->ibd_bio_set;
-
- if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) {
- pr_err("Unable to allocate bioset for PI\n");
- ret = -ENOMEM;
- goto out_blkdev_put;
- }
- pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n",
- &bs->bio_integrity_pool);
- }
-
dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type;
return 0;
diff --git a/drivers/ufs/core/ufshcd-crypto.c b/drivers/ufs/core/ufshcd-crypto.c
index 694ff7578fc1..9e63a9d3cb7e 100644
--- a/drivers/ufs/core/ufshcd-crypto.c
+++ b/drivers/ufs/core/ufshcd-crypto.c
@@ -72,11 +72,11 @@ static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile,
if (ccap_array[cap_idx].algorithm_id == UFS_CRYPTO_ALG_AES_XTS) {
/* In XTS mode, the blk_crypto_key's size is already doubled */
- memcpy(cfg.crypto_key, key->raw, key->size/2);
+ memcpy(cfg.crypto_key, key->bytes, key->size/2);
memcpy(cfg.crypto_key + UFS_CRYPTO_KEY_MAX_SIZE/2,
- key->raw + key->size/2, key->size/2);
+ key->bytes + key->size/2, key->size/2);
} else {
- memcpy(cfg.crypto_key, key->raw, key->size);
+ memcpy(cfg.crypto_key, key->bytes, key->size);
}
ufshcd_program_key(hba, &cfg, slot);
@@ -185,6 +185,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
hba->crypto_profile.ll_ops = ufshcd_crypto_ops;
/* UFS only supports 8 bytes for any DUN */
hba->crypto_profile.max_dun_bytes_supported = 8;
+ hba->crypto_profile.key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
hba->crypto_profile.dev = hba->dev;
/*
diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c
index 13dd5dfc03eb..6a415d9bdc85 100644
--- a/drivers/ufs/host/ufs-exynos.c
+++ b/drivers/ufs/host/ufs-exynos.c
@@ -1320,6 +1320,7 @@ static void exynos_ufs_fmp_init(struct ufs_hba *hba, struct exynos_ufs *ufs)
return;
}
profile->max_dun_bytes_supported = AES_BLOCK_SIZE;
+ profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
profile->dev = hba->dev;
profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] =
DATA_UNIT_SIZE;
@@ -1366,7 +1367,7 @@ static int exynos_ufs_fmp_fill_prdt(struct ufs_hba *hba,
void *prdt, unsigned int num_segments)
{
struct fmp_sg_entry *fmp_prdt = prdt;
- const u8 *enckey = crypt_ctx->bc_key->raw;
+ const u8 *enckey = crypt_ctx->bc_key->bytes;
const u8 *twkey = enckey + AES_KEYSIZE_256;
u64 dun_lo = crypt_ctx->bc_dun[0];
u64 dun_hi = crypt_ctx->bc_dun[1];
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 2cfa1774944b..cc9578e1f38e 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -147,6 +147,7 @@ static int ufs_qcom_ice_init(struct ufs_qcom_host *host)
profile->ll_ops = ufs_qcom_crypto_ops;
profile->max_dun_bytes_supported = 8;
+ profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
profile->dev = dev;
/*
@@ -202,7 +203,7 @@ static int ufs_qcom_ice_keyslot_program(struct blk_crypto_profile *profile,
err = qcom_ice_program_key(host->ice,
QCOM_ICE_CRYPTO_ALG_AES_XTS,
QCOM_ICE_CRYPTO_KEY_SIZE_256,
- key->raw,
+ key->bytes,
key->crypto_cfg.data_unit_size / 512,
slot);
ufshcd_release(hba);