public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Benjamin Marzinski <bmarzins@redhat.com>
To: John Garry <john.g.garry@oracle.com>
Cc: hch@lst.de, kbusch@kernel.org, sagi@grimberg.me, axboe@fb.com,
	martin.petersen@oracle.com,
	james.bottomley@hansenpartnership.com, hare@suse.com,
	jmeneghi@redhat.com, linux-nvme@lists.infradead.org,
	linux-scsi@vger.kernel.org, michael.christie@oracle.com,
	snitzer@kernel.org, dm-devel@lists.linux.dev,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 21/24] scsi: sd: support multipath disk
Date: Mon, 9 Mar 2026 22:40:53 -0400	[thread overview]
Message-ID: <aa-EtdqIOX603PVu@redhat.com> (raw)
In-Reply-To: <20260225153627.1032500-22-john.g.garry@oracle.com>

On Wed, Feb 25, 2026 at 03:36:24PM +0000, John Garry wrote:
> Add support to attach a multipath disk.
> 
> We still allocate the gendisk per path, and this is required for the
> per-path submission. However, those gendisks are marked as hidden. Those
> disks are named sdX:Y, where X is the multipath disk index and Y is the
> per-path index.
> 
> A global list of sd_mpath_disks is kept for matching scsi_device's.
> 
> The multipath gendisk has the name and disk->major/minor set to minic a
> scsi_disk.
> 
> The following is an example of relevant scsi_disk and block sysfs
> directories:
> 
> $ ls -l /sys/block/ | grep sdc
> lrwxrwxrwx    1 root     root             0 Feb 24 16:01 sdc -> ../devices/virtual/scsi_mpath_disk/0/sdc
> lrwxrwxrwx    1 root     root             0 Feb 24 16:01 sdc:0 -> ../devices/platform/host8/session1/target8:0:0/8:0:0:0/block/sdc:0
> lrwxrwxrwx    1 root     root             0 Feb 24 16:02 sdc:1 -> ../devices/platform/host9/session2/target9:0:0/9:0:0:0/block/sdc:1
> 
> $ ls -l /sys/class/scsi_mpath_disk/0/
> total 0
> drwxr-xr-x    2 root     root             0 Feb 24 16:03 power
> drwxr-xr-x   11 root     root             0 Feb 24 16:01 sdc
> lrwxrwxrwx    1 root     root             0 Feb 24 16:01 subsystem -> ../../../../class/scsi_mpath_disk
> -rw-r--r--    1 root     root          4096 Feb 24 16:01 uevent
> 
> $ ls -l /sys/class/scsi_mpath_disk/0/sdc/multipath/
> total 0
> lrwxrwxrwx    1 root     root             0 Feb 24 16:20 sdc:0 -> ../../../../../platform/host8/session1/target8:0:0/8:0:0:0/block/sdc:0
> lrwxrwxrwx    1 root     root             0 Feb 24 16:20 sdc:1 -> ../../../../../platform/host9/session2/target9:0:0/9:0:0:0/block/sdc:1
> 
> 
> $ ls -l /dev/sdc*
> brw-rw----    1 root     disk        8,  32 Feb 24 16:01 /dev/sdc
> brw-rw----    1 root     disk        8,  33 Feb 24 16:01 /dev/sdc1
> brw-rw----    1 root     disk        8,  34 Feb 24 16:01 /dev/sdc2
> 
> 
> $ lsblk /dev/sdc
> NAME            MAJ:MIN RM  SIZE RO TYPE MOUNTPOINTS
> sdc               8:32   0  600M  0 disk
> |-sdc1            8:33   0    9M  0 part
> `-sdc2            8:34   0  568M  0 part
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  drivers/scsi/sd.c | 376 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 358 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 9617878b53ec6..409c0937764d9 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -117,12 +117,33 @@ static DEFINE_IDA(sd_index_ida);
>  static mempool_t *sd_page_pool;
>  static struct lock_class_key sd_bio_compl_lkclass;
>  #ifdef CONFIG_SCSI_MULTIPATH
> +static LIST_HEAD(sd_mpath_disks_list);
> +static DEFINE_MUTEX(sd_mpath_disks_lock);
> +
>  struct sd_mpath_disk {
> +	struct device			dev;
> +	int				disk_index;
> +	int				disk_count;
> +	struct list_head		entry;
> +	struct mutex			lock;
>  	struct mpath_disk		*mpath_disk;
> +	struct scsi_mpath_head		*scsi_mpath_head;
>  };
>  
>  static void sd_mpath_disk_release(struct device *dev)
>  {
> +	struct sd_mpath_disk *sd_mpath_disk =
> +		container_of(dev, struct sd_mpath_disk, dev);
> +	struct scsi_mpath_head *scsi_mpath_head =
> +		sd_mpath_disk->scsi_mpath_head;
> +	struct mpath_disk *mpath_disk = sd_mpath_disk->mpath_disk;
> +
> +	mpath_put_disk(mpath_disk);
> +
> +	ida_free(&sd_index_ida, sd_mpath_disk->disk_index);
> +	scsi_mpath_put_head(scsi_mpath_head);
> +
> +	kfree(sd_mpath_disk);
>  }
>  
>  static const struct class sd_mpath_disk_class = {
> @@ -4144,7 +4165,302 @@ static const struct scsi_mpath_pr_ops sd_mpath_pr_ops = {
>  	.pr_read_keys	= sd_mpath_pr_read_keys,
>  	.pr_read_reservation = sd_mpath_pr_read_reservation,
>  };
> +
> +static int sd_mpath_revalidate_head(struct scsi_disk *sdkp)
> +{
> +	struct sd_mpath_disk *sd_mpath_disk = sdkp->sd_mpath_disk;
> +	struct mpath_disk *mpath_disk = sd_mpath_disk->mpath_disk;;
> +	struct gendisk *disk = mpath_disk->disk;
> +	struct queue_limits *sdkp_lim = &sdkp->disk->queue->limits;
> +	struct queue_limits lim;
> +	unsigned int memflags;
> +	int ret;
> +
> +	lim = queue_limits_start_update(disk->queue);
> +	memflags = blk_mq_freeze_queue(disk->queue);
> +
> +	lim.logical_block_size = sdkp_lim->logical_block_size;
> +	lim.physical_block_size = sdkp_lim->physical_block_size;
> +	lim.io_min = sdkp_lim->io_min;
> +	lim.io_opt = sdkp_lim->io_opt;
> +
> +	queue_limits_stack_bdev(&lim, sdkp->disk->part0, 0,
> +					disk->disk_name);
> +
> +	/* TODO: setup integrity limits */
> +	lim.max_write_streams = sdkp_lim->max_write_streams;
> +	lim.write_stream_granularity = sdkp_lim->write_stream_granularity;
> +	ret = queue_limits_commit_update(disk->queue, &lim);
> +
> +	set_capacity_and_notify(disk, get_capacity(sdkp->disk));
> +
> +	blk_mq_unfreeze_queue(disk->queue, memflags);
> +
> +	return ret;
> +}
> +static int sd_mpath_get_disk(struct sd_mpath_disk *sd_mpath_disk)
> +{
> +	if (!get_device(&sd_mpath_disk->dev))
> +		return -ENXIO;
> +	return 0;
> +}
> +
> +static void sd_mpath_put_disk(struct sd_mpath_disk *sd_mpath_disk)
> +{
> +	put_device(&sd_mpath_disk->dev);
> +}
> +
> +static struct sd_mpath_disk *sd_mpath_find_disk(struct scsi_device *sdp)
> +{
> +	struct scsi_mpath_device *scsi_mpath_dev = sdp->scsi_mpath_dev;
> +	struct sd_mpath_disk *sd_mpath_disk;
> +	int ret;
> +
> +	mutex_lock(&sd_mpath_disks_lock);
> +	list_for_each_entry(sd_mpath_disk, &sd_mpath_disks_list, entry) {
> +		struct scsi_mpath_head *scsi_mpath_head;
> +		struct mpath_disk *mpath_disk;
> +		struct mpath_head *mpath_head;
> +
> +		ret = sd_mpath_get_disk(sd_mpath_disk);
> +		if (ret)
> +			continue;
> +		mpath_disk = sd_mpath_disk->mpath_disk;
> +		mpath_head = mpath_disk->mpath_head;
> +		scsi_mpath_head = mpath_head->drvdata;
> +
> +		if (strncmp(scsi_mpath_head->wwid,
> +			scsi_mpath_dev->device_id_str,
> +			SCSI_MPATH_DEVICE_ID_LEN) == 0) {
> +
> +			mutex_unlock(&sd_mpath_disks_lock);
> +			return sd_mpath_disk;
> +		}
> +		sd_mpath_put_disk(sd_mpath_disk);
> +	}
> +
> +	return NULL;
> +}
> +
> +static void sd_mpath_add_disk(struct scsi_disk *sdkp)
> +{
> +	struct scsi_device *sdp = sdkp->device;
> +	struct scsi_mpath_device *scsi_mpath_dev = sdp->scsi_mpath_dev;
> +	struct mpath_device *mpath_device = &scsi_mpath_dev->mpath_device;
> +	struct sd_mpath_disk *sd_mpath_disk = sdkp->sd_mpath_disk;
> +	struct mpath_disk *mpath_disk = sd_mpath_disk->mpath_disk;
> +	struct mpath_head *mpath_head = mpath_disk->mpath_head;
> +
> +	mpath_device->disk = sdkp->disk;
> +	mpath_add_device(mpath_head, mpath_device);
> +	mpath_device_set_live(mpath_disk, mpath_device);
> +}
> +
> +static int sd_mpath_probe(struct scsi_disk *sdkp)
> +{
> +	struct scsi_device *sdp = sdkp->device;
> +	struct scsi_mpath_device *scsi_mpath_dev = sdp->scsi_mpath_dev;
> +	struct device *dma_dev = sdp->host->dma_dev;
> +	struct scsi_mpath_head *scsi_mpath_head =
> +				scsi_mpath_dev->scsi_mpath_head;
> +	struct sd_mpath_disk *sd_mpath_disk;
> +	struct mpath_head *mpath_head = scsi_mpath_head->mpath_head;
> +	struct queue_limits lim;
> +	struct gendisk *disk;
> +	int error;
> +
> +	/*
> +	 * sd_mpath_disks_list is kept locked if no disk found.
> +	 * Otherwise an extra reference is taken.
> +	 */

Again, I personally think the logic is easier to follow when all the
locking isn't split over multiple functions.

> +	sd_mpath_disk = sd_mpath_find_disk(sdp);
> +	if (sd_mpath_disk) {
> +		mutex_lock(&sd_mpath_disk->lock);
> +		sd_mpath_disk->disk_count++;
> +		mutex_unlock(&sd_mpath_disk->lock);
> +		goto found;
> +	}
> +
> +	sd_mpath_disk = kzalloc(sizeof(*sd_mpath_disk), GFP_KERNEL);
> +	if (!sd_mpath_disk) {
> +		error = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	sd_mpath_disk->scsi_mpath_head = scsi_mpath_head;
> +	device_initialize(&sd_mpath_disk->dev);
> +	mutex_init(&sd_mpath_disk->lock);
> +	sd_mpath_disk->dev.class = &sd_mpath_disk_class;
> +
> +	blk_set_stacking_limits(&lim);
> +	lim.dma_alignment = 3;
> +	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
> +		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
> +
> +	sd_mpath_disk->mpath_disk = mpath_alloc_head_disk(&lim,
> +						dev_to_node(dma_dev));
> +	if (!sd_mpath_disk->mpath_disk) {
> +		error = -ENOMEM;
> +		goto out_free_disk;
> +	}
> +	disk = sd_mpath_disk->mpath_disk->disk;
> +	mpath_get_head(mpath_head); /* undone in mpath_free_disk() */
> +
> +	sd_mpath_disk->mpath_disk->mpath_head = mpath_head;
> +	sd_mpath_disk->mpath_disk->parent = &sd_mpath_disk->dev;
> +
> +	error = ida_alloc(&sd_index_ida, GFP_KERNEL);
> +	if (error < 0) {
> +		sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n");
> +		goto out_put_disk;
> +	}
> +	sd_mpath_disk->disk_index = error;
> +	error = sd_format_disk_name("sd", sd_mpath_disk->disk_index,
> +				disk->disk_name, DISK_NAME_LEN);
> +	if (error)
> +		goto out_free_index;
> +
> +	error = dev_set_name(&sd_mpath_disk->dev, "%s",
> +				dev_name(&scsi_mpath_head->dev));
> +	if (error)
> +		goto out_free_index;
> +
> +	/* undone in sd_mpath_disk_release() */
> +	scsi_mpath_get_head(scsi_mpath_head);
> +
> +	error = device_add(&sd_mpath_disk->dev);
> +	if (error) {
> +		put_device(&sd_mpath_disk->dev);
> +		goto out_unlock;

We should clean up when we fail here, instead of just unlocking without
fully setting things up.

-Ben

> +	}
> +
> +	list_add_tail(&sd_mpath_disk->entry, &sd_mpath_disks_list);
> +	disk->major = sd_major((sd_mpath_disk->disk_index & 0xf0) >> 4);
> +	disk->first_minor = ((sd_mpath_disk->disk_index & 0xf) << 4) |
> +				(sd_mpath_disk->disk_index & 0xfff00);
> +	disk->minors = SD_MINORS;
> +
> +	sd_mpath_disk->disk_count = 1;
> +	mutex_unlock(&sd_mpath_disks_lock);
> +
> +found:
> +	sdkp->sd_mpath_disk = sd_mpath_disk;
> +	sdkp->disk->flags |= GENHD_FL_HIDDEN;
> +	snprintf(sdkp->disk->disk_name, DISK_NAME_LEN, "%s:%d",
> +		sd_mpath_disk->mpath_disk->disk->disk_name,
> +		scsi_mpath_dev->index);
> +
> +	sdkp->index = -1;
> +	return 0;
> +
> +out_free_index:
> +	ida_free(&sd_index_ida, sd_mpath_disk->disk_index);
> +out_put_disk:
> +	mpath_put_disk(sd_mpath_disk->mpath_disk);
> +out_free_disk:
> +	kfree(sd_mpath_disk);
> +out_unlock:
> +	mutex_unlock(&sd_mpath_disks_lock);
> +	return error;
> +}


  reply	other threads:[~2026-03-10  2:41 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-25 15:36 [PATCH 00/24] Native SCSI multipath support John Garry
2026-02-25 15:36 ` [PATCH 01/24] scsi: core: add SCSI_MAX_QUEUE_DEPTH John Garry
2026-03-03  6:52   ` Hannes Reinecke
2026-03-03  7:45     ` John Garry
2026-02-25 15:36 ` [PATCH 02/24] scsi-multipath: introduce basic SCSI device support John Garry
2026-03-02  2:16   ` Benjamin Marzinski
2026-03-02 11:33     ` John Garry
2026-03-02  2:22   ` Benjamin Marzinski
2026-03-02 11:39     ` John Garry
2026-03-03  5:39       ` Benjamin Marzinski
2026-03-03  8:01         ` Hannes Reinecke
2026-03-03 14:20           ` Benjamin Marzinski
2026-03-05 15:59           ` John Garry
2026-03-03  6:57   ` Hannes Reinecke
2026-03-03  7:45     ` John Garry
2026-02-25 15:36 ` [PATCH 03/24] scsi-multipath: introduce scsi_device head structure John Garry
2026-03-02  2:50   ` Benjamin Marzinski
2026-03-02 12:00     ` John Garry
2026-03-03  7:13   ` Hannes Reinecke
2026-03-03  7:50     ` John Garry
2026-02-25 15:36 ` [PATCH 04/24] scsi-multipath: introduce scsi_mpath_device_class John Garry
2026-03-02  2:54   ` Benjamin Marzinski
2026-03-02 12:01     ` John Garry
2026-03-03  7:16   ` Hannes Reinecke
2026-03-03 10:53     ` John Garry
2026-02-25 15:36 ` [PATCH 05/24] scsi-multipath: provide sysfs link from to scsi_device John Garry
2026-03-03  7:19   ` Hannes Reinecke
2026-03-03 10:49     ` John Garry
2026-02-25 15:36 ` [PATCH 06/24] scsi-multipath: support iopolicy John Garry
2026-02-25 15:36 ` [PATCH 07/24] scsi-multipath: clone each bio John Garry
2026-03-02  3:21   ` Benjamin Marzinski
2026-03-02 12:12     ` John Garry
2026-03-02 16:27       ` Benjamin Marzinski
2026-03-02 17:16         ` John Garry
2026-02-25 15:36 ` [PATCH 08/24] scsi-multipath: clear path when decide is blocked John Garry
2026-02-25 15:36 ` [PATCH 09/24] scsi-multipath: failover handling John Garry
2026-03-02  3:57   ` Benjamin Marzinski
2026-03-02 12:20     ` John Garry
2026-03-04  5:46   ` Benjamin Marzinski
2026-03-04 11:11     ` John Garry
2026-02-25 15:36 ` [PATCH 10/24] scsi-multipath: add scsi_mpath_{start,end}_request() John Garry
2026-03-02  4:08   ` Benjamin Marzinski
2026-03-02 12:20     ` John Garry
2026-03-04  6:13   ` Benjamin Marzinski
2026-03-04 11:11     ` John Garry
2026-03-05  2:37       ` Benjamin Marzinski
2026-02-25 15:36 ` [PATCH 11/24] scsi-multipath: add scsi_mpath_ioctl() John Garry
2026-02-25 15:36 ` [PATCH 12/24] scsi-multipath: provide callbacks for path state John Garry
2026-03-03  5:31   ` Benjamin Marzinski
2026-02-25 15:36 ` [PATCH 13/24] scsi-multipath: set disk device_groups John Garry
2026-02-25 15:36 ` [PATCH 14/24] scsi-multipath: add PR support John Garry
2026-02-25 15:36 ` [PATCH 15/24] scsi: sd: refactor PR ops John Garry
2026-02-25 15:36 ` [PATCH 16/24] scsi: sd: add multipath disk class John Garry
2026-02-25 15:36 ` [PATCH 17/24] scsi: sd: add sd_mpath_{start,end}_command() John Garry
2026-02-25 15:36 ` [PATCH 18/24] scsi: sd: add sd_mpath_ioctl() John Garry
2026-02-25 15:36 ` [PATCH 19/24] scsi: sd: add multipath PR support John Garry
2026-02-25 15:36 ` [PATCH 20/24] scsi: sd: add sd_mpath_to_disk() John Garry
2026-02-25 15:36 ` [PATCH 21/24] scsi: sd: support multipath disk John Garry
2026-03-10  2:40   ` Benjamin Marzinski [this message]
2026-03-10 10:12     ` John Garry
2026-03-10 15:19       ` Benjamin Marzinski
2026-02-25 15:36 ` [PATCH 22/24] scsi: sd: add mpath_dev file John Garry
2026-02-25 15:36 ` [PATCH 23/24] scsi: sd: add mpath_numa_nodes dev attribute John Garry
2026-02-25 15:36 ` [PATCH 24/24] scsi: sd: add mpath_queue_depth " John Garry

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aa-EtdqIOX603PVu@redhat.com \
    --to=bmarzins@redhat.com \
    --cc=axboe@fb.com \
    --cc=dm-devel@lists.linux.dev \
    --cc=hare@suse.com \
    --cc=hch@lst.de \
    --cc=james.bottomley@hansenpartnership.com \
    --cc=jmeneghi@redhat.com \
    --cc=john.g.garry@oracle.com \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=michael.christie@oracle.com \
    --cc=sagi@grimberg.me \
    --cc=snitzer@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox