linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] virtio-scsi driver
@ 2011-11-30 13:54 Paolo Bonzini
  2011-11-30 13:54 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
  2011-11-30 13:54 ` [PATCH 2/2] virtio-scsi: add error handling Paolo Bonzini
  0 siblings, 2 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-11-30 13:54 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-scsi; +Cc: LKML, Rusty Russell, Stefan Hajnoczi


This is the first version of the driver for the virtio-scsi.  It
implements a subset of the spec, in particular it does not implement
asynchronous notifications for either LUN reset/removal/addition or
CD-ROM media events.

Other matching bits:

- spec at http://people.redhat.com/pbonzini/virtio-spec.pdf (and just
  sent out)

- QEMU implementation at git://github.com/bonzini/qemu.git,
  branch virtio-scsi

Please review.

Paolo Bonzini (2):
  virtio-scsi: first version
  virtio-scsi: add error handling

 drivers/scsi/Kconfig       |    8 +
 drivers/scsi/Makefile      |    1 +
 drivers/scsi/virtio_scsi.c |  551 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_ids.h |    1 +
 4 files changed, 561 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/virtio_scsi.c

-- 
1.7.7.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/2] virtio-scsi: first version
  2011-11-30 13:54 [PATCH 0/2] virtio-scsi driver Paolo Bonzini
@ 2011-11-30 13:54 ` Paolo Bonzini
  2011-12-01  6:33   ` Sasha Levin
  2011-12-02 23:07   ` Benjamin Herrenschmidt
  2011-11-30 13:54 ` [PATCH 2/2] virtio-scsi: add error handling Paolo Bonzini
  1 sibling, 2 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-11-30 13:54 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-scsi; +Cc: LKML, Rusty Russell, Stefan Hajnoczi

The virtio-scsi HBA is the basis of an alternative storage stack
for QEMU-based virtual machines (including KVM).  Compared to
virtio-blk it is more scalable (because it supports many LUNs
on a single PCI slot), more powerful (it more easily supports
passthrough of host devices to the guest) and more easily
extensible (new SCSI features implemented by QEMU should not
require updating the driver in the guest).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 drivers/scsi/Kconfig       |    8 +
 drivers/scsi/Makefile      |    1 +
 drivers/scsi/virtio_scsi.c |  478 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_ids.h |    1 +
 4 files changed, 488 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/virtio_scsi.c

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 06ea3bc..3ab6ad7 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1902,6 +1902,14 @@ config SCSI_BFA_FC
 	  To compile this driver as a module, choose M here. The module will
 	  be called bfa.
 
+config SCSI_VIRTIO
+	tristate "virtio-scsi support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && VIRTIO
+	help
+          This is the virtual HBA driver for virtio.  It can be used with
+          QEMU based VMMs (like KVM or Xen).  Say Y or M.
+
+
 endif # SCSI_LOWLEVEL
 
 source "drivers/scsi/pcmcia/Kconfig"
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 2b88749..351b28b 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
 obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
 obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
 obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
+obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
 obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
 
 obj-$(CONFIG_ARM)		+= arm/
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
new file mode 100644
index 0000000..cf8732f
--- /dev/null
+++ b/drivers/scsi/virtio_scsi.c
@@ -0,0 +1,478 @@
+/*
+ * Virtio SCSI HBA driver
+ *
+ * Copyright IBM Corp. 2010
+ * Copyright Red Hat, Inc. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
+ *  Paolo Bonzini   <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
+
+#define VIRTIO_SCSI_DEBUG 0
+
+static void dbg(const char *fmt, ...)
+{
+	if (VIRTIO_SCSI_DEBUG) {
+		va_list args;
+
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+	}
+}
+
+/* Command queue element */
+struct virtio_scsi_cmd {
+	struct scsi_cmnd *sc;
+	union {
+		struct virtio_scsi_cmd_req       cmd;
+		struct virtio_scsi_ctrl_tmf_req  tmf;
+		struct virtio_scsi_ctrl_an_req   an;
+	} req;
+	union {
+		struct virtio_scsi_cmd_resp      cmd;
+		struct virtio_scsi_ctrl_tmf_resp tmf;
+		struct virtio_scsi_ctrl_an_resp  an;
+		struct virtio_scsi_event         evt;
+	} resp;
+} ____cacheline_aligned_in_smp;
+
+/* Driver instance state */
+struct virtio_scsi {
+	/* Protects cmd_vq and sg[] */
+	spinlock_t cmd_vq_lock;
+
+	struct virtio_device *vdev;
+	struct virtqueue *ctrl_vq;
+	struct virtqueue *event_vq;
+	struct virtqueue *cmd_vq;
+
+	/* For sglist construction when adding commands to the virtqueue.  */
+	struct scatterlist sg[];
+};
+
+static struct kmem_cache *virtscsi_cmd_cache;
+
+static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev)
+{
+	return vdev->priv;
+}
+
+static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
+{
+	if (!resid)
+		return;
+
+	if (!scsi_bidi_cmnd(sc)) {
+		scsi_set_resid(sc, resid);
+		return;
+	}
+
+	scsi_in(sc)->resid = min(resid, scsi_in(sc)->length);
+	scsi_out(sc)->resid = resid - scsi_in(sc)->resid;
+}
+
+/**
+ * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done
+ *
+ * Called with cmd_vq_lock held.
+ */
+static void virtscsi_complete_cmd(void *buf)
+{
+	struct virtio_scsi_cmd *cmd = buf;
+	struct scsi_cmnd *sc = cmd->sc;
+	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+
+	dbg("%s %d:%d:%d:%d cmd %p status %#02x sense_len %u\n", __func__,
+		sc->device->host->host_no, sc->device->id,
+		sc->device->channel, sc->device->lun,
+		sc, resp->status, resp->sense_len);
+
+	sc->result = resp->status;
+	virtscsi_compute_resid(sc, resp->resid);
+	switch (resp->response) {
+	case VIRTIO_SCSI_S_OK:
+		set_host_byte(sc, DID_OK);
+		break;
+	case VIRTIO_SCSI_S_UNDERRUN:
+		set_host_byte(sc, DID_BAD_TARGET);
+	case VIRTIO_SCSI_S_ABORTED:
+		set_host_byte(sc, DID_ABORT);
+		break;
+	case VIRTIO_SCSI_S_BAD_TARGET:
+		set_host_byte(sc, DID_BAD_TARGET);
+		break;
+	case VIRTIO_SCSI_S_RESET:
+		set_host_byte(sc, DID_RESET);
+		break;
+	case VIRTIO_SCSI_S_TRANSPORT_FAILURE:
+		set_host_byte(sc, DID_TRANSPORT_DISRUPTED);
+		break;
+	case VIRTIO_SCSI_S_TARGET_FAILURE:
+		set_host_byte(sc, DID_TARGET_FAILURE);
+		break;
+	case VIRTIO_SCSI_S_NEXUS_FAILURE:
+		set_host_byte(sc, DID_NEXUS_FAILURE);
+		break;
+	default:
+		scmd_printk(KERN_DEBUG, sc, "Unknown command response %d",
+			    resp->response);
+		/* fall through */
+	case VIRTIO_SCSI_S_FAILURE:
+		set_host_byte(sc, DID_ERROR);
+		break;
+	}
+
+	WARN_ON(resp->sense_len > VIRTIO_SCSI_SENSE_SIZE);
+	if (sc->sense_buffer) {
+		memcpy(sc->sense_buffer, resp->sense,
+		       min_t(u32, resp->sense_len, VIRTIO_SCSI_SENSE_SIZE));
+		if (resp->sense_len)
+			set_driver_byte(sc, DRIVER_SENSE);
+	}
+
+	kmem_cache_free(virtscsi_cmd_cache, cmd);
+	sc->scsi_done(sc);
+}
+
+static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+{
+	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	void *buf;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vscsi->cmd_vq_lock, flags);
+
+	do {
+		virtqueue_disable_cb(vq);
+		while ((buf = virtqueue_get_buf(vq, &len)) != NULL) {
+			fn(buf);
+		}
+	} while (!virtqueue_enable_cb(vq));
+
+	spin_unlock_irqrestore(&vscsi->cmd_vq_lock, flags);
+}
+
+static void virtscsi_cmd_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_cmd);
+};
+
+/* These are still stubs.  */
+static void virtscsi_complete_free(void *buf)
+{
+	struct virtio_scsi_cmd *cmd = buf;
+
+	kmem_cache_free(virtscsi_cmd_cache, cmd);
+}
+
+static void virtscsi_ctrl_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_free);
+};
+
+static void virtscsi_event_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_free);
+};
+
+/**
+ * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
+ * @vscsi	: virtio_scsi state
+ * @sc		: command to be mapped
+ * @cmd		: command structure
+ * @out_num	: number of read-only elements
+ * @in_num	: number of write-only elements
+ *
+ * Called with cmd_vq_lock held.
+ */
+static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
+			     struct scsi_data_buffer *sdb)
+{
+	struct sg_table *table = &sdb->table;
+	struct scatterlist *sg_elem;
+	unsigned int idx = *p_idx;
+	int i;
+
+	for_each_sg(table->sgl, sg_elem, table->nents, i)
+		sg_set_buf(&sg[idx++], sg_virt(sg_elem), sg_elem->length);
+
+	*p_idx = idx;
+}
+
+static void virtscsi_map_cmd(struct virtio_scsi *vscsi, struct virtqueue *vq,
+			     struct virtio_scsi_cmd *cmd, unsigned *out_num,
+			     unsigned *in_num)
+{
+	struct scsi_cmnd *sc = cmd->sc;
+	struct scatterlist *sg = vscsi->sg;
+	unsigned int idx = 0;
+
+	if (sc) {
+		struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
+		BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize);
+
+		/* TODO: check feature bit and fail if unsupported?  */
+		BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL);
+	}
+
+	/* Request header.  */
+	sg_set_buf(&sg[idx++], &cmd->req.cmd, sizeof(cmd->req.cmd));
+
+	/* Data-out buffer.  */
+	if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
+		virtscsi_map_sgl(sg, &idx, scsi_out(sc));
+
+	*out_num = idx;
+
+	/* Response header.  */
+	sg_set_buf(&sg[idx++], &cmd->resp, sizeof(cmd->resp));
+
+	/* Data-in buffer */
+	if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
+		virtscsi_map_sgl(sg, &idx, scsi_in(sc));
+
+	*in_num = idx - *out_num;
+}
+
+static int virtscsi_kick_cmd(struct virtio_scsi *vscsi, struct virtqueue *vq,
+			     struct virtio_scsi_cmd *cmd)
+{
+	unsigned int out_num, in_num;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&vscsi->cmd_vq_lock, flags);
+
+	virtscsi_map_cmd(vscsi, vq, cmd, &out_num, &in_num);
+
+	ret = virtqueue_add_buf(vq, vscsi->sg, out_num, in_num, cmd);
+	if (ret >= 0)
+		virtqueue_kick(vq);
+
+	spin_unlock_irqrestore(&vscsi->cmd_vq_lock, flags);
+	return ret;
+}
+
+static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_cmd *cmd;
+	int ret;
+
+	dbg("%s %d:%d:%d:%d got cmd %p CDB: %#02x\n", __func__,
+		sc->device->host->host_no, sc->device->id,
+		sc->device->channel, sc->device->lun,
+		sc, sc->cmnd[0]);
+
+	ret = SCSI_MLQUEUE_HOST_BUSY;
+	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
+	if (!cmd)
+		goto out;
+
+	cmd->sc = sc;
+	cmd->req.cmd = (struct virtio_scsi_cmd_req){
+		.lun[0] = 1,
+		.lun[1] = sc->device->id,
+		.lun[2] = (sc->device->lun >> 8) | 0x40,
+		.lun[3] = sc->device->lun & 0xff,
+		.tag = (__u64)sc,
+		.task_attr = VIRTIO_SCSI_S_SIMPLE,
+		.prio = 0,
+		.crn = 0,
+	};
+
+	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
+	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
+
+	if (virtscsi_kick_cmd(vscsi, vscsi->cmd_vq, cmd) >= 0)
+		ret = 0;
+
+out:
+	return ret;
+}
+
+static struct scsi_host_template virtscsi_host_template = {
+	.module = THIS_MODULE,
+	.name = "Virtio SCSI HBA",
+	.proc_name = "virtio_scsi",
+	.queuecommand = virtscsi_queuecommand,
+	.this_id = -1,
+
+	.can_queue = 1024,
+	.max_sectors = 0xFFFF,
+	.dma_boundary = UINT_MAX,
+	.use_clustering = ENABLE_CLUSTERING,
+	.cmd_per_lun = 1,
+};
+
+#define virtscsi_config_get(vdev, fld) \
+	({ \
+		typeof(((struct virtio_scsi_config *)0)->fld) __val; \
+		vdev->config->get(vdev, \
+				  offsetof(struct virtio_scsi_config, fld), \
+				  &__val, sizeof(__val)); \
+		__val; \
+	})
+
+#define virtscsi_config_set(vdev, fld, val) \
+	(void)({ \
+		typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \
+		vdev->config->set(vdev, \
+				  offsetof(struct virtio_scsi_config, fld), \
+				  &__val, sizeof(__val)); \
+	})
+
+static int __devinit virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi)
+{
+	int err;
+	struct virtqueue *vqs[3];
+	vq_callback_t *callbacks[] = {
+		virtscsi_ctrl_done,
+		virtscsi_event_done,
+		virtscsi_cmd_done
+	};
+	const char *names[] = {
+		"control",
+		"event",
+		"command"
+	};
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+	if (err)
+		return err;
+
+	vscsi->ctrl_vq = vqs[0];
+	vscsi->event_vq = vqs[1];
+	vscsi->cmd_vq = vqs[2];
+
+	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
+	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
+	return 0;
+}
+
+static int __devinit virtscsi_probe(struct virtio_device *vdev)
+{
+	struct Scsi_Host *shost;
+	struct virtio_scsi *vscsi;
+	int err;
+	u32 sg_elems;
+
+	/* We need to know how many segments before we allocate.
+         * We need an extra sg elements at head and tail.
+	 */
+	sg_elems = virtscsi_config_get(vdev, seg_max);
+
+	dbg(KERN_ALERT "virtio-scsi sg_elems %d", __func__, sg_elems);
+
+	/* Allocate memory and link the structs together.  */
+        shost = scsi_host_alloc(&virtscsi_host_template,
+		sizeof(*vscsi) + sizeof(vscsi->sg[0]) * (sg_elems + 2));
+
+	if (!shost)
+		return -ENOMEM;
+
+	shost->sg_tablesize = sg_elems;
+	vscsi = shost_priv(shost);
+	vscsi->vdev = vdev;
+	vdev->priv = shost;
+
+	/* Random initializations.  */
+	spin_lock_init(&vscsi->cmd_vq_lock);
+	sg_init_table(vscsi->sg, sg_elems + 2);
+
+	err = virtscsi_init(vdev, vscsi);
+	if (err)
+		goto virtio_init_failed;
+
+	shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1;
+	shost->max_id = virtscsi_config_get(vdev, max_target) + 1;
+	shost->max_channel = 0;
+	shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
+	err = scsi_add_host(shost, &vdev->dev);
+	if (err)
+		goto scsi_add_host_failed;
+
+	scsi_scan_host(shost);
+
+	return 0;
+
+scsi_add_host_failed:
+	vdev->config->del_vqs(vdev);
+virtio_init_failed:
+	scsi_host_put(shost);
+	return err;
+}
+
+static void __devexit virtscsi_remove_vqs(struct virtio_device *vdev)
+{
+	/* Stop all the virtqueues. */
+	vdev->config->reset(vdev);
+
+	vdev->config->del_vqs(vdev);
+}
+
+static void __devexit virtscsi_remove(struct virtio_device *vdev)
+{
+	struct Scsi_Host *shost = virtio_scsi_host(vdev);
+
+	scsi_remove_host(shost);
+
+	virtscsi_remove_vqs(vdev);
+	scsi_host_put(shost);
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static struct virtio_driver virtio_scsi_driver = {
+	.driver.name = KBUILD_MODNAME,
+	.driver.owner = THIS_MODULE,
+	.id_table = id_table,
+	.probe = virtscsi_probe,
+	.remove = __devexit_p(virtscsi_remove),
+};
+
+static int __init init(void)
+{
+	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
+	if (!virtscsi_cmd_cache) {
+		printk(KERN_ERR "kmem_cache_create() for "
+				"virtscsi_cmd_cache failed\n");
+		return -ENOMEM;
+	}
+
+	return register_virtio_driver(&virtio_scsi_driver);
+}
+
+static void __exit fini(void)
+{
+	unregister_virtio_driver(&virtio_scsi_driver);
+	kmem_cache_destroy(virtscsi_cmd_cache);
+}
+module_init(init);
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio SCSI HBA driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
index 85bb0bb..fee54c4 100644
--- a/include/linux/virtio_ids.h
+++ b/include/linux/virtio_ids.h
@@ -34,6 +34,7 @@
 #define VIRTIO_ID_CONSOLE	3 /* virtio console */
 #define VIRTIO_ID_RNG		4 /* virtio ring */
 #define VIRTIO_ID_BALLOON	5 /* virtio balloon */
+#define VIRTIO_ID_SCSI		7 /* virtio scsi */
 #define VIRTIO_ID_9P		9 /* 9p virtio console */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
-- 
1.7.7.1



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/2] virtio-scsi: add error handling
  2011-11-30 13:54 [PATCH 0/2] virtio-scsi driver Paolo Bonzini
  2011-11-30 13:54 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
@ 2011-11-30 13:54 ` Paolo Bonzini
  1 sibling, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-11-30 13:54 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-scsi; +Cc: LKML, Rusty Russell, Stefan Hajnoczi

This commit adds basic error handling to the virtio-scsi
HBA device.  Task management functions are sent synchronously
via the control virtqueue.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 drivers/scsi/virtio_scsi.c |   75 +++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 74 insertions(+), 1 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index cf8732f..35ab0ef 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -36,6 +36,7 @@ static void dbg(const char *fmt, ...)
 /* Command queue element */
 struct virtio_scsi_cmd {
 	struct scsi_cmnd *sc;
+	struct completion *comp;
 	union {
 		struct virtio_scsi_cmd_req       cmd;
 		struct virtio_scsi_ctrl_tmf_req  tmf;
@@ -172,11 +173,12 @@ static void virtscsi_cmd_done(struct virtqueue *vq)
 	virtscsi_vq_done(vq, virtscsi_complete_cmd);
 };
 
-/* These are still stubs.  */
 static void virtscsi_complete_free(void *buf)
 {
 	struct virtio_scsi_cmd *cmd = buf;
 
+	if (cmd->comp)
+		complete_all(cmd->comp);
 	kmem_cache_free(virtscsi_cmd_cache, cmd);
 }
 
@@ -306,12 +308,83 @@ out:
 	return ret;
 }
 
+static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
+{
+	DECLARE_COMPLETION_ONSTACK(comp);
+	int ret;
+
+	cmd->comp = &comp;
+	ret = virtscsi_kick_cmd(vscsi, vscsi->ctrl_vq, cmd);
+	if (ret < 0)
+		return FAILED;
+
+	wait_for_completion(&comp);
+	if (cmd->resp.tmf.response != VIRTIO_SCSI_S_OK &&
+	    cmd->resp.tmf.response != VIRTIO_SCSI_S_FUNCTION_SUCCEEDED)
+		return FAILED;
+
+	return SUCCESS;
+}
+
+static int virtscsi_device_reset(struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sc->device->host);
+	struct virtio_scsi_cmd *cmd;
+
+	dbg("%s %d:%d:%d:%d got device reset", __func__,
+		sc->device->host->host_no, sc->device->id,
+		sc->device->channel, sc->device->lun);
+
+	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
+	if (!cmd)
+		return FAILED;
+
+	cmd->sc = sc;
+	cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
+		.type = VIRTIO_SCSI_T_TMF,
+		.subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET,
+		.lun[0] = 1,
+		.lun[1] = sc->device->id,
+		.lun[2] = (sc->device->lun >> 8) | 0x40,
+		.lun[3] = sc->device->lun & 0xff,
+	};
+	return virtscsi_tmf(vscsi, cmd);
+}
+
+static int virtscsi_abort(struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sc->device->host);
+	struct virtio_scsi_cmd *cmd;
+
+	dbg("%s %d:%d:%d:%d got abort", __func__,
+		sc->device->host->host_no, sc->device->id,
+		sc->device->channel, sc->device->lun);
+
+	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
+	if (!cmd)
+		return FAILED;
+
+	cmd->sc = sc;
+	cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
+		.type = VIRTIO_SCSI_T_TMF,
+		.subtype = VIRTIO_SCSI_T_TMF_ABORT_TASK,
+		.lun[0] = 1,
+		.lun[1] = sc->device->id,
+		.lun[2] = (sc->device->lun >> 8) | 0x40,
+		.lun[3] = sc->device->lun & 0xff,
+		.tag = (__u64)sc,
+	};
+	return virtscsi_tmf(vscsi, cmd);
+}
+
 static struct scsi_host_template virtscsi_host_template = {
 	.module = THIS_MODULE,
 	.name = "Virtio SCSI HBA",
 	.proc_name = "virtio_scsi",
 	.queuecommand = virtscsi_queuecommand,
 	.this_id = -1,
+	.eh_abort_handler               = virtscsi_abort,
+	.eh_device_reset_handler        = virtscsi_device_reset,
 
 	.can_queue = 1024,
 	.max_sectors = 0xFFFF,
-- 
1.7.7.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-11-30 13:54 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
@ 2011-12-01  6:33   ` Sasha Levin
  2011-12-01  8:36     ` Paolo Bonzini
  2011-12-02 23:07   ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-12-01  6:33 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Michael S. Tsirkin, linux-scsi, LKML, Rusty Russell,
	Stefan Hajnoczi

On Wed, 2011-11-30 at 14:54 +0100, Paolo Bonzini wrote:
> The virtio-scsi HBA is the basis of an alternative storage stack
> for QEMU-based virtual machines (including KVM).  Compared to
> virtio-blk it is more scalable, because it supports many LUNs
> on a single PCI slot), more powerful (it more easily supports
> passthrough of host devices to the guest) and more easily
> extensible (new SCSI features implemented by QEMU should not
> require updating the driver in the guest).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  drivers/scsi/Kconfig       |    8 +
>  drivers/scsi/Makefile      |    1 +
>  drivers/scsi/virtio_scsi.c |  478 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/virtio_ids.h |    1 +
  include/linux/virtio_scsi.h is missing here.

>  4 files changed, 488 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/scsi/virtio_scsi.c
> 
> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
> index 06ea3bc..3ab6ad7 100644
> --- a/drivers/scsi/Kconfig
> +++ b/drivers/scsi/Kconfig
> @@ -1902,6 +1902,14 @@ config SCSI_BFA_FC
>  	  To compile this driver as a module, choose M here. The module will
>  	  be called bfa.
>  
> +config SCSI_VIRTIO
> +	tristate "virtio-scsi support (EXPERIMENTAL)"
> +	depends on EXPERIMENTAL && VIRTIO
> +	help
> +          This is the virtual HBA driver for virtio.  It can be used with
> +          QEMU based VMMs (like KVM or Xen).  Say Y or M.

QEMU based? What about non-QEMU based? :)

> +
> +
>  endif # SCSI_LOWLEVEL
>  
>  source "drivers/scsi/pcmcia/Kconfig"
> diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
> index 2b88749..351b28b 100644
> --- a/drivers/scsi/Makefile
> +++ b/drivers/scsi/Makefile
> @@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
>  obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
>  obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
>  obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
> +obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
>  obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
>  
>  obj-$(CONFIG_ARM)		+= arm/
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> new file mode 100644
> index 0000000..cf8732f
> --- /dev/null
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -0,0 +1,478 @@
> +/*
> + * Virtio SCSI HBA driver
> + *
> + * Copyright IBM Corp. 2010
> + * Copyright Red Hat, Inc. 2011
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *  Paolo Bonzini   <pbonzini@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_scsi.h>
> +#include <scsi/scsi_host.h>
> +#include <scsi/scsi_device.h>
> +#include <scsi/scsi_cmnd.h>
> +
> +#define VIRTIO_SCSI_DEBUG 0

Maybe this should be a CONFIG_ instead which could be selected when
building the kernel?

> +
> +static void dbg(const char *fmt, ...)
> +{
> +	if (VIRTIO_SCSI_DEBUG) {
> +		va_list args;
> +
> +		va_start(args, fmt);
> +		vprintk(fmt, args);
> +		va_end(args);
> +	}
> +}

Or possibly switch from dbg() here to dev_dbg() which would let you use
standard kernel interfaces to enable/disable debugging.

printk()s below should probably be dev_warn/dev_err() or sdev_*()
variants.

[snip]

> +static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
> +{
> +	struct virtio_scsi *vscsi = shost_priv(sh);
> +	struct virtio_scsi_cmd *cmd;
> +	int ret;
> +
> +	dbg("%s %d:%d:%d:%d got cmd %p CDB: %#02x\n", __func__,
> +		sc->device->host->host_no, sc->device->id,
> +		sc->device->channel, sc->device->lun,
> +		sc, sc->cmnd[0]);
> +
> +	ret = SCSI_MLQUEUE_HOST_BUSY;
> +	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);

GFP_ATOMIC? Not GFP_KERNEL?

> +	if (!cmd)
> +		goto out;
> +
> +	cmd->sc = sc;
> +	cmd->req.cmd = (struct virtio_scsi_cmd_req){
> +		.lun[0] = 1,
> +		.lun[1] = sc->device->id,
> +		.lun[2] = (sc->device->lun >> 8) | 0x40,
> +		.lun[3] = sc->device->lun & 0xff,
> +		.tag = (__u64)sc,
> +		.task_attr = VIRTIO_SCSI_S_SIMPLE,
> +		.prio = 0,
> +		.crn = 0,
> +	};
> +
> +	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
> +	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
> +
> +	if (virtscsi_kick_cmd(vscsi, vscsi->cmd_vq, cmd) >= 0)
> +		ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
> +static struct scsi_host_template virtscsi_host_template = {
> +	.module = THIS_MODULE,
> +	.name = "Virtio SCSI HBA",
> +	.proc_name = "virtio_scsi",
> +	.queuecommand = virtscsi_queuecommand,
> +	.this_id = -1,
> +
> +	.can_queue = 1024,
> +	.max_sectors = 0xFFFF,
> +	.dma_boundary = UINT_MAX,
> +	.use_clustering = ENABLE_CLUSTERING,
> +	.cmd_per_lun = 1,
> +};
> +
> +#define virtscsi_config_get(vdev, fld) \
> +	({ \
> +		typeof(((struct virtio_scsi_config *)0)->fld) __val; \
> +		vdev->config->get(vdev, \
> +				  offsetof(struct virtio_scsi_config, fld), \
> +				  &__val, sizeof(__val)); \
> +		__val; \
> +	})
> +
> +#define virtscsi_config_set(vdev, fld, val) \
> +	(void)({ \
> +		typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \
> +		vdev->config->set(vdev, \
> +				  offsetof(struct virtio_scsi_config, fld), \
> +				  &__val, sizeof(__val)); \
> +	})
> +
> +static int __devinit virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi)
> +{
> +	int err;
> +	struct virtqueue *vqs[3];
> +	vq_callback_t *callbacks[] = {
> +		virtscsi_ctrl_done,
> +		virtscsi_event_done,
> +		virtscsi_cmd_done
> +	};

The spec is talking about multiple request vqs, while here they are
limited to one. Does adding multiple request vqs really speed things up?
How was it tested without being supported by the driver?

> +	const char *names[] = {
> +		"control",
> +		"event",
> +		"command"

The spec calls them "control", "event" and "request". We should keep the
same names as the spec here and in variable names used int the code
same names as the spec here and in variable names used in the code
('cmd_vq' should probably be 'req_vq' or something similar).

> +	};
> +
> +	/* Discover virtqueues and write information to configuration.  */
> +	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
> +	if (err)
> +		return err;
> +
> +	vscsi->ctrl_vq = vqs[0];
> +	vscsi->event_vq = vqs[1];
> +	vscsi->cmd_vq = vqs[2];
> +
> +	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
> +	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
> +	return 0;
> +}
> +
> +static int __devinit virtscsi_probe(struct virtio_device *vdev)
> +{
> +	struct Scsi_Host *shost;
> +	struct virtio_scsi *vscsi;
> +	int err;
> +	u32 sg_elems;
> +
> +	/* We need to know how many segments before we allocate.
> +         * We need an extra sg elements at head and tail.
> +	 */
> +	sg_elems = virtscsi_config_get(vdev, seg_max);

Maybe default to one if not specified (=0), like in virtio-blk.

> +
> +	dbg(KERN_ALERT "virtio-scsi sg_elems %d", __func__, sg_elems);
> +
> +	/* Allocate memory and link the structs together.  */
> +        shost = scsi_host_alloc(&virtscsi_host_template,
> +		sizeof(*vscsi) + sizeof(vscsi->sg[0]) * (sg_elems + 2));
> +
> +	if (!shost)
> +		return -ENOMEM;
> +
> +	shost->sg_tablesize = sg_elems;
> +	vscsi = shost_priv(shost);
> +	vscsi->vdev = vdev;
> +	vdev->priv = shost;
> +
> +	/* Random initializations.  */
> +	spin_lock_init(&vscsi->cmd_vq_lock);
> +	sg_init_table(vscsi->sg, sg_elems + 2);
> +
> +	err = virtscsi_init(vdev, vscsi);
> +	if (err)
> +		goto virtio_init_failed;
> +
> +	shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1;
> +	shost->max_id = virtscsi_config_get(vdev, max_target) + 1;
> +	shost->max_channel = 0;
> +	shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
> +	err = scsi_add_host(shost, &vdev->dev);
> +	if (err)
> +		goto scsi_add_host_failed;
> +
> +	scsi_scan_host(shost);
> +
> +	return 0;
> +
> +scsi_add_host_failed:
> +	vdev->config->del_vqs(vdev);
> +virtio_init_failed:
> +	scsi_host_put(shost);
> +	return err;
> +}
> +
> +static void __devexit virtscsi_remove_vqs(struct virtio_device *vdev)
> +{
> +	/* Stop all the virtqueues. */
> +	vdev->config->reset(vdev);
> +
> +	vdev->config->del_vqs(vdev);
> +}
> +
> +static void __devexit virtscsi_remove(struct virtio_device *vdev)
> +{
> +	struct Scsi_Host *shost = virtio_scsi_host(vdev);
> +
> +	scsi_remove_host(shost);
> +
> +	virtscsi_remove_vqs(vdev);
> +	scsi_host_put(shost);
> +}
> +
> +static struct virtio_device_id id_table[] = {
> +	{ VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
> +	{ 0 },
> +};
> +
> +static struct virtio_driver virtio_scsi_driver = {
> +	.driver.name = KBUILD_MODNAME,
> +	.driver.owner = THIS_MODULE,
> +	.id_table = id_table,
> +	.probe = virtscsi_probe,
> +	.remove = __devexit_p(virtscsi_remove),
> +};
> +
> +static int __init init(void)
> +{
> +	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);

Shouldn't these kmemcaches be per-device and not globally shared between
all devices?

> +	if (!virtscsi_cmd_cache) {
> +		printk(KERN_ERR "kmem_cache_create() for "
> +				"virtscsi_cmd_cache failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	return register_virtio_driver(&virtio_scsi_driver);
> +}
> +
> +static void __exit fini(void)
> +{
> +	unregister_virtio_driver(&virtio_scsi_driver);
> +	kmem_cache_destroy(virtscsi_cmd_cache);
> +}
> +module_init(init);
> +module_exit(fini);
> +
> +MODULE_DEVICE_TABLE(virtio, id_table);
> +MODULE_DESCRIPTION("Virtio SCSI HBA driver");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
> index 85bb0bb..fee54c4 100644
> --- a/include/linux/virtio_ids.h
> +++ b/include/linux/virtio_ids.h
> @@ -34,6 +34,7 @@
>  #define VIRTIO_ID_CONSOLE	3 /* virtio console */
>  #define VIRTIO_ID_RNG		4 /* virtio ring */
>  #define VIRTIO_ID_BALLOON	5 /* virtio balloon */
> +#define VIRTIO_ID_SCSI		7 /* virtio scsi */
>  #define VIRTIO_ID_9P		9 /* 9p virtio console */
>  
>  #endif /* _LINUX_VIRTIO_IDS_H */

-- 

Sasha.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-01  6:33   ` Sasha Levin
@ 2011-12-01  8:36     ` Paolo Bonzini
  2011-12-01  8:55       ` Sasha Levin
  2011-12-02  0:29       ` Rusty Russell
  0 siblings, 2 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-01  8:36 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, linux-scsi, LKML, Rusty Russell,
	Stefan Hajnoczi

On 12/01/2011 07:33 AM, Sasha Levin wrote:
> On Wed, 2011-11-30 at 14:54 +0100, Paolo Bonzini wrote:
>> The virtio-scsi HBA is the basis of an alternative storage stack
>> for QEMU-based virtual machines (including KVM).  Compared to
>> virtio-blk it is more scalable, because it supports many LUNs
>> on a single PCI slot), more powerful (it more easily supports
>> passthrough of host devices to the guest) and more easily
>> extensible (new SCSI features implemented by QEMU should not
>> require updating the driver in the guest).
>>
>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
>> ---
>>   drivers/scsi/Kconfig       |    8 +
>>   drivers/scsi/Makefile      |    1 +
>>   drivers/scsi/virtio_scsi.c |  478 ++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/virtio_ids.h |    1 +
>    include/linux/virtio_scsi.h is missing here.
>
>>   4 files changed, 488 insertions(+), 0 deletions(-)
>>   create mode 100644 drivers/scsi/virtio_scsi.c
>>
>> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
>> index 06ea3bc..3ab6ad7 100644
>> --- a/drivers/scsi/Kconfig
>> +++ b/drivers/scsi/Kconfig
>> @@ -1902,6 +1902,14 @@ config SCSI_BFA_FC
>>   	  To compile this driver as a module, choose M here. The module will
>>   	  be called bfa.
>>
>> +config SCSI_VIRTIO
>> +	tristate "virtio-scsi support (EXPERIMENTAL)"
>> +	depends on EXPERIMENTAL&&  VIRTIO
>> +	help
>> +          This is the virtual HBA driver for virtio.  It can be used with
>> +          QEMU based VMMs (like KVM or Xen).  Say Y or M.
>
> QEMU based? What about non-QEMU based? :)

You should first change VIRTIO_BLK. :)  (Just kidding, point taken).

>> +
>> +static void dbg(const char *fmt, ...)
>> +{
>> +	if (VIRTIO_SCSI_DEBUG) {
>> +		va_list args;
>> +
>> +		va_start(args, fmt);
>> +		vprintk(fmt, args);
>> +		va_end(args);
>> +	}
>> +}
>
> Or possibly switch from dbg() here to dev_dbg() which would let you use
> standard kernel interfaces to enable/disable debugging.

I changed to sdev_printk/scmd_printk everywhere.

>> +static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
>> +{
>> +	struct virtio_scsi *vscsi = shost_priv(sh);
>> +	struct virtio_scsi_cmd *cmd;
>> +	int ret;
>> +
>> +	dbg("%s %d:%d:%d:%d got cmd %p CDB: %#02x\n", __func__,
>> +		sc->device->host->host_no, sc->device->id,
>> +		sc->device->channel, sc->device->lun,
>> +		sc, sc->cmnd[0]);
>> +
>> +	ret = SCSI_MLQUEUE_HOST_BUSY;
>> +	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
>
> GFP_ATOMIC? Not GFP_KERNEL?

Done.

>> +static int __devinit virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi)
>> +{
>> +	int err;
>> +	struct virtqueue *vqs[3];
>> +	vq_callback_t *callbacks[] = {
>> +		virtscsi_ctrl_done,
>> +		virtscsi_event_done,
>> +		virtscsi_cmd_done
>> +	};
>
> The spec is talking about multiple request vqs, while here they are
> limited to one. Is adding multiple request vqs really speeds things up?

The main advantage is having multiple MSI-X vectors.  It's quite common 
in physical HBAs.

> How was it tested without being supported by the driver?

The driver code was just a hack and not good for upstream.

>> +	const char *names[] = {
>> +		"control",
>> +		"event",
>> +		"command"
>
> The spec calls them "control", "event" and "request". We should keep the
> same names as the spec here and in variable names used in the code
> ('cmd_vq' should probably be 'req_vq' or something similar).

Will fix.  The lock is also protecting more than the req_vq.

>> +	};
>> +
>> +	/* Discover virtqueues and write information to configuration.  */
>> +	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
>> +	if (err)
>> +		return err;
>> +
>> +	vscsi->ctrl_vq = vqs[0];
>> +	vscsi->event_vq = vqs[1];
>> +	vscsi->cmd_vq = vqs[2];
>> +
>> +	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
>> +	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
>> +	return 0;
>> +}
>> +
>> +static int __devinit virtscsi_probe(struct virtio_device *vdev)
>> +{
>> +	struct Scsi_Host *shost;
>> +	struct virtio_scsi *vscsi;
>> +	int err;
>> +	u32 sg_elems;
>> +
>> +	/* We need to know how many segments before we allocate.
>> +         * We need an extra sg elements at head and tail.
>> +	 */
>> +	sg_elems = virtscsi_config_get(vdev, seg_max);
>
> Maybe default to one if not specified (=0), like in virtio-blk.

Good idea.  Though with sg_elems=1 it is insanely slow.

>> +
>> +	dbg(KERN_ALERT "virtio-scsi sg_elems %d", __func__, sg_elems);
>> +
>> +	/* Allocate memory and link the structs together.  */
>> +        shost = scsi_host_alloc(&virtscsi_host_template,
>> +		sizeof(*vscsi) + sizeof(vscsi->sg[0]) * (sg_elems + 2));
>> +
>> +	if (!shost)
>> +		return -ENOMEM;
>> +
>> +	shost->sg_tablesize = sg_elems;
>> +	vscsi = shost_priv(shost);
>> +	vscsi->vdev = vdev;
>> +	vdev->priv = shost;
>> +
>> +	/* Random initializations.  */
>> +	spin_lock_init(&vscsi->cmd_vq_lock);
>> +	sg_init_table(vscsi->sg, sg_elems + 2);
>> +
>> +	err = virtscsi_init(vdev, vscsi);
>> +	if (err)
>> +		goto virtio_init_failed;
>> +
>> +	shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1;
>> +	shost->max_id = virtscsi_config_get(vdev, max_target) + 1;
>> +	shost->max_channel = 0;
>> +	shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
>> +	err = scsi_add_host(shost,&vdev->dev);
>> +	if (err)
>> +		goto scsi_add_host_failed;
>> +
>> +	scsi_scan_host(shost);
>> +
>> +	return 0;
>> +
>> +scsi_add_host_failed:
>> +	vdev->config->del_vqs(vdev);
>> +virtio_init_failed:
>> +	scsi_host_put(shost);
>> +	return err;
>> +}
>> +
>> +static void __devexit virtscsi_remove_vqs(struct virtio_device *vdev)
>> +{
>> +	/* Stop all the virtqueues. */
>> +	vdev->config->reset(vdev);
>> +
>> +	vdev->config->del_vqs(vdev);
>> +}
>> +
>> +static void __devexit virtscsi_remove(struct virtio_device *vdev)
>> +{
>> +	struct Scsi_Host *shost = virtio_scsi_host(vdev);
>> +
>> +	scsi_remove_host(shost);
>> +
>> +	virtscsi_remove_vqs(vdev);
>> +	scsi_host_put(shost);
>> +}
>> +
>> +static struct virtio_device_id id_table[] = {
>> +	{ VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
>> +	{ 0 },
>> +};
>> +
>> +static struct virtio_driver virtio_scsi_driver = {
>> +	.driver.name = KBUILD_MODNAME,
>> +	.driver.owner = THIS_MODULE,
>> +	.id_table = id_table,
>> +	.probe = virtscsi_probe,
>> +	.remove = __devexit_p(virtscsi_remove),
>> +};
>> +
>> +static int __init init(void)
>> +{
>> +	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
>
> Shouldn't these kmemcaches be per-device and not globally shared between
> all devices?

In practice it will be rare (and it's part of the design) to have more 
than one virtio-scsi device (perhaps two: one for passthrough and one 
for other block devices).  If the kmemcaches are a bottleneck, what you 
want is making them per-virtqueue.  Fixing it is simple if it turns out 
to be a problem, and it is simpler if I do it together with multi-vq 
support.

Thanks for the review.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-01  8:36     ` Paolo Bonzini
@ 2011-12-01  8:55       ` Sasha Levin
  2011-12-02  0:29       ` Rusty Russell
  1 sibling, 0 replies; 18+ messages in thread
From: Sasha Levin @ 2011-12-01  8:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Michael S. Tsirkin, linux-scsi, LKML, Rusty Russell,
	Stefan Hajnoczi

On Thu, 2011-12-01 at 09:36 +0100, Paolo Bonzini wrote:
> On 12/01/2011 07:33 AM, Sasha Levin wrote:
> > On Wed, 2011-11-30 at 14:54 +0100, Paolo Bonzini wrote:
> >> The virtio-scsi HBA is the basis of an alternative storage stack
> >> for QEMU-based virtual machines (including KVM).  Compared to
> >> virtio-blk it is more scalable, because it supports many LUNs
> >> on a single PCI slot), more powerful (it more easily supports
> >> passthrough of host devices to the guest) and more easily
> >> extensible (new SCSI features implemented by QEMU should not
> >> require updating the driver in the guest).
> >>
> >> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> >> ---
> >>   drivers/scsi/Kconfig       |    8 +
> >>   drivers/scsi/Makefile      |    1 +
> >>   drivers/scsi/virtio_scsi.c |  478 ++++++++++++++++++++++++++++++++++++++++++++
> >>   include/linux/virtio_ids.h |    1 +
> >    include/linux/virtio_scsi.h is missing here.

I was actually hoping you could send the .h so I could do some hacking
on it :)

> > Maybe default to one if not specified (=0), like in virtio-blk.
> 
> Good idea.  Though with sg_elems=1 it is insanely slow.

It's either that or fail on sg_elems=0, since currently if it's 0 it
looks like the failure will be non-obvious.

> > Shouldn't these kmemcaches be per-device and not globally shared between
> > all devices?
> 
> In practice it will be rare (and it's part of the design) to have more 
> than one virtio-scsi device (perhaps two: one for passthrough and one 
> for other block devices).  If the kmemcaches are a bottleneck, what you 
> want is making them per-virtqueue.  Fixing it is simple if it turns out 
> to be a problem, and it is simpler if I do it together with multi-vq 
> support.

I guess we should just remember to test that when multi-vq support is
added.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-01  8:36     ` Paolo Bonzini
  2011-12-01  8:55       ` Sasha Levin
@ 2011-12-02  0:29       ` Rusty Russell
  1 sibling, 0 replies; 18+ messages in thread
From: Rusty Russell @ 2011-12-02  0:29 UTC (permalink / raw)
  To: Paolo Bonzini, Sasha Levin
  Cc: Michael S. Tsirkin, linux-scsi, LKML, Stefan Hajnoczi

On Thu, 01 Dec 2011 09:36:51 +0100, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 12/01/2011 07:33 AM, Sasha Levin wrote:
> > On Wed, 2011-11-30 at 14:54 +0100, Paolo Bonzini wrote:
> >> +	/* We need to know how many segments before we allocate.
> >> +         * We need an extra sg elements at head and tail.
> >> +	 */
> >> +	sg_elems = virtscsi_config_get(vdev, seg_max);
> >
> > Maybe default to one if not specified (=0), like in virtio-blk.
> 
> Good idea.  Though with sg_elems=1 it is insanely slow.

And a bit over-conservative.  If they don't specify a max, you can pick
a number.  Make sure it fits in the ring though :)

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-11-30 13:54 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
  2011-12-01  6:33   ` Sasha Levin
@ 2011-12-02 23:07   ` Benjamin Herrenschmidt
  2011-12-03 17:38     ` Paolo Bonzini
  1 sibling, 1 reply; 18+ messages in thread
From: Benjamin Herrenschmidt @ 2011-12-02 23:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Michael S. Tsirkin, linux-scsi, LKML, Rusty Russell,
	Stefan Hajnoczi

On Wed, 2011-11-30 at 14:54 +0100, Paolo Bonzini wrote:
> The virtio-scsi HBA is the basis of an alternative storage stack
> for QEMU-based virtual machines (including KVM).  Compared to
> virtio-blk it is more scalable, because it supports many LUNs
> on a single PCI slot), more powerful (it more easily supports
> passthrough of host devices to the guest) and more easily
> extensible (new SCSI features implemented by QEMU should not
> require updating the driver in the guest).

So apologies in advance if I missed something here but it looks to me
like you are hard wiring the various parameters such as max transfer
size, command size, number of segments, etc...  (basically the queue
limits).

The max transfer size especially is the real nasty one.

This is typically the one thing you should -really- obtain from the
other side. This is the number one reason why we cannot today reliably
emulate a SCSI controller in qemu -and- pass-through the SCSI commands
to the host /dev/sg or equivalent (only full device emulation is
reliable).

This is also typically what something like virtio-scsi allows us to fix,
so let's fix it. IE. we have the ability to query the "limits" of the
real HBA /  transport on the host side and to pass them along to the
guest, which enables us to do real pass-through.

In fact, spapr-vscsi (IBM pSeries vscsi) does provide the max req side
(though currently we lack the ability to query it from the kernel on the
qemu side but that's an orthogonal problem. There are ways to do it,
more or less buggy, but we can fix that).

So I very strongly suggest that you get the protocol right -now- and
make sure it carries those informations (even if the qemu side initially
only supports emulation and hard codes the queue limits, we can fix that
later and be backward compatible with older guests).

Ben.

> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  drivers/scsi/Kconfig       |    8 +
>  drivers/scsi/Makefile      |    1 +
>  drivers/scsi/virtio_scsi.c |  478 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/virtio_ids.h |    1 +
>  4 files changed, 488 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/scsi/virtio_scsi.c
> 
> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
> index 06ea3bc..3ab6ad7 100644
> --- a/drivers/scsi/Kconfig
> +++ b/drivers/scsi/Kconfig
> @@ -1902,6 +1902,14 @@ config SCSI_BFA_FC
>  	  To compile this driver as a module, choose M here. The module will
>  	  be called bfa.
>  
> +config SCSI_VIRTIO
> +	tristate "virtio-scsi support (EXPERIMENTAL)"
> +	depends on EXPERIMENTAL && VIRTIO
> +	help
> +          This is the virtual HBA driver for virtio.  It can be used with
> +          QEMU based VMMs (like KVM or Xen).  Say Y or M.
> +
> +
>  endif # SCSI_LOWLEVEL
>  
>  source "drivers/scsi/pcmcia/Kconfig"
> diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
> index 2b88749..351b28b 100644
> --- a/drivers/scsi/Makefile
> +++ b/drivers/scsi/Makefile
> @@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
>  obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
>  obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
>  obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
> +obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
>  obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
>  
>  obj-$(CONFIG_ARM)		+= arm/
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> new file mode 100644
> index 0000000..cf8732f
> --- /dev/null
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -0,0 +1,478 @@
> +/*
> + * Virtio SCSI HBA driver
> + *
> + * Copyright IBM Corp. 2010
> + * Copyright Red Hat, Inc. 2011
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *  Paolo Bonzini   <pbonzini@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_scsi.h>
> +#include <scsi/scsi_host.h>
> +#include <scsi/scsi_device.h>
> +#include <scsi/scsi_cmnd.h>
> +
> +#define VIRTIO_SCSI_DEBUG 0
> +
> +static void dbg(const char *fmt, ...)
> +{
> +	if (VIRTIO_SCSI_DEBUG) {
> +		va_list args;
> +
> +		va_start(args, fmt);
> +		vprintk(fmt, args);
> +		va_end(args);
> +	}
> +}
> +
> +/* Command queue element */
> +struct virtio_scsi_cmd {
> +	struct scsi_cmnd *sc;
> +	union {
> +		struct virtio_scsi_cmd_req       cmd;
> +		struct virtio_scsi_ctrl_tmf_req  tmf;
> +		struct virtio_scsi_ctrl_an_req   an;
> +	} req;
> +	union {
> +		struct virtio_scsi_cmd_resp      cmd;
> +		struct virtio_scsi_ctrl_tmf_resp tmf;
> +		struct virtio_scsi_ctrl_an_resp  an;
> +		struct virtio_scsi_event         evt;
> +	} resp;
> +} ____cacheline_aligned_in_smp;
> +
> +/* Driver instance state */
> +struct virtio_scsi {
> +	/* Protects cmd_vq and sg[] */
> +	spinlock_t cmd_vq_lock;
> +
> +	struct virtio_device *vdev;
> +	struct virtqueue *ctrl_vq;
> +	struct virtqueue *event_vq;
> +	struct virtqueue *cmd_vq;
> +
> +	/* For sglist construction when adding commands to the virtqueue.  */
> +	struct scatterlist sg[];
> +};
> +
> +static struct kmem_cache *virtscsi_cmd_cache;
> +
> +static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev)
> +{
> +	return vdev->priv;
> +}
> +
> +static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
> +{
> +	if (!resid)
> +		return;
> +
> +	if (!scsi_bidi_cmnd(sc)) {
> +		scsi_set_resid(sc, resid);
> +		return;
> +	}
> +
> +	scsi_in(sc)->resid = min(resid, scsi_in(sc)->length);
> +	scsi_out(sc)->resid = resid - scsi_in(sc)->resid;
> +}
> +
> +/**
> + * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done
> + *
> + * Called with cmd_vq_lock held.
> + */
> +static void virtscsi_complete_cmd(void *buf)
> +{
> +	struct virtio_scsi_cmd *cmd = buf;
> +	struct scsi_cmnd *sc = cmd->sc;
> +	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
> +
> +	dbg("%s %d:%d:%d:%d cmd %p status %#02x sense_len %u\n", __func__,
> +		sc->device->host->host_no, sc->device->id,
> +		sc->device->channel, sc->device->lun,
> +		sc, resp->status, resp->sense_len);
> +
> +	sc->result = resp->status;
> +	virtscsi_compute_resid(sc, resp->resid);
> +	switch (resp->response) {
> +	case VIRTIO_SCSI_S_OK:
> +		set_host_byte(sc, DID_OK);
> +		break;
> +	case VIRTIO_SCSI_S_UNDERRUN:
> +		set_host_byte(sc, DID_BAD_TARGET);
> +	case VIRTIO_SCSI_S_ABORTED:
> +		set_host_byte(sc, DID_ABORT);
> +		break;
> +	case VIRTIO_SCSI_S_BAD_TARGET:
> +		set_host_byte(sc, DID_BAD_TARGET);
> +		break;
> +	case VIRTIO_SCSI_S_RESET:
> +		set_host_byte(sc, DID_RESET);
> +		break;
> +	case VIRTIO_SCSI_S_TRANSPORT_FAILURE:
> +		set_host_byte(sc, DID_TRANSPORT_DISRUPTED);
> +		break;
> +	case VIRTIO_SCSI_S_TARGET_FAILURE:
> +		set_host_byte(sc, DID_TARGET_FAILURE);
> +		break;
> +	case VIRTIO_SCSI_S_NEXUS_FAILURE:
> +		set_host_byte(sc, DID_NEXUS_FAILURE);
> +		break;
> +	default:
> +		scmd_printk(KERN_DEBUG, sc, "Unknown comand response %d",
> +			    resp->response);
> +		/* fall through */
> +	case VIRTIO_SCSI_S_FAILURE:
> +		set_host_byte(sc, DID_ERROR);
> +		break;
> +	}
> +
> +	WARN_ON(resp->sense_len > VIRTIO_SCSI_SENSE_SIZE);
> +	if (sc->sense_buffer) {
> +		memcpy(sc->sense_buffer, resp->sense,
> +		       min_t(u32, resp->sense_len, VIRTIO_SCSI_SENSE_SIZE));
> +		if (resp->sense_len)
> +			set_driver_byte(sc, DRIVER_SENSE);
> +	}
> +
> +	kmem_cache_free(virtscsi_cmd_cache, cmd);
> +	sc->scsi_done(sc);
> +}
> +
> +static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
> +{
> +	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
> +	struct virtio_scsi *vscsi = shost_priv(sh);
> +	void *buf;
> +	unsigned long flags;
> +	unsigned int len;
> +
> +	spin_lock_irqsave(&vscsi->cmd_vq_lock, flags);
> +
> +	do {
> +		virtqueue_disable_cb(vq);
> +		while ((buf = virtqueue_get_buf(vq, &len)) != NULL) {
> +			fn(buf);
> +		}
> +	} while (!virtqueue_enable_cb(vq));
> +
> +	spin_unlock_irqrestore(&vscsi->cmd_vq_lock, flags);
> +}
> +
> +static void virtscsi_cmd_done(struct virtqueue *vq)
> +{
> +	virtscsi_vq_done(vq, virtscsi_complete_cmd);
> +};
> +
> +/* These are still stubs.  */
> +static void virtscsi_complete_free(void *buf)
> +{
> +	struct virtio_scsi_cmd *cmd = buf;
> +
> +	kmem_cache_free(virtscsi_cmd_cache, cmd);
> +}
> +
> +static void virtscsi_ctrl_done(struct virtqueue *vq)
> +{
> +	virtscsi_vq_done(vq, virtscsi_complete_free);
> +};
> +
> +static void virtscsi_event_done(struct virtqueue *vq)
> +{
> +	virtscsi_vq_done(vq, virtscsi_complete_free);
> +};
> +
> +/**
> + * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
> + * @vscsi	: virtio_scsi state
> + * @sc		: command to be mapped
> + * @cmd		: command structure
> + * @out_num	: number of read-only elements
> + * @in_num	: number of write-only elements
> + *
> + * Called with cmd_vq_lock held.
> + */
> +static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
> +			     struct scsi_data_buffer *sdb)
> +{
> +	struct sg_table *table = &sdb->table;
> +	struct scatterlist *sg_elem;
> +	unsigned int idx = *p_idx;
> +	int i;
> +
> +	for_each_sg(table->sgl, sg_elem, table->nents, i)
> +		sg_set_buf(&sg[idx++], sg_virt(sg_elem), sg_elem->length);
> +
> +	*p_idx = idx;
> +}
> +
> +static void virtscsi_map_cmd(struct virtio_scsi *vscsi, struct virtqueue *vq,
> +			     struct virtio_scsi_cmd *cmd, unsigned *out_num,
> +			     unsigned *in_num)
> +{
> +	struct scsi_cmnd *sc = cmd->sc;
> +	struct scatterlist *sg = vscsi->sg;
> +	unsigned int idx = 0;
> +
> +	if (sc) {
> +		struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
> +		BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize);
> +
> +		/* TODO: check feature bit and fail if unsupported?  */
> +		BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL);
> +	}
> +
> +	/* Request header.  */
> +	sg_set_buf(&sg[idx++], &cmd->req.cmd, sizeof(cmd->req.cmd));
> +
> +	/* Data-out buffer.  */
> +	if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
> +		virtscsi_map_sgl(sg, &idx, scsi_out(sc));
> +
> +	*out_num = idx;
> +
> +	/* Response header.  */
> +	sg_set_buf(&sg[idx++], &cmd->resp, sizeof(cmd->resp));
> +
> +	/* Data-in buffer */
> +	if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
> +		virtscsi_map_sgl(sg, &idx, scsi_in(sc));
> +
> +	*in_num = idx - *out_num;
> +}
> +
> +static int virtscsi_kick_cmd(struct virtio_scsi *vscsi, struct virtqueue *vq,
> +			     struct virtio_scsi_cmd *cmd)
> +{
> +	unsigned int out_num, in_num;
> +	unsigned long flags;
> +	int ret;
> +
> +	spin_lock_irqsave(&vscsi->cmd_vq_lock, flags);
> +
> +	virtscsi_map_cmd(vscsi, vq, cmd, &out_num, &in_num);
> +
> +	ret = virtqueue_add_buf(vq, vscsi->sg, out_num, in_num, cmd);
> +	if (ret >= 0)
> +		virtqueue_kick(vq);
> +
> +	spin_unlock_irqrestore(&vscsi->cmd_vq_lock, flags);
> +	return ret;
> +}
> +
> +static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
> +{
> +	struct virtio_scsi *vscsi = shost_priv(sh);
> +	struct virtio_scsi_cmd *cmd;
> +	int ret;
> +
> +	dbg("%s %d:%d:%d:%d got cmd %p CDB: %#02x\n", __func__,
> +		sc->device->host->host_no, sc->device->id,
> +		sc->device->channel, sc->device->lun,
> +		sc, sc->cmnd[0]);
> +
> +	ret = SCSI_MLQUEUE_HOST_BUSY;
> +	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
> +	if (!cmd)
> +		goto out;
> +
> +	cmd->sc = sc;
> +	cmd->req.cmd = (struct virtio_scsi_cmd_req){
> +		.lun[0] = 1,
> +		.lun[1] = sc->device->id,
> +		.lun[2] = (sc->device->lun >> 8) | 0x40,
> +		.lun[3] = sc->device->lun & 0xff,
> +		.tag = (__u64)sc,
> +		.task_attr = VIRTIO_SCSI_S_SIMPLE,
> +		.prio = 0,
> +		.crn = 0,
> +	};
> +
> +	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
> +	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
> +
> +	if (virtscsi_kick_cmd(vscsi, vscsi->cmd_vq, cmd) >= 0)
> +		ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
> +static struct scsi_host_template virtscsi_host_template = {
> +	.module = THIS_MODULE,
> +	.name = "Virtio SCSI HBA",
> +	.proc_name = "virtio_scsi",
> +	.queuecommand = virtscsi_queuecommand,
> +	.this_id = -1,
> +
> +	.can_queue = 1024,
> +	.max_sectors = 0xFFFF,
> +	.dma_boundary = UINT_MAX,
> +	.use_clustering = ENABLE_CLUSTERING,
> +	.cmd_per_lun = 1,
> +};
> +
> +#define virtscsi_config_get(vdev, fld) \
> +	({ \
> +		typeof(((struct virtio_scsi_config *)0)->fld) __val; \
> +		vdev->config->get(vdev, \
> +				  offsetof(struct virtio_scsi_config, fld), \
> +				  &__val, sizeof(__val)); \
> +		__val; \
> +	})
> +
> +#define virtscsi_config_set(vdev, fld, val) \
> +	(void)({ \
> +		typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \
> +		vdev->config->set(vdev, \
> +				  offsetof(struct virtio_scsi_config, fld), \
> +				  &__val, sizeof(__val)); \
> +	})
> +
> +static int __devinit virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi)
> +{
> +	int err;
> +	struct virtqueue *vqs[3];
> +	vq_callback_t *callbacks[] = {
> +		virtscsi_ctrl_done,
> +		virtscsi_event_done,
> +		virtscsi_cmd_done
> +	};
> +	const char *names[] = {
> +		"control",
> +		"event",
> +		"command"
> +	};
> +
> +	/* Discover virtqueues and write information to configuration.  */
> +	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
> +	if (err)
> +		return err;
> +
> +	vscsi->ctrl_vq = vqs[0];
> +	vscsi->event_vq = vqs[1];
> +	vscsi->cmd_vq = vqs[2];
> +
> +	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
> +	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
> +	return 0;
> +}
> +
> +static int __devinit virtscsi_probe(struct virtio_device *vdev)
> +{
> +	struct Scsi_Host *shost;
> +	struct virtio_scsi *vscsi;
> +	int err;
> +	u32 sg_elems;
> +
> +	/* We need to know how many segments before we allocate.
> +         * We need an extra sg elements at head and tail.
> +	 */
> +	sg_elems = virtscsi_config_get(vdev, seg_max);
> +
> +	dbg(KERN_ALERT "virtio-scsi sg_elems %d", __func__, sg_elems);
> +
> +	/* Allocate memory and link the structs together.  */
> +        shost = scsi_host_alloc(&virtscsi_host_template,
> +		sizeof(*vscsi) + sizeof(vscsi->sg[0]) * (sg_elems + 2));
> +
> +	if (!shost)
> +		return -ENOMEM;
> +
> +	shost->sg_tablesize = sg_elems;
> +	vscsi = shost_priv(shost);
> +	vscsi->vdev = vdev;
> +	vdev->priv = shost;
> +
> +	/* Random initializations.  */
> +	spin_lock_init(&vscsi->cmd_vq_lock);
> +	sg_init_table(vscsi->sg, sg_elems + 2);
> +
> +	err = virtscsi_init(vdev, vscsi);
> +	if (err)
> +		goto virtio_init_failed;
> +
> +	shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1;
> +	shost->max_id = virtscsi_config_get(vdev, max_target) + 1;
> +	shost->max_channel = 0;
> +	shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
> +	err = scsi_add_host(shost, &vdev->dev);
> +	if (err)
> +		goto scsi_add_host_failed;
> +
> +	scsi_scan_host(shost);
> +
> +	return 0;
> +
> +scsi_add_host_failed:
> +	vdev->config->del_vqs(vdev);
> +virtio_init_failed:
> +	scsi_host_put(shost);
> +	return err;
> +}
> +
> +static void __devexit virtscsi_remove_vqs(struct virtio_device *vdev)
> +{
> +	/* Stop all the virtqueues. */
> +	vdev->config->reset(vdev);
> +
> +	vdev->config->del_vqs(vdev);
> +}
> +
> +static void __devexit virtscsi_remove(struct virtio_device *vdev)
> +{
> +	struct Scsi_Host *shost = virtio_scsi_host(vdev);
> +
> +	scsi_remove_host(shost);
> +
> +	virtscsi_remove_vqs(vdev);
> +	scsi_host_put(shost);
> +}
> +
> +static struct virtio_device_id id_table[] = {
> +	{ VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
> +	{ 0 },
> +};
> +
> +static struct virtio_driver virtio_scsi_driver = {
> +	.driver.name = KBUILD_MODNAME,
> +	.driver.owner = THIS_MODULE,
> +	.id_table = id_table,
> +	.probe = virtscsi_probe,
> +	.remove = __devexit_p(virtscsi_remove),
> +};
> +
> +static int __init init(void)
> +{
> +	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
> +	if (!virtscsi_cmd_cache) {
> +		printk(KERN_ERR "kmem_cache_create() for "
> +				"virtscsi_cmd_cache failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	return register_virtio_driver(&virtio_scsi_driver);
> +}
> +
> +static void __exit fini(void)
> +{
> +	unregister_virtio_driver(&virtio_scsi_driver);
> +	kmem_cache_destroy(virtscsi_cmd_cache);
> +}
> +module_init(init);
> +module_exit(fini);
> +
> +MODULE_DEVICE_TABLE(virtio, id_table);
> +MODULE_DESCRIPTION("Virtio SCSI HBA driver");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
> index 85bb0bb..fee54c4 100644
> --- a/include/linux/virtio_ids.h
> +++ b/include/linux/virtio_ids.h
> @@ -34,6 +34,7 @@
>  #define VIRTIO_ID_CONSOLE	3 /* virtio console */
>  #define VIRTIO_ID_RNG		4 /* virtio ring */
>  #define VIRTIO_ID_BALLOON	5 /* virtio balloon */
> +#define VIRTIO_ID_SCSI		7 /* virtio scsi */
>  #define VIRTIO_ID_9P		9 /* 9p virtio console */
>  
>  #endif /* _LINUX_VIRTIO_IDS_H */



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-02 23:07   ` Benjamin Herrenschmidt
@ 2011-12-03 17:38     ` Paolo Bonzini
  2011-12-03 22:22       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-03 17:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Michael S. Tsirkin, LKML, Rusty Russell, Stefan Hajnoczi

On 12/03/2011 12:07 AM, Benjamin Herrenschmidt wrote:
> This is typically the one thing you should -really- obtain from the
> other side. This is the number one reason why we cannot today reliably
> emulate a SCSI controller in qemu -and- pass-through the SCSI commands
> to the host /dev/sg or equivalent (only full device emulation is
> reliable).
>
> This is also typically what something like virtio-scsi allows us to fix,
> so let's fix it. IE. we have the ability to query the "limits" of the
> real HBA /  transport on the host side and to pass them along to the
> guest, which enables us to do real pass-through.

You can fix this in QEMU; READ and WRITE commands can be split in 
multiple parts.  The scsi-block device I added recently does this.

However, it seems like a simple change, so I'll do it.  I'll add 
max_sectors and cmd_per_lun to the config space.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-03 17:38     ` Paolo Bonzini
@ 2011-12-03 22:22       ` Benjamin Herrenschmidt
  2011-12-05 10:08         ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: Benjamin Herrenschmidt @ 2011-12-03 22:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Michael S. Tsirkin, LKML, Rusty Russell, Stefan Hajnoczi

On Sat, 2011-12-03 at 18:38 +0100, Paolo Bonzini wrote:
> On 12/03/2011 12:07 AM, Benjamin Herrenschmidt wrote:
> > This is typically the one thing you should -really- obtain from the
> > other side. This is the number one reason why we cannot today reliably
> > emulate a SCSI controller in qemu -and- pass-through the SCSI commands
> > to the host /dev/sg or equivalent (only full device emulation is
> > reliable).
> >
> > This is also typically what something like virtio-scsi allows us to fix,
> > so let's fix it. IE. we have the ability to query the "limits" of the
> > real HBA /  transport on the host side and to pass them along to the
> > guest, which enables us to do real pass-through.
> 
> You can fix this in QEMU; READ and WRITE commands can be split in 
> multiple parts.  The scsi-block device I added recently does this.

You can split -some- commands... it's clumsy and not always legit, you
don't always know all commands (what about vendor commands such as
firmware updates etc...) and it can be really tricky with tagged queues
and barriers.

> However, it seems like a simple change, so I'll do it.  I'll add 
> max_sectors and cmd_per_lun to the config space.

Sounds like a good start :-)

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-03 22:22       ` Benjamin Herrenschmidt
@ 2011-12-05 10:08         ` Paolo Bonzini
  0 siblings, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-05 10:08 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Michael S. Tsirkin, LKML, Rusty Russell, Stefan Hajnoczi

On 12/03/2011 11:22 PM, Benjamin Herrenschmidt wrote:
>> >  You can fix this in QEMU; READ and WRITE commands can be split in
>> >  multiple parts.  The scsi-block device I added recently does this.
> You can split -some- commands... it's clumsy and not always legit, you
> don't always know all commands (what about vendor commands such as
> firmware updates etc...)

For the record, firmware updates won't run under virt unless you make 
qemu run with CAP_SYS_RAWIO.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/2] virtio-scsi: first version
  2011-12-05 17:29 [PATCH v2 0/2] virtio-scsi driver Paolo Bonzini
@ 2011-12-05 17:29 ` Paolo Bonzini
  2011-12-06 18:09   ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-05 17:29 UTC (permalink / raw)
  To: linux-kernel, Michael S. Tsirkin, linux-scsi
  Cc: Rusty Russell, Stefan Hajnoczi

The virtio-scsi HBA is the basis of an alternative storage stack
for QEMU-based virtual machines (including KVM).  Compared to
virtio-blk it is more scalable (because it supports many LUNs
on a single PCI slot), more powerful (it more easily supports
passthrough of host devices to the guest) and more easily
extensible (new SCSI features implemented by QEMU should not
require updating the driver in the guest).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
	v1->v2: use dev_dbg, sdev_printk, scmd_printk
           - renamed lock to vq_lock
           - renamed cmd_vq to req_vq (and other similar changes)
           - fixed missing break in VIRTIO_SCSI_S_UNDERRUN
           - added VIRTIO_SCSI_S_BUSY
           - removed unused argument from virtscsi_map_cmd
           - fixed two tabs that had slipped in
           - moved max_sectors and cmd_per_lun from template to config space
           - __attribute__((packed)) -> __packed

 drivers/scsi/Kconfig        |    8 +
 drivers/scsi/Makefile       |    1 +
 drivers/scsi/virtio_scsi.c  |  465 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_scsi.h |  112 +++++++++++
 include/linux/virtio_ids.h  |    1 +
 5 files changed, 587 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/virtio_scsi.c
 create mode 100644 include/linux/virtio_scsi.h

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 06ea3bc..3ab6ad7 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1902,6 +1902,14 @@ config SCSI_BFA_FC
 	  To compile this driver as a module, choose M here. The module will
 	  be called bfa.
 
+config SCSI_VIRTIO
+	tristate "virtio-scsi support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && VIRTIO
+	help
+          This is the virtual HBA driver for virtio.  If the kernel will
+          be used in a virtual machine, say Y or M.
+
+
 endif # SCSI_LOWLEVEL
 
 source "drivers/scsi/pcmcia/Kconfig"
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 2b88749..351b28b 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
 obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
 obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
 obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
+obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
 obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
 
 obj-$(CONFIG_ARM)		+= arm/
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
new file mode 100644
index 0000000..cf8732f
--- /dev/null
+++ b/drivers/scsi/virtio_scsi.c
@@ -0,0 +1,465 @@
+/*
+ * Virtio SCSI HBA driver
+ *
+ * Copyright IBM Corp. 2010
+ * Copyright Red Hat, Inc. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
+ *  Paolo Bonzini   <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
+
+/* Command queue element */
+struct virtio_scsi_cmd {
+	struct scsi_cmnd *sc;
+	union {
+		struct virtio_scsi_cmd_req       cmd;
+		struct virtio_scsi_ctrl_tmf_req  tmf;
+		struct virtio_scsi_ctrl_an_req   an;
+	} req;
+	union {
+		struct virtio_scsi_cmd_resp      cmd;
+		struct virtio_scsi_ctrl_tmf_resp tmf;
+		struct virtio_scsi_ctrl_an_resp  an;
+		struct virtio_scsi_event         evt;
+	} resp;
+} ____cacheline_aligned_in_smp;
+
+/* Driver instance state */
+struct virtio_scsi {
+	/* Protects ctrl_vq, event_vq, req_vq and sg[] */
+	spinlock_t vq_lock;
+
+	struct virtio_device *vdev;
+	struct virtqueue *ctrl_vq;
+	struct virtqueue *event_vq;
+	struct virtqueue *req_vq;
+
+	/* For sglist construction when adding commands to the virtqueue.  */
+	struct scatterlist sg[];
+};
+
+static struct kmem_cache *virtscsi_cmd_cache;
+
+static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev)
+{
+	return vdev->priv;
+}
+
+static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
+{
+	if (!resid)
+		return;
+
+	if (!scsi_bidi_cmnd(sc)) {
+		scsi_set_resid(sc, resid);
+		return;
+	}
+
+	scsi_in(sc)->resid = min(resid, scsi_in(sc)->length);
+	scsi_out(sc)->resid = resid - scsi_in(sc)->resid;
+}
+
+/**
+ * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done
+ *
+ * Called with vq_lock held.
+ */
+static void virtscsi_complete_cmd(void *buf)
+{
+	struct virtio_scsi_cmd *cmd = buf;
+	struct scsi_cmnd *sc = cmd->sc;
+	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+
+	dev_dbg(&sc->device->sdev_gendev,
+		"cmd %p response %u status %#02x sense_len %u\n",
+		sc, resp->response, resp->status, resp->sense_len);
+
+	sc->result = resp->status;
+	virtscsi_compute_resid(sc, resp->resid);
+	switch (resp->response) {
+	case VIRTIO_SCSI_S_OK:
+		set_host_byte(sc, DID_OK);
+		break;
+	case VIRTIO_SCSI_S_UNDERRUN:
+		set_host_byte(sc, DID_ERROR);
+		break;
+	case VIRTIO_SCSI_S_ABORTED:
+		set_host_byte(sc, DID_ABORT);
+		break;
+	case VIRTIO_SCSI_S_BAD_TARGET:
+		set_host_byte(sc, DID_BAD_TARGET);
+		break;
+	case VIRTIO_SCSI_S_RESET:
+		set_host_byte(sc, DID_RESET);
+		break;
+	case VIRTIO_SCSI_S_BUSY:
+		set_host_byte(sc, DID_BUS_BUSY);
+		break;
+	case VIRTIO_SCSI_S_TRANSPORT_FAILURE:
+		set_host_byte(sc, DID_TRANSPORT_DISRUPTED);
+		break;
+	case VIRTIO_SCSI_S_TARGET_FAILURE:
+		set_host_byte(sc, DID_TARGET_FAILURE);
+		break;
+	case VIRTIO_SCSI_S_NEXUS_FAILURE:
+		set_host_byte(sc, DID_NEXUS_FAILURE);
+		break;
+	default:
+		scmd_printk(KERN_WARNING, sc, "Unknown response %d",
+			    resp->response);
+		/* fall through */
+	case VIRTIO_SCSI_S_FAILURE:
+		set_host_byte(sc, DID_ERROR);
+		break;
+	}
+
+	WARN_ON(resp->sense_len > VIRTIO_SCSI_SENSE_SIZE);
+	if (sc->sense_buffer) {
+		memcpy(sc->sense_buffer, resp->sense,
+		       min_t(u32, resp->sense_len, VIRTIO_SCSI_SENSE_SIZE));
+		if (resp->sense_len)
+			set_driver_byte(sc, DRIVER_SENSE);
+	}
+
+	kmem_cache_free(virtscsi_cmd_cache, cmd);
+	sc->scsi_done(sc);
+}
+
+static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+{
+	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	void *buf;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vscsi->vq_lock, flags);
+
+	do {
+		virtqueue_disable_cb(vq);
+		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
+			fn(buf);
+	} while (!virtqueue_enable_cb(vq));
+
+	spin_unlock_irqrestore(&vscsi->vq_lock, flags);
+}
+
+static void virtscsi_req_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_cmd);
+};
+
+/* These are still stubs.  */
+static void virtscsi_complete_free(void *buf)
+{
+	struct virtio_scsi_cmd *cmd = buf;
+
+	kmem_cache_free(virtscsi_cmd_cache, cmd);
+}
+
+static void virtscsi_ctrl_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_free);
+};
+
+static void virtscsi_event_done(struct virtqueue *vq)
+{
+	virtscsi_vq_done(vq, virtscsi_complete_free);
+};
+
+static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
+			     struct scsi_data_buffer *sdb)
+{
+	struct sg_table *table = &sdb->table;
+	struct scatterlist *sg_elem;
+	unsigned int idx = *p_idx;
+	int i;
+
+	for_each_sg(table->sgl, sg_elem, table->nents, i)
+		sg_set_buf(&sg[idx++], sg_virt(sg_elem), sg_elem->length);
+
+	*p_idx = idx;
+}
+
+/**
+ * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
+ * @vscsi	: virtio_scsi state
+ * @cmd		: command structure
+ * @out_num	: number of read-only elements
+ * @in_num	: number of write-only elements
+ *
+ * Called with vq_lock held.
+ */
+static void virtscsi_map_cmd(struct virtio_scsi *vscsi,
+			     struct virtio_scsi_cmd *cmd, unsigned *out_num,
+			     unsigned *in_num)
+{
+	struct scsi_cmnd *sc = cmd->sc;
+	struct scatterlist *sg = vscsi->sg;
+	unsigned int idx = 0;
+
+	if (sc) {
+		struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
+		BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize);
+
+		/* TODO: check feature bit and fail if unsupported?  */
+		BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL);
+	}
+
+	/* Request header.  */
+	sg_set_buf(&sg[idx++], &cmd->req.cmd, sizeof(cmd->req.cmd));
+
+	/* Data-out buffer.  */
+	if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
+		virtscsi_map_sgl(sg, &idx, scsi_out(sc));
+
+	*out_num = idx;
+
+	/* Response header.  */
+	sg_set_buf(&sg[idx++], &cmd->resp, sizeof(cmd->resp));
+
+	/* Data-in buffer */
+	if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
+		virtscsi_map_sgl(sg, &idx, scsi_in(sc));
+
+	*in_num = idx - *out_num;
+}
+
+static int virtscsi_kick_cmd(struct virtio_scsi *vscsi, struct virtqueue *vq,
+			     struct virtio_scsi_cmd *cmd)
+{
+	unsigned int out_num, in_num;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&vscsi->vq_lock, flags);
+
+	virtscsi_map_cmd(vscsi, cmd, &out_num, &in_num);
+
+	ret = virtqueue_add_buf(vq, vscsi->sg, out_num, in_num, cmd);
+	if (ret >= 0)
+		virtqueue_kick(vq);
+
+	spin_unlock_irqrestore(&vscsi->vq_lock, flags);
+	return ret;
+}
+
+static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_cmd *cmd;
+	int ret;
+
+	dev_dbg(&sc->device->sdev_gendev,
+		"cmd %p CDB: %#02x\n", sc, sc->cmnd[0]);
+
+	ret = SCSI_MLQUEUE_HOST_BUSY;
+	cmd = kmem_cache_zalloc(virtscsi_cmd_cache, GFP_ATOMIC);
+	if (!cmd)
+		goto out;
+
+	cmd->sc = sc;
+	cmd->req.cmd = (struct virtio_scsi_cmd_req){
+		.lun[0] = 1,
+		.lun[1] = sc->device->id,
+		.lun[2] = (sc->device->lun >> 8) | 0x40,
+		.lun[3] = sc->device->lun & 0xff,
+		.tag = (__u64)sc,
+		.task_attr = VIRTIO_SCSI_S_SIMPLE,
+		.prio = 0,
+		.crn = 0,
+	};
+
+	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
+	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
+
+	if (virtscsi_kick_cmd(vscsi, vscsi->req_vq, cmd) >= 0)
+		ret = 0;
+
+out:
+	return ret;
+}
+
+static struct scsi_host_template virtscsi_host_template = {
+	.module = THIS_MODULE,
+	.name = "Virtio SCSI HBA",
+	.proc_name = "virtio_scsi",
+	.queuecommand = virtscsi_queuecommand,
+	.this_id = -1,
+
+	.can_queue = 1024,
+	.dma_boundary = UINT_MAX,
+	.use_clustering = ENABLE_CLUSTERING,
+};
+
+#define virtscsi_config_get(vdev, fld) \
+	({ \
+		typeof(((struct virtio_scsi_config *)0)->fld) __val; \
+		vdev->config->get(vdev, \
+				  offsetof(struct virtio_scsi_config, fld), \
+				  &__val, sizeof(__val)); \
+		__val; \
+	})
+
+#define virtscsi_config_set(vdev, fld, val) \
+	(void)({ \
+		typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \
+		vdev->config->set(vdev, \
+				  offsetof(struct virtio_scsi_config, fld), \
+				  &__val, sizeof(__val)); \
+	})
+
+static int __devinit virtscsi_init(struct virtio_device *vdev,
+				   struct virtio_scsi *vscsi)
+{
+	int err;
+	struct virtqueue *vqs[3];
+	vq_callback_t *callbacks[] = {
+		virtscsi_ctrl_done,
+		virtscsi_event_done,
+		virtscsi_req_done
+	};
+	const char *names[] = {
+		"control",
+		"event",
+		"request"
+	};
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+	if (err)
+		return err;
+
+	vscsi->ctrl_vq = vqs[0];
+	vscsi->event_vq = vqs[1];
+	vscsi->req_vq = vqs[2];
+
+	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
+	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
+	return 0;
+}
+
+static int __devinit virtscsi_probe(struct virtio_device *vdev)
+{
+	struct Scsi_Host *shost;
+	struct virtio_scsi *vscsi;
+	int err;
+	u32 sg_elems;
+	u32 cmd_per_lun;
+
+	/* We need to know how many segments before we allocate.
+	 * We need two extra sg elements, one at the head and one at the tail.
+	 */
+	sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
+
+	/* Allocate memory and link the structs together.  */
+	shost = scsi_host_alloc(&virtscsi_host_template,
+		sizeof(*vscsi) + sizeof(vscsi->sg[0]) * (sg_elems + 2));
+
+	if (!shost)
+		return -ENOMEM;
+
+	shost->sg_tablesize = sg_elems;
+	vscsi = shost_priv(shost);
+	vscsi->vdev = vdev;
+	vdev->priv = shost;
+
+	/* Random initializations.  */
+	spin_lock_init(&vscsi->vq_lock);
+	sg_init_table(vscsi->sg, sg_elems + 2);
+
+	err = virtscsi_init(vdev, vscsi);
+	if (err)
+		goto virtscsi_init_failed;
+
+	cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
+	shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
+	shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
+	shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1;
+	shost->max_id = virtscsi_config_get(vdev, max_target) + 1;
+	shost->max_channel = 0;
+	shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
+	err = scsi_add_host(shost, &vdev->dev);
+	if (err)
+		goto scsi_add_host_failed;
+
+	scsi_scan_host(shost);
+
+	return 0;
+
+scsi_add_host_failed:
+	vdev->config->del_vqs(vdev);
+virtscsi_init_failed:
+	scsi_host_put(shost);
+	return err;
+}
+
+static void __devexit virtscsi_remove_vqs(struct virtio_device *vdev)
+{
+	/* Stop all the virtqueues. */
+	vdev->config->reset(vdev);
+
+	vdev->config->del_vqs(vdev);
+}
+
+static void __devexit virtscsi_remove(struct virtio_device *vdev)
+{
+	struct Scsi_Host *shost = virtio_scsi_host(vdev);
+
+	scsi_remove_host(shost);
+
+	virtscsi_remove_vqs(vdev);
+	scsi_host_put(shost);
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static struct virtio_driver virtio_scsi_driver = {
+	.driver.name = KBUILD_MODNAME,
+	.driver.owner = THIS_MODULE,
+	.id_table = id_table,
+	.probe = virtscsi_probe,
+	.remove = __devexit_p(virtscsi_remove),
+};
+
+static int __init init(void)
+{
+	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
+	if (!virtscsi_cmd_cache) {
+		printk(KERN_ERR "kmem_cache_create() for "
+				"virtscsi_cmd_cache failed\n");
+		return -ENOMEM;
+	}
+
+	return register_virtio_driver(&virtio_scsi_driver);
+}
+
+static void __exit fini(void)
+{
+	unregister_virtio_driver(&virtio_scsi_driver);
+	kmem_cache_destroy(virtscsi_cmd_cache);
+}
+module_init(init);
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio SCSI HBA driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
index 85bb0bb..fee54c4 100644
--- a/include/linux/virtio_ids.h
+++ b/include/linux/virtio_ids.h
@@ -34,6 +34,7 @@
 #define VIRTIO_ID_CONSOLE	3 /* virtio console */
 #define VIRTIO_ID_RNG		4 /* virtio rng */
 #define VIRTIO_ID_BALLOON	5 /* virtio balloon */
+#define VIRTIO_ID_SCSI		7 /* virtio scsi */
 #define VIRTIO_ID_9P		9 /* 9p virtio console */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/linux/virtio_scsi.h b/include/linux/virtio_scsi.h
new file mode 100644
index 0000000..8dac04f
--- /dev/null
+++ b/include/linux/virtio_scsi.h
@@ -0,0 +1,114 @@
+#ifndef _LINUX_VIRTIO_SCSI_H
+#define _LINUX_VIRTIO_SCSI_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
+
+#define VIRTIO_SCSI_CDB_SIZE   32
+#define VIRTIO_SCSI_SENSE_SIZE 96
+
+/* SCSI command request, followed by data-out */
+struct virtio_scsi_cmd_req {
+	u8 lun[8];		/* Logical Unit Number */
+	u64 tag;		/* Command identifier */
+	u8 task_attr;		/* Task attribute */
+	u8 prio;
+	u8 crn;
+	u8 cdb[VIRTIO_SCSI_CDB_SIZE];
+} __packed;
+
+/* Response, followed by sense data and data-in */
+struct virtio_scsi_cmd_resp {
+	u32 sense_len;		/* Sense data length */
+	u32 resid;		/* Residual bytes in data buffer */
+	u16 status_qualifier;	/* Status qualifier */
+	u8 status;		/* Command completion status */
+	u8 response;		/* Response values */
+	u8 sense[VIRTIO_SCSI_SENSE_SIZE];
+} __packed;
+
+/* Task Management Request */
+struct virtio_scsi_ctrl_tmf_req {
+	u32 type;
+	u32 subtype;
+	u8 lun[8];
+	u64 tag;
+} __packed;
+
+struct virtio_scsi_ctrl_tmf_resp {
+	u8 response;
+} __packed;
+
+/* Asynchronous notification query/subscription */
+struct virtio_scsi_ctrl_an_req {
+	u32 type;
+	u8 lun[8];
+	u32 event_requested;
+} __packed;
+
+struct virtio_scsi_ctrl_an_resp {
+	u32 event_actual;
+	u8 response;
+} __packed;
+
+struct virtio_scsi_event {
+	u32 event;
+	u8 lun[8];
+	u32 reason;
+} __packed;
+
+struct virtio_scsi_config {
+	u32 num_queues;
+	u32 seg_max;
+	u32 max_sectors;
+	u32 cmd_per_lun;
+	u32 event_info_size;
+	u32 sense_size;
+	u32 cdb_size;
+	u16 max_channel;
+	u16 max_target;
+	u32 max_lun;
+} __packed;
+
+/* Response codes */
+#define VIRTIO_SCSI_S_OK                       0
+#define VIRTIO_SCSI_S_UNDERRUN                 1
+#define VIRTIO_SCSI_S_ABORTED                  2
+#define VIRTIO_SCSI_S_BAD_TARGET               3
+#define VIRTIO_SCSI_S_RESET                    4
+#define VIRTIO_SCSI_S_BUSY                     5
+#define VIRTIO_SCSI_S_TRANSPORT_FAILURE        6
+#define VIRTIO_SCSI_S_TARGET_FAILURE           7
+#define VIRTIO_SCSI_S_NEXUS_FAILURE            8
+#define VIRTIO_SCSI_S_FAILURE                  9
+#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED       10
+#define VIRTIO_SCSI_S_FUNCTION_REJECTED        11
+#define VIRTIO_SCSI_S_INCORRECT_LUN            12
+
+/* Controlq type codes.  */
+#define VIRTIO_SCSI_T_TMF                      0
+#define VIRTIO_SCSI_T_AN_QUERY                 1
+#define VIRTIO_SCSI_T_AN_SUBSCRIBE             2
+
+/* Valid TMF subtypes.  */
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK           0
+#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET       1
+#define VIRTIO_SCSI_T_TMF_CLEAR_ACA            2
+#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET       3
+#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET      4
+#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET   5
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK           6
+#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET       7
+
+/* Events.  */
+#define VIRTIO_SCSI_T_EVENTS_MISSED            0x80000000
+#define VIRTIO_SCSI_T_NO_EVENT                 0
+#define VIRTIO_SCSI_T_TRANSPORT_RESET          1
+#define VIRTIO_SCSI_T_ASYNC_NOTIFY             2
+
+#define VIRTIO_SCSI_S_SIMPLE                   0
+#define VIRTIO_SCSI_S_ORDERED                  1
+#define VIRTIO_SCSI_S_HEAD                     2
+#define VIRTIO_SCSI_S_ACA                      3
+
+
+#endif /* _LINUX_VIRTIO_SCSI_H */

-- 
1.7.7.1



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-05 17:29 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
@ 2011-12-06 18:09   ` James Bottomley
  2011-12-07  9:41     ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2011-12-06 18:09 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, Michael S. Tsirkin, linux-scsi, Rusty Russell,
	Stefan Hajnoczi

On Mon, 2011-12-05 at 18:29 +0100, Paolo Bonzini wrote:
> The virtio-scsi HBA is the basis of an alternative storage stack
> for QEMU-based virtual machines (including KVM).

Could you clarify what the problem with virtio-blk is?

>   Compared to
> virtio-blk it is more scalable, because it supports many LUNs
> on a single PCI slot),

This is just multiplexing, surely, which should be easily fixable in
virtio-blk?

>  more powerful (it more easily supports
> passthrough of host devices to the guest)

I assume this means exclusive passthrough?  In which case, why doesn't
passing the host block queue through to the guest just work?  That means
the host is doing all the SCSI back end stuff and you've just got a
lightweight queue pass through.

>  and more easily
> extensible (new SCSI features implemented by QEMU should not
> require updating the driver in the guest).

I don't really understand this comment at all:  The block protocol is
far simpler than SCSI, but includes SG_IO, which can encapsulate all of
the SCSI features ... I'm not familiar necessarily with the problems of
QEMU devices, but surely it can unwrap the SG_IO transport generically
rather than having to emulate on a per feature basis?

James



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-06 18:09   ` James Bottomley
@ 2011-12-07  9:41     ` Paolo Bonzini
  2011-12-07 14:35       ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-07  9:41 UTC (permalink / raw)
  To: James Bottomley
  Cc: linux-kernel, Michael S. Tsirkin, linux-scsi, Rusty Russell,
	Stefan Hajnoczi

On 12/06/2011 07:09 PM, James Bottomley wrote:
> On Mon, 2011-12-05 at 18:29 +0100, Paolo Bonzini wrote:
>> The virtio-scsi HBA is the basis of an alternative storage stack
>> for QEMU-based virtual machines (including KVM).
>
> Could you clarify what the problem with virtio-blk is?

In a nutshell, if virtio-blk had no problems, then you could also throw 
away iSCSI and extend NBD instead. :)

The main problem is that *every* new feature requires updating three or 
more places: the spec, the host (QEMU), and the guest drivers (at least 
two: Linux and Windows).  Exposing the new feature also requires 
updating all the hosts, but also all the guests.

With virtio-scsi, the host device provides nothing but a SCSI transport. 
  You still have to update everything (spec+host+guest) when something 
is added to the SCSI transport, but that's a pretty rare event.  In the 
most common case, there is a feature that the guest already knows about, 
but that QEMU does not implement (for example a particular mode page 
bit).  Once the host is updated to expose the feature, the guest picks 
it up automatically.

Say I want to let guests toggle the write cache.  With virtio-blk, this 
is not part of the spec so first I would have to add a new feature bit 
and a field in the configuration space of the device.  I would need to 
update the host (of course), but I would also have to teach guest drivers about 
the new feature and field.  I cannot just send a MODE SELECT command via 
SG_IO, because the block device might be backed by a file.

With virtio-scsi, the guest will just go to the mode pages and flip the 
WCE bit.  I don't need to update the virtio-scsi spec, because the spec 
only defines the transport.  I don't need to update the guest driver, 
because it likewise only defines the transport and sd.c already knows 
how to do MODE SENSE/MODE SELECT.  I do need to teach the QEMU target of 
course, but that will always be smaller than the sum of 
host+Linux+Windows changes required for virtio-blk (if only because the 
Windows driver already contains a sort of SCSI target).

Regarding passthrough, non-block devices and task management functions 
cannot be passed via virtio-blk.  Lack of TMFs make virtio-blk's error 
handling less than optimal in the guest.

>> Compared to virtio-blk it is more scalable, because it supports
>> many LUNs on a single PCI slot),
>
> This is just multiplexing, surely, which should be easily fixable in
> virtio-blk?

Yes, you can do that.   I did play with a "virtio-over-virtio" device, 
but it was actually more complex than virtio-scsi and would not fix the 
other problems.

>> more powerful (it more easily supports passthrough of host devices
>> to the guest)
>
> I assume this means exclusive passthrough?

It doesn't really matter if it is exclusive or not (it can be 
non-exclusive with NPIV or iSCSI in the host; otherwise it pretty much 
has to be exclusive, because persistent reservations do not work).  The 
important point is that it's at the LUN level rather than the host level.

> In which case, why doesn't passing the host block queue through to
> the guest just work? That means the host is doing all the SCSI back
> end stuff and you've just got a lightweight queue pass through.

If you want to do passthrough, virtio-scsi is exactly this, a 
lightweight queue.

There are other possible uses, where the target is on the host.  QEMU 
itself can act as the target, or you can use LIO with FILEIO or IBLOCK 
backends.

>> and more easily extensible (new SCSI features implemented by QEMU
>> should not require updating the driver in the guest).
>
> I don't really understand this comment at all:  The block protocol is
> far simpler than SCSI, but includes SG_IO, which can encapsulate all
> of the SCSI features ...

The problem is that SG_IO is bolted on.  It doesn't work if the guest's 
block device is backed by a file, and in general the guest shouldn't 
care about that.  The command might be passed down to a real disk, 
interpreted by an iSCSI target, or emulated by QEMU.  There's no reason 
why a guest should see any difference and indeed with virtio-scsi it 
does not (besides the obvious differences in INQUIRY data).

And even if it works, it is neither the main I/O mechanism nor the main 
configuration mechanism.  Regarding configuration, see the above example 
of toggling the write cache.

Regarding I/O, an example would be adding "discard" support.  With 
virtio-scsi, you just make sure that the emulated target supports WRITE 
SAME w/UNMAP.  With virtio-blk it's again spec+host+guest updates. 
Bypassing this with SG_IO would mean copying a lot of code from sd.c and 
not working with files (cutting out both sparse and non-raw files, which 
are the most common kind of virt thin-provisioning).

Not to mention that virtio-blk does I/O in units of 512 bytes.  It 
supports passing an arbitrary logical block size in the configuration 
space, but even then there's no guarantee that SG_IO will use the same 
size.  To use SG_IO, you have to fetch the logical block size with READ 
CAPACITY.

Also, using SG_IO for I/O will bypass the host cache and might leave the 
host in a pretty confused state, so you could not reliably do extended 
copy using SG_IO, for example.  Spec+host+driver once more.  (And 
modifying the spec would be a spectacular waste of time because the 
outcome would be simply a dumbed down version of SBC, and quite hard to 
get right the first time).

SG_IO is also very much tied to Linux guests, both in the host and in 
the guest.  For example, the spec includes an "errors" field whose 
contents it never actually defines.  Reading the virtio-blk code shows that it is 
really a (status, msg_status, host_status, driver_status) combo.  In the 
guest, not all OSes tell the driver if the I/O request came from a 
"regular" command or from SCSI pass-through.  In Windows, all disks are 
like Linux /dev/sdX, so Windows drivers cannot send SG_IO requests to 
the host.

All this makes SG_IO a workaround, but not a solution.  Which 
virtio-scsi is.

> I'm not familiar necessarily with the problems of QEMU devices, but
> surely it can unwrap the SG_IO transport generically rather than
> having to emulate on a per feature basis?

QEMU does interpret virtio-blk's SG_IO just by passing down the ioctl. 
With the virtio-scsi backend you can choose between doing so or 
emulating everything.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-07  9:41     ` Paolo Bonzini
@ 2011-12-07 14:35       ` James Bottomley
  2011-12-08 13:09         ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2011-12-07 14:35 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, Michael S. Tsirkin, linux-scsi, Rusty Russell,
	Stefan Hajnoczi

On Wed, 2011-12-07 at 10:41 +0100, Paolo Bonzini wrote:
> On 12/06/2011 07:09 PM, James Bottomley wrote:
> > On Mon, 2011-12-05 at 18:29 +0100, Paolo Bonzini wrote:
> >> The virtio-scsi HBA is the basis of an alternative storage stack
> >> for QEMU-based virtual machines (including KVM).
> >
> > Could you clarify what the problem with virtio-blk is?
> 
> In a nutshell, if virtio-blk had no problems, then you could also throw 
> away iSCSI and extend NBD instead. :)

Um, I wouldn't make that as an argument.  For a linux only transport,
nbd is far better than iSCSI mainly because it's a lot simpler and
easier and doesn't have a tied encapsulation ... it is chosen in a lot
of implementations for that reason.

> The main problem is that *every* new feature requires updating three or 
> more places: the spec, the host (QEMU), and the guest drivers (at least 
> two: Linux and Windows).  Exposing the new feature also requires 
> updating all the hosts, but also all the guests.

Define "new feature"; you mean the various request types for flush and
discard?

> With virtio-scsi, the host device provides nothing but a SCSI transport. 
>   You still have to update everything (spec+host+guest) when something 
> is added to the SCSI transport, but that's a pretty rare event.

Well, no it's not, the transports are the fastest evolving piece of the
SCSI spec.  What I think you mean to say here is that we take care to
preserve compatibility with older transports while we add the newer
features, so you're not forced to adopt them?

>   In the 
> most common case, there is a feature that the guest already knows about, 
> but that QEMU does not implement (for example a particular mode page 
> bit).  Once the host is updated to expose the feature, the guest picks 
> it up automatically.

That's in the encapsulation, surely; these are used to set up the queue,
so only the queue runner (i.e. the host) needs to know.  If the guest
really wants the information, it encapsulates the request in SG_IO which
has been a stable API for donkey's years.

> Say I want to let guests toggle the write cache.  With virtio-blk, this 
> is not part of the spec so first I would have to add a new feature bit 
> and a field in the configuration space of the device.  I would need to 
> the host (of course), but I would also have to teach guest drivers about 
> the new feature and field.  I cannot just send a MODE SELECT command via 
> SG_IO, because the block device might be backed by a file.

I don't get this.  If you have a file backed SCSI device, you have to
interpret the MODE_SELECT command on the transport.  How is that any
different from unwrapping the SG_IO picking out the MODE_SELECT and
interpreting it?

> With virtio-scsi, the guest will just go to the mode pages and flip the 
> WCE bit.  I don't need to update the virtio-scsi spec, because the spec 
> only defines the transport.  I don't need to update the guest driver, 
> because it likewise only defines the transport and sd.c already knows 
> how to do MODE SENSE/MODE SELECT.

sd might, but I don't see how the back end works for a file without your
doing interpretation.

>   I do need to teach the QEMU target of 
> course, but that will always be smaller than the sum of 
> host+Linux+Windows changes required for virtio-blk (if only because the 
> Windows driver already contains a sort of SCSI target).
> 
> Regarding passthrough, non-block devices and task management functions 
> cannot be passed via virtio-blk.  Lack of TMFs make virtio-blk's error 
> handling less than optimal in the guest.

This would be presumably because most of the errors (i.e. the transport
ones) are handled in the host.  All the guest has to do is pass on the
error codes the host gives it.

You worry me enormously talking about TMFs because they're transport
specific.  SAM defines the generalities but the transports define the
exact implementations.  If you expose them, your device will behave
slightly differently depending on what the physical attachment is.  This
is really a recipe for code explosion because to special case them
you'll need to know what the host transport actually is and that's a
huge amount of knowledge to pass through.  You'd really be better off
using standardised pass throughs, like FCoE (or even iSCSI) if you want
this.

> >> Compared to virtio-blk it is more scalable, because it supports
> >> many LUNs on a single PCI slot),
> >
> > This is just multiplexing, surely, which should be easily fixable in
> > virtio-blk?
> 
> Yes, you can do that.   I did play with a "virtio-over-virtio" device, 
> but it was actually more complex than virtio-scsi and would not fix the 
> other problems.
> 
> >> more powerful (it more easily supports passthrough of host devices
> >> to the guest)
> >
> > I assume this means exclusive passthrough?
> 
> It doesn't really matter if it is exclusive or not (it can be 
> non-exclusive with NPIV or iSCSI in the host; otherwise it pretty much 
> has to be exclusive, because persistent reservations do not work).  The 
> important point is that it's at the LUN level rather than the host level.

virtio-blk can pass through at the LUN level surely: every LUN (in fact
every separate SCSI device) has a separate queue.

> > In which case, why doesn't passing the host block queue through to
> > the guest just work? That means the host is doing all the SCSI back
> > end stuff and you've just got a lightweight queue pass through.
> 
> If you want to do passthrough, virtio-scsi is exactly this, a 
> lightweight queue.
> 
> There are other possible uses, where the target is on the host.  QEMU 
> itself can act as the target, or you can use LIO with FILEIO or IBLOCK 
> backends.

If you use an iSCSI back end, why not an iSCSI initiator.  They may be
messy but at least the interaction is defined and expected rather than
encapsulated like you'd be doing with virtio-scsi.

> >> and more easily extensible (new SCSI features implemented by QEMU
> >> should not require updating the driver in the guest).
> >
> > I don't really understand this comment at all:  The block protocol is
> > far simpler than SCSI, but includes SG_IO, which can encapsulate all
> > of the SCSI features ...
> 
> The problem is that SG_IO is bolted on.  It doesn't work if the guest's 
> block device is backed by a file, and in general the guest shouldn't 
> care about that. 

Well how does virtio-scsi cope when file backed?  It's the same problem:
you get a non READ/WRITE command and you have to interpret it according
to some set of rules.  That's the same problem whether the command comes
via virtio-scsi or via SG_IO on virtio-blk.  The real difference is that
with virtio-blk most of the initial start up (which is where all the
weird and wonderful discover commands come from) is in the host.

>  The command might be passed down to a real disk, 
> interpreted by an iSCSI target, or emulated by QEMU.  There's no reason 
> why a guest should see any difference and indeed with virtio-scsi it 
> does not (besides the obvious differences in INQUIRY data).
> 
> And even if it works, it is neither the main I/O mechanism nor the main 
> configuration mechanism.  Regarding configuration, see the above example 
> of toggling the write cache.
> 
> Regarding I/O, an example would be adding "discard" support.  With 
> virtio-scsi, you just make sure that the emulated target supports WRITE 
> SAME w/UNMAP.  With virtio-blk it's again spec+host+guest updates. 
> Bypassing this with SG_IO would mean copying a lot of code from sd.c and 
> not working with files (cutting out both sparse and non-raw files, which 
> are the most common kind of virt thin-provisioning).

so I agree, supporting REQ_DISCARD are host updates because they're an
expansion of the block protocol.  However, they're rare, and, as you
said, you have to update the emulated targets anyway.  Incidentally,
REQ_DISCARD was added in 2008.  In that time close to 50 new commands
have been added to SCSI, so the block protocol is pretty slow moving.

> Not to mention that virtio-blk does I/O in units of 512 bytes.  It 
> supports passing an arbitrary logical block size in the configuration 
> space, but even then there's no guarantee that SG_IO will use the same 
> size.  To use SG_IO, you have to fetch the logical block size with READ 
> CAPACITY.

So here what I think you're telling me is that virtio-blk doesn't have a
correct discovery protocol?  That's easily fixable, surely (and not via
SG_IO ... you need discovery of the host queue parameters, so an
extension to block extracting block parameters).

> Also, using SG_IO for I/O will bypass the host cache and might leave the 
> host in a pretty confused state, so you could not reliably do extended 
> copy using SG_IO, for example.  Spec+host+driver once more.  (And 
> modifying the spec would be a spectacular waste of time because the 
> outcome would be simply a dumbed down version of SBC, and quite hard to 
> get right the first time).
> 
> SG_IO is also very much tied to Linux guests, both in the host and in 
> the guest.  For example, the spec includes an "errors" field that is not 
> defined in the spec.  Reading the virtio-blk code shows that it is 
> really a (status, msg_status, host_status, driver_status) combo.  In the 
> guest, not all OSes tell the driver if the I/O request came from a 
> "regular" command or from SCSI pass-through.  In Windows, all disks are 
> like Linux /dev/sdX, so Windows drivers cannot send SG_IO requests to 
> the host.
> 
> All this makes SG_IO a workaround, but not a solution.  Which 
> virtio-scsi is.

I don't think I understand any of this.  Most of the operation of the
device goes via native block queue (REQ_*) commands ... that's how we
run all block queues anyway.  Applications use SG_IO for various well
co-ordinated (or not, admittedly, in the case of udev) actions ... if
that's so fragile, desktops would have fallen over long ago.

> > I'm not familiar necessarily with the problems of QEMU devices, but
> > surely it can unwrap the SG_IO transport generically rather than
> > having to emulate on a per feature basis?
> 
> QEMU does interpret virtio-blk's SG_IO just by passing down the ioctl. 
> With the virtio-scsi backend you can choose between doing so or 
> emulating everything.

So why is that choice not available to virtio-blk?  Surely it could
interpret after unwrapping the SG_IO encapsulation.

Reading back all of this, I think there's some basic misunderstanding
somewhere, so let me see if I can make the discussion more abstract.

The way we run a storage device today (be it scsi or something else) is
via a block queue.  The only interaction a user gets is via that queue.
Therefore, in Linux, slicing the interaction at the queue and
transporting all the queue commands to some back end produces exactly
what we have today ... now correctly implemented, virtio-blk should do
that (and if there are problems in the current implementation, I'd
rather see them fixed), so it should have full equivalency to what a
native linux userspace sees.

Because of the slicing at the top, most of the actual processing,
including error handling and interpretation goes on in the back end
(i.e. the host) and anything request based like dm-mp and md (but
obviously not lvm, which is bio based) ... what I seem to see implied
but not stated in the above is that you have some reason you want to
move this into the guest, which is what happens if you slice at a lower
level (like SCSI)?

One of the problems you might also pick up slicing within SCSI is that
if (by some miracle, admittedly) we finally disentangle ATA from SCSI,
you'll lose ATA and SATA support in virtio-scsi.  Today you also lose
support for non-SCSI block devices like mmc.

James



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-07 14:35       ` James Bottomley
@ 2011-12-08 13:09         ` Paolo Bonzini
  2011-12-09 20:06           ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-08 13:09 UTC (permalink / raw)
  To: James Bottomley
  Cc: linux-kernel, Michael S. Tsirkin, linux-scsi, Rusty Russell,
	Stefan Hajnoczi

On 12/07/2011 03:35 PM, James Bottomley wrote:
> On Wed, 2011-12-07 at 10:41 +0100, Paolo Bonzini wrote:
>> On 12/06/2011 07:09 PM, James Bottomley wrote:
>>> On Mon, 2011-12-05 at 18:29 +0100, Paolo Bonzini wrote:
>>>> The virtio-scsi HBA is the basis of an alternative storage stack
>>>> for QEMU-based virtual machines (including KVM).
>>>
>>> Could you clarify what the problem with virtio-blk is?
>>
>> In a nutshell, if virtio-blk had no problems, then you could also throw
>> away iSCSI and extend NBD instead. :)
>
> Um, I wouldn't make that as an argument.  For a linux only transport,
> nbd is far better than iSCSI mainly because it's a lot simpler and
> easier and doesn't have a tied encapsulation ... it is chosen in a lot
> of implementations for that reason.

Indeed virtio-blk is not going to disappear overnight.

>> The main problem is that *every* new feature requires updating three or
>> more places: the spec, the host (QEMU), and the guest drivers (at least
>> two: Linux and Windows).  Exposing the new feature also requires
>> updating all the hosts, but also all the guests.
>
> Define "new feature"; you mean the various request types for flush and
> discard?

So far the feature bits that had to be added were barrier (now 
deprecated), maximum request size, maximum segments/request, geometry 
information (chs, for BIOS boot), read-only, total size, SCSI requests, 
flush requests, WCE, topology (aka block limits).  WCE and topology 
actually are in the code but not in the virtio spec.  For each of these, 
both the host and the guest drivers had to be updated.

These still do not cover discard (and secure discard), bidirectional 
SG_IO, and perhaps something for removable media. (*)  Any future 
extension of course will also require updating the host and guest 
drivers (plus the spec).

     (*) I mention removable media because one of two usecases I know
         for SG_IO on virtio-blk is burning CDs.

At some point, it makes sense to rethink the protocol.  virtio-scsi is 
substantially saner in this respect; it requires 1/3 of the work to 
implement a new feature, and especially frees us from having to define 
another spec specially for virtualization.  This is why I listed 
extensibility as part of the goals for virtio-scsi.

>> With virtio-scsi, the host device provides nothing but a SCSI transport.
>>    You still have to update everything (spec+host+guest) when something
>> is added to the SCSI transport, but that's a pretty rare event.
>
> Well, no it's not, the transports are the fastest evolving piece of the
> SCSI spec.

No, I mean when something is added to the generic definition of SCSI 
transport (SAM, more or less), not the individual transports.  When the 
virtio-scsi transport has to change, you still have to update 
spec+host+guest, but that's relatively rare.

>> In the most common case, there is a feature that the guest already
>> knows about, but that QEMU does not implement (for example a
>> particular mode page bit). Once the host is updated to expose the
>> feature, the guest picks it up automatically.
>
> That's in the encapsulation, surely; these are used to set up the queue,
> so only the queue runner (i.e. the host) needs to know.

Not at all.  You can start the guest in writethrough-cache mode.  Then, 
guests that know how to do flush+FUA can enable writeback for 
performance.  There's nothing virtio-blk or virtio-scsi specific in 
this.  But in virtio-scsi you only need to update the host.  In 
virtio-blk you need to update the guest and spec too.

> I don't get this.  If you have a file backed SCSI device, you have to
> interpret the MODE_SELECT command on the transport.  How is that any
> different from unwrapping the SG_IO picking out the MODE_SELECT and
> interpreting it?

The difference is that virtio-scsi exposes a direct-access SCSI device, 
nothing less nothing more.  virtio-blk exposes a disk that has nothing 
to do with SCSI except that it happens to understand SG_IO; the primary 
means for communication are the virtio-blk config space and read/write 
requests.

So, for virtio-blk, SG_IO is good for persistent reservations, burning 
CDs, and basically nothing else.  Neither of these can really be done in 
the host by interpreting, so for virtio-blk it makes sense to simply 
pass through.

For virtio-scsi, the SCSI command set is how you communicate with the 
host, and you don't care about who ends up interpreting the commands: it 
can be local or remote, userspace or kernelspace, a server or a disk, 
you don't care.

So, QEMU is already (optionally) doing interpretation for virtio-scsi. 
It's not for virtio-blk, and it's not going to.

>> Regarding passthrough, non-block devices and task management functions
>> cannot be passed via virtio-blk.  Lack of TMFs make virtio-blk's error
>> handling less than optimal in the guest.
>
> This would be presumably because most of the errors (i.e. the transport
> ones) are handled in the host.  All the guest has to do is pass on the
> error codes the host gives it.
>
> You worry me enormously talking about TMFs because they're transport
> specific.

True, but virtio-blk for example cannot even retry a command at all.

>> It doesn't really matter if it is exclusive or not (it can be
>> non-exclusive with NPIV or iSCSI in the host; otherwise it pretty much
>> has to be exclusive, because persistent reservations do not work).  The
>> important point is that it's at the LUN level rather than the host level.
>
> virtio-blk can pass through at the LUN level surely: every LUN (in fact
> every separate SCSI device) has a separate queue.

virtio-blk isn't meant to do pass through.  virtio-blk had SG_IO bolted 
on it, but this doesn't mean that the guest /dev/vdX is equivalent to 
the host's /dev/sdY.  From kernelspace, features are lacking: no WCE 
toggle, no thin provisioning, no extended copy, etc.  From userspace, 
your block size might be screwed up or worse.  With virtio-scsi, by 
definition the guest /dev/sdX can be as capable as the host's /dev/sdY 
if you ask the host to do passthrough.

>> There are other possible uses, where the target is on the host.  QEMU
>> itself can act as the target, or you can use LIO with FILEIO or IBLOCK
>> backends.
>
> If you use an iSCSI back end, why not an iSCSI initiator.  They may be
> messy but at least the interaction is defined and expected rather than
> encapsulated like you'd be doing with virtio-scsi.

If you use an iSCSI initiator, you need to expose to the guest the 
details of your storage, including possibly the authentication.

I'm not sure however if you interpreted LIO as LIO's iSCSI backend.  In 
that case, note that a virtio-scsi backend for LIO is in the works too.

> so I agree, supporting REQ_DISCARD are host updates because they're an
> expansion of the block protocol.  However, they're rare, and, as you
> said, you have to update the emulated targets anyway.

New features are rare, but there are also features where virtio-blk is 
lagging behind, and those aren't necessarily rare.

Regarding updates to the targets, you have much more control on the host 
than the guest.  Updating the host is trivial compared to updating the 
guest.

> Incidentally, REQ_DISCARD was added in 2008.  In that time close to
> 50 new commands have been added to SCSI, so the block protocol is
> pretty slow moving.

That also means that virtio-blk cannot give guests access to the full 
range of features that they might want to use.  Not all OSes are Linux, not 
all OSes limit themselves to the features of the Linux block protocol.

>> Not to mention that virtio-blk does I/O in units of 512 bytes.  It
>> supports passing an arbitrary logical block size in the configuration
>> space, but even then there's no guarantee that SG_IO will use the same
>> size.  To use SG_IO, you have to fetch the logical block size with READ
>> CAPACITY.
>
> So here what I think you're telling me is that virtio-blk doesn't have a
> correct discovery protocol?

No, I'm saying that virtio-blk's SG_IO is not meant to be used for 
configuration, I/O or discovery.  If you want to use it for those tasks, 
and it breaks, you're on your own.  virtio-blk lets you show a 
4k-logical-block disk as having 512b logical blocks, for example because 
otherwise you could not boot from it; however, as soon as you use SG_IO 
the truth shows.  The answer is "don't do it", but can be a severe 
limitation.

>>> I'm not familiar necessarily with the problems of QEMU devices, but
>>> surely it can unwrap the SG_IO transport generically rather than
>>> having to emulate on a per feature basis?
>>
>> QEMU does interpret virtio-blk's SG_IO just by passing down the ioctl.
>> With the virtio-scsi backend you can choose between doing so or
>> emulating everything.
>
> So why is that choice not available to virtio-blk?  Surely it could
> interpret after unwrapping the SG_IO encapsulation.

Because if you do this, you get really no advantages.  Userspace uses 
virtio-blk's SG_IO for only a couple of usecases, which hardly apply to 
files.  On the other hand, if you use SPC/SBC as a unified protocol for 
configuration, discovery and I/O, it makes sense to emulate.

> Reading back all of this, I think there's some basic misunderstanding
> somewhere, so let me see if I can make the discussion more abstract.

Probably. :)

> The way we run a storage device today (be it scsi or something else) is
> via a block queue.  The only interaction a user gets is via that queue.
> Therefore, in Linux, slicing the interaction at the queue and
> transporting all the queue commands to some back end produces exactly
> what we have today ...

Let's draw it like this:

              guest       |                    host
                          |
  read() -> req() ---virtio-blk ---> read() -> req -> READ(16) -> device

> now correctly implemented, virtio-blk should do that (and if there
> are problems in the current implementation, I'd rather see them
> fixed), so it should have full equivalency to what a native linux
> userspace sees.

Right: there are missing features I mentioned above, and SG_IO is very 
limited with virtio-blk compared to native, but usually it is fine.  For 
other OSes it is less than ideal, but it can work.  It can be improved 
(not completely fixed), but again at some point, it makes sense to 
rethink the stack.

> Because of the slicing at the top, most of the actual processing,
> including error handling and interpretation goes on in the back end
> (i.e. the host) and anything request based like dm-mp and md (but
> obviously not lvm, which is bio based) ... what I seem to see implied
> but not stated in the above is that you have some reason you want to
> move this into the guest, which is what happens if you slice at a lower
> level (like SCSI)?

Yes, that's what happens if you do passthrough:

              guest                 |            host
                                    |
  read() -> req() -> READ(16) --virtio-scsi ---> ioctl() -> ...

Advantages here include the ability to work with non-block devices, and 
the ability to reuse all the discovery code that is or will be in sd.c. 
  If you do like this and you want multipathing (for example) you indeed 
have to move it into the VM, but it doesn't usually make much sense.

However, something else actually can happen in the host, and here lie 
the interesting cases.  For example, the host userspace can send the 
commands to the LUN via iSCSI, directly:

              guest                 | host with userspace iSCSI initiator
                                    |
  read() -> req() -> READ(16) --virtio-scsi ---> send() -> ...

This is still effectively passthrough, on the other hand it doesn't 
require you to handle low-level details in the VM.  And unlike an iSCSI 
initiator in the guest, you are free to change how the storage is 
implemented.

A third implementation is to emulate SCSI commands by unpacking them in 
host userspace:

              guest                 |            host
                                    |
  read() -> req() -> READ(16) --virtio-scsi ---> read() -> ...

Again, you reuse all the discovery code that is in sd.c, and future 
improvements can be confined to the emulation code only.  In addition, 
future improvements done to sd.c for non-virt will apply to virt as well 
(either right away or modulo emulation improvements).  In addition, 
you're 100% sure that when the guest uses SG_IO it will not exhibit any 
quirks.  And it is also more flexible when your guests are not Linux.

There's nothing new in it.  As far as I know, only Xen has a dedicated 
protocol for paravirtualized block devices (in addition to virtio). 
Hyper-V and VMware both use paravirtualized SCSI.

> One of the problems you might also pick up slicing within SCSI is that
> if (by some miracle, admittedly) we finally disentangle ATA from SCSI,
> you'll lose ATA and SATA support in virtio-scsi.  Today you also lose
> support for non-SCSI block devices like mmc.

You do not lose that.  Just like virtio-blk cannot do SG_IO to mmc, 
virtio-scsi is only usable with mmc in emulated mode.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-08 13:09         ` Paolo Bonzini
@ 2011-12-09 20:06           ` James Bottomley
  2011-12-10 16:37             ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2011-12-09 20:06 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, Michael S. Tsirkin, linux-scsi, Rusty Russell,
	Stefan Hajnoczi

On Thu, 2011-12-08 at 14:09 +0100, Paolo Bonzini wrote:
> > Well, no it's not, the transports are the fastest evolving piece of the
> > SCSI spec.
> 
> No, I mean when something is added to the generic definition of SCSI 
> transport (SAM, more or less), not the individual transports.  When the 
> virtio-scsi transport has to change, you still have to update 
> spec+host+guest, but that's relatively rare.

This doesn't make sense:  You talk about wanting TMF access which *is*
transport defined.  Even if we just concentrate on the primary device
commands, they add at a fairly fast rate ... something like 10x what the
block protocol adds at.

> >> In the most common case, there is a feature that the guest already
> >> knows about, but that QEMU does not implement (for example a
> >> particular mode page bit). Once the host is updated to expose the
> >> feature, the guest picks it up automatically.
> >
> > That's in the encapsulation, surely; these are used to set up the queue,
> > so only the queue runner (i.e. the host) needs to know.
> 
> Not at all.  You can start the guest in writethrough-cache mode.  Then, 
> guests that know how to do flush+FUA can enable writeback for 
> performance.  There's nothing virtio-blk or virtio-scsi specific in 
> this.  But in virtio-scsi you only need to update the host.  In 
> virtio-blk you need to update the guest and spec too.
> 
> > I don't get this.  If you have a file backed SCSI device, you have to
> > interpret the MODE_SELECT command on the transport.  How is that any
> > different from unwrapping the SG_IO picking out the MODE_SELECT and
> > interpreting it?
> 
> The difference is that virtio-scsi exposes a direct-access SCSI device, 
> nothing less nothing more.  virtio-blk exposes a disk that has nothing 
> to do with SCSI except that it happens to understand SG_IO; the primary 
> means for communication are the virtio-blk config space and read/write 
> requests.

So this sounds like the virtio block backend has the wrong
implementation ... just fix it?  What should happen is anything you send
to a block queue in the guest you should be able to send to a block
queue in the host.  If you employ that principle, then there are no
communication problems and the set of supported features is exactly what
a block queue supports which, in turn, is exactly what a user expects.

> So, for virtio-blk, SG_IO is good for persistent reservations, burning 
> CDs, and basically nothing else.  Neither of these can really be done in 
> the host by interpreting, so for virtio-blk it makes sense to simply 
> pass through.

It is a pass through for user space ... I don't get what your point is.
All of the internal commands for setup are handled in the host.  All the
guest is doing is attaching to a formed block queue.  I think, as I've
said several times before, all of this indicates virtio-blk doesn't do
discovery of the host block queue properly, but that's fixable.

> For virtio-scsi, the SCSI command set is how you communicate with the 
> host, and you don't care about who ends up interpreting the commands: it 
> can be local or remote, userspace or kernelspace, a server or a disk, 
> you don't care.
> 
> So, QEMU is already (optionally) doing interpretation for virtio-scsi. 
> It's not for virtio-blk, and it's not going to.

So are you saying there's some unfixable bug in the virtio-blk back end
that renders it difficult to use?

> >> Regarding passthrough, non-block devices and task management functions
> >> cannot be passed via virtio-blk.  Lack of TMFs make virtio-blk's error
> >> handling less than optimal in the guest.
> >
> > This would be presumably because most of the errors (i.e. the transport
> > ones) are handled in the host.  All the guest has to do is pass on the
> > error codes the host gives it.
> >
> > You worry me enormously talking about TMFs because they're transport
> > specific.
> 
> True, but virtio-blk for example cannot even retry a command at all.

Why would it need to.  You seem to understand that architecturally the
queue is sliced, but what you don't seem to appreciate is that error
handling is done below this ... i.e. in the host in your model, so
virtio-blk properly implemented *shouldn't* be doing retries.  You seem
to be stating error handling in a way that necessarily violates the
layering of block and then declaring this to be a problem.  It isn't; in
virtio-block, errors are handled in the host and passed up to the guest
when resolved.  I think you think there's some problem with this, but
you haven't actually said what it is yet.

> >> It doesn't really matter if it is exclusive or not (it can be
> >> non-exclusive with NPIV or iSCSI in the host; otherwise it pretty much
> >> has to be exclusive, because persistent reservations do not work).  The
> >> important point is that it's at the LUN level rather than the host level.
> >
> > virtio-blk can pass through at the LUN level surely: every LUN (in fact
> > every separate SCSI device) has a separate queue.
> 
> virtio-blk isn't meant to do pass through.  virtio-blk had SG_IO bolted 
> on it, but this doesn't mean that the guest /dev/vdX is equivalent to 
> the host's /dev/sdY.  From kernelspace, features are lacking: no WCE 
> toggle, no thin provisioning, no extended copy, etc.  From userspace, 
> your block size might be screwed up or worse.  With virtio-scsi, by 
> definition the guest /dev/sdX can be as capable as the host's /dev/sdY 
> if you ask the host to do passthrough.

Why do you worry about WCE?  That's a SCSI feature and it's handled in
the host.  What you need is the discovery of the flush parameters and a
corresponding setting of those in the guest queue, so that flushes are
preserved in stream.  The point here is that virtio-blk operates at the
block level, so you should too ... as in you operate in the abstracted
block environment which sets flush type ... you don't ask to pierce the
abstraction to try to see SCSI parameters.  The flush type should be
passed between the queues as part of proper discovery.

> >> There are other possible uses, where the target is on the host.  QEMU
> >> itself can act as the target, or you can use LIO with FILEIO or IBLOCK
> >> backends.
> >
> > If you use an iSCSI back end, why not an iSCSI initiator.  They may be
> > messy but at least the interaction is defined and expected rather than
> > encapsulated like you'd be doing with virtio-scsi.
> 
> If you use an iSCSI initiator, you need to expose to the guest the 
> details of your storage, including possibly the authentication.
> 
> I'm not sure however if you interpreted LIO as LIO's iSCSI backend.  In 
> that case, note that a virtio-scsi backend for LIO is in the works too.
> 
> > so I agree, supporting REQ_DISCARD are host updates because they're an
> > expansion of the block protocol.  However, they're rare, and, as you
> > said, you have to update the emulated targets anyway.
> 
> New features are rare, but there are also features where virtio-blk is 
> lagging behind, and those aren't necessarily rare.
> 
> Regarding updates to the targets, you have much more control on the host 
> than the guest.  Updating the host is trivial compared to updating the 
> guest.

So is this a turf war?  virtio-blk isn't evolving fast enough (and since
you say lagging behind and DISCARD was a 2008 feature, that seems
reasonable) so you want to invent an additional backend that can move
faster?

> > Incidentally, REQ_DISCARD was added in 2008.  In that time close to
> > 50 new commands have been added to SCSI, so the block protocol is
> > pretty slow moving.
> 
> That also means that virtio-blk cannot give guests access to the full 
> range of features that might want to use.  Not all OSes are Linux, not 
> all OSes limit themselves to the features of the Linux block protocol.

So you're trying to solve the non-linux guest problem?  My observation
from Windows has been that windows device queues mostly behave
reasonably similarly to Linux ... that's not exactly, but similarly
enough that we can translate the requests.  On this basis, I don't
really see why a windows block driver can't attach almost exactly to a
Linux Guest queue.  I could see where this would fail pre 2008 because
of MS_SYNC, but we can even do that now.

> >> Not to mention that virtio-blk does I/O in units of 512 bytes.  It
> >> supports passing an arbitrary logical block size in the configuration
> >> space, but even then there's no guarantee that SG_IO will use the same
> >> size.  To use SG_IO, you have to fetch the logical block size with READ
> >> CAPACITY.
> >
> > So here what I think you're telling me is that virtio-blk doesn't have a
> > correct discovery protocol?
> 
> No, I'm saying that virtio-blk's SG_IO is not meant to be used for 
> configuration, I/O or discovery. 

Exactly correct.  virtio-blk should have its own queue discovery
protocol.  This, I think, seems to be the missing piece.

>  If you want to use it for those tasks, 
> and it breaks, you're on your own.  virtio-blk lets you show a 
> 4k-logical-block disk as having 512b logical blocks, for example because 
> otherwise you could not boot from it; however, as soon as you use SG_IO 
> the truth shows.  The answer is "don't do it", but can be a severe 
> limitation.

Again, you've lost me ... part of the things a correct block discovery
protocol would get would be physical and logical sector size ... this
will show you exactly if you're using 4k sectors and whether you're
emulated.

> >>> I'm not familiar necessarily with the problems of QEMU devices, but
> >>> surely it can unwrap the SG_IO transport generically rather than
> >>> having to emulate on a per feature basis?
> >>
> >> QEMU does interpret virtio-blk's SG_IO just by passing down the ioctl.
> >> With the virtio-scsi backend you can choose between doing so or
> >> emulating everything.
> >
> > So why is that choice not available to virtio-blk?  surely it could
> > interpret after unwrapping the SG_IO encapsulation.
> 
> Because if you do this, you get really no advantages.  Userspace uses 
> virtio-blk's SG_IO for only a couple of usecases, which hardly apply to 
> files.  On the other hand, if you use SPC/SBC as a unified protocol for 
> configuration, discovery and I/O, it makes sense to emulate.
> 
> > Reading back all of this, I think there's some basic misunderstanding
> > somewhere, so let me see if I can make the discussion more abstract.
> 
> Probably. :)
> 
> > The way we run a storage device today (be it scsi or something else) is
> > via a block queue.  The only interaction a user gets is via that queue.
> > Therefore, in Linux, slicing the interaction at the queue and
> > transporting all the queue commands to some back end produces exactly
> > what we have today ...
> 
> Let's draw it like this:
> 
>               guest       |                    host
>                           |
>   read() -> req() ---virtio-blk ---> read() -> req -> READ(16) -> device
> 
> > now correctly implemented, virtio-blk should do that (and if there
> > are problems in the current implementation, I'd rather see them
> > fixed), so it should have full equivalency to what a native linux
> > userspace sees.
> 
> Right: there are missing features I mentioned above, and SG_IO is very 
> limited with virtio-blk compared to native, but usually it is fine.  For 
> other OSes it is less than ideal, but it can work.  It can be improved 
> (not completely fixed), but again at some point, it makes sense to 
> rethink the stack.
> 
> > Because of the slicing at the top, most of the actual processing,
> > including error handling and interpretation goes on in the back end
> > (i.e. the host) and anything request based like dm-mp and md (but
> > obviously not lvm, which is bio based) ... what I seem to see implied
> > but not stated in the above is that you have some reason you want to
> > move this into the guest, which is what happens if you slice at a lower
> > level (like SCSI)?
> 
> Yes, that's what happens if you do passthrough:
> 
>               guest                 |            host
>                                     |
>   read() -> req() -> READ(16) --virtio-scsi ---> ioctl() -> ...
> 
> Advantages here include the ability to work with non-block devices, and 
> the ability to reuse all the discovery code that is or will be in sd.c. 
>   If you do like this and you want multipathing (for example) you indeed 
> have to move it into the VM, but it doesn't usually make much sense.
> 
> However, something else actually can happen in the host, and here lie 
> the interesting cases.  For example, the host userspace can send the 
> commands to the LUN via iSCSI, directly:
> 
>               guest                 | host with userspace iSCSI initiator
>                                     |
>   read() -> req() -> READ(16) --virtio-scsi ---> send() -> ...
> 
> This is still effectively passthrough, on the other hand it doesn't 
> require you to handle low-level details in the VM.  And unlike an iSCSI 
> initiator in the guest, you are free to change how the storage is 
> implemented.
> 
> A third implementation is to emulate SCSI commands by unpacking them in 
> host userspace:
> 
>               guest                 |            host
>                                     |
>   read() -> req() -> READ(16) --virtio-scsi ---> read() -> ...
> 
> Again, you reuse all the discovery code that is in sd.c, and future 
> improvements can be confined to the emulation code only.  In addition, 
> future improvements done to sd.c for non-virt will apply to virt as well 
> (either right away or modulo emulation improvements).  In addition, 
> you're 100% sure that when the guest uses SG_IO it will not exhibit any 
> quirks.  And it is also more flexible when your guests are not Linux.
> 
> There's nothing new in it.  As far as I know, only Xen has a dedicated 
> protocol for paravirtualized block devices (in addition to virtio). 
> Hyper-V and VMware both use paravirtualized SCSI.
> 
> > One of the problems you might also pick up slicing within SCSI is that
> > if (by some miracle, admittedly) we finally disentangle ATA from SCSI,
> > you'll lose ATA and SATA support in virtio-scsi.  Today you also lose
> > support for non-SCSI block devices like mmc
> 
> You do not lose that.  Just like virtio-blk cannot do SG_IO to mmc, 
> virtio-scsi is only usable with mmc in emulated mode.

OK, so I think the problem boils down to two components:

     1. virtio-blk isn't developing fast enough.  This looks to be a
        fairly easily fixable problem
     2. Discovery in virtio-blk isn't done properly.  Again, this looks
        to be easily fixable.

Once you fix the above, most of what you're asking for, which is mainly
SCSI encapsulation for discovery and error handling in the guest for no
reason I can discern, becomes irrelevant.

James



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/2] virtio-scsi: first version
  2011-12-09 20:06           ` James Bottomley
@ 2011-12-10 16:37             ` Paolo Bonzini
  0 siblings, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-12-10 16:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-scsi

On 12/09/2011 09:06 PM, James Bottomley wrote:
> On Thu, 2011-12-08 at 14:09 +0100, Paolo Bonzini wrote:
>>> Well, no it's not, the transports are the fastest evolving piece of the
>>> SCSI spec.
>>
>> No, I mean when something is added to the generic definition of SCSI
>> transport (SAM, more or less), not the individual transports.  When the
>> virtio-scsi transport has to change, you still have to update
>> spec+host+guest, but that's relatively rare.
>
> This doesn't make sense:  You talk about wanting TMF access which *is*
> transport defined.

TMF access is transport defined.  The definition of TMFs is part of SAM 
and not fast moving.  The virtio-scsi spec tells you how to access TMFs 
on virtio-scsi; it doesn't tell you what the TMFs do, because it just 
refers you to SAM.

Device commands can be treated opaquely when doing passthrough, so their 
rate of change does not matter.  And you can always leave them out in 
the emulated target, too.  If some new command turns out to be 
interesting enough to implement it in the emulated target, you do it and 
guests that can use the feature will start using it.

>> So, for virtio-blk, SG_IO is good for persistent reservations, burning
>> CDs, and basically nothing else.  Neither of these can really be done in
>> the host by interpreting, so for virtio-blk it makes sense to simply
>> pass through.
>
> It is a pass through for user space ... I don't get what your point is.
> All of the internal commands for setup are handled in the host.

In the host or in the guest kernel?  I'm not sure I understand your 
point either. :)

> All the guest is doing is attaching to a formed block queue. I think,
> as I've said several times before, all of this indicates virtio-blk
> doesn't do discovery of the host block queue properly, but that's
> fixable.

Well, the only fix is to disable SG_IO.  For example, suppose the host 
disk is 4k-lbs and you present it to the guest as 512b-logical, 
4096-byte physical.  That's a sensible thing to do if you want the guest 
to boot from that disk.

Now, SG_IO will see 4k-lbs, and you cannot change it.  To avoid showing 
mismatched geometry to the guest, _the only fix is to disable SG_IO_. 
If you do so, you prevent the guest from doing possibly useful things 
with it (e.g. PR).  If you don't, you have to cross your fingers and 
hope the guest won't do possibly harmful things with it.

Of course, virtio-scsi is not a silver bullet.  If you want to modify 
the block limits you won't be able to pass the LUN through anymore, and 
you will have to use an emulated target; that's obvious.  However, in 
_no_ case will there be a mismatch between the queue parameters seen by 
the kernel and what you get in SG_IO.

>>> You worry me enormously talking about TMFs because they're transport
>>> specific.
>>
>> True, but virtio-blk for example cannot even retry a command at all.
>
> Why would it need to.  You seem to understand that architecturally the
> queue is sliced, but what you don't seem to appreciate is that error
> handling is done below this ... i.e. in the host in your model, so
> virtio-blk properly implemented *shouldn't* be doing retries.

There may be no error handling in the host at all, for example if the 
host is acting as a simple userspace iSCSI initiator that just sends 
commands over TCP.  It's also possible that non-Linux OSes cannot be 
told "no error handling".  Windows expects the driver to be able to 
reset LUNs/buses/hosts, for example.

> You seem to be stating error handling in a way that necessarily
> violates the layering of block and then declaring this to be a
> problem.  It isn't; in virtio-block, errors are handled in the host
> and passed up to the guest when resolved.

I agree, but in practice it doesn't always work like that, depending on 
your storage backends.  Again, the choice with virtio-blk is either 
"keep it broken" or "don't do it".

> Why do you worry about WCE?  That's a SCSI feature and it's handled in
> the host.

No, the guest must also be able to toggle it.  But that's irrelevant. 
The point is: you need discovery of geometry parameters, of topology 
parameters, of cache parameters.  You need reads, writes, flushes, 
discards.  Why reinvent the wheel every time, and not encapsulate those 
within SPC/SBC commands?  Consider that:

1) you also need to support generic SCSI commands for userspace, and 
virtio-blk's solution for that sucks;

2) you would anyway need the SCSI encapsulation code for the sake of 
Windows drivers (only it would run in the Windows guests rather than in 
the host).

At some point you start wondering whether you're heading straight to a 
local optimum, and why every other virtualization platform is doing 
something else.  virtio-blk's main feature is its simplicity; it's quite 
possible that we're past the break-even point for virtio-blk's simplicity.

> The point here is that virtio-blk operates at the
> block level, so you should too [...] you don't ask to pierce the
> abstraction to try to see SCSI parameters.

Exactly!  That's why I say SG_IO on virtio-blk is a very bad idea, and 
if clients need SCSI (and they do) they should be presented a real SCSI 
device, which virtio-scsi provides.

>> Regarding updates to the targets, you have much more control on the host
>> than the guest.  Updating the host is trivial compared to updating the
>> guest.
>
> So is this a turf war?  virtio-blk isn't evolving fast enough (and since
> you say lagging behind and DISCARD was a 2008 feature, that seems
> reasonable) so you want to invent an additional backend that can move
> faster?

No turf war at all, simply different choices favoring flexibility and 
extensibility over simplicity.  (And even that is not entirely true: the 
actual virtio drivers are simpler for virtio-scsi, though of course the 
whole stack is more complex).

virtio-blk lags behind by design, because it tries to follow the Linux 
block layer's protocol.  To add a new feature to the protocol, in 
practice it has to be already in the block layer, even if there is a 
useful addition that non-Linux guests could use.  Then you have to come 
to an agreement on spec updates, implement it in host, and get the guest 
driver updated.

With virtio-scsi, you sidestep the problems completely, because all you 
need to do in the host is provide a SCSI target with a decent command 
set.  The spec heavily relies on SAM and refers you to it and the other 
SCSI specifications.  New features can be added even before Linux adopts 
a new feature, as soon as SPC or SBC includes them.  You do not need 
separate work on a separate spec, and you do not risk getting that part 
wrong.  And once Linux non-virt devices do gain support for the new 
feature, virt devices also gain it.  Sometimes for free, for example if 
you had already done the host implementation for Windows guests.

>>> Incidentally, REQ_DISCARD was added in 2008.  In that time close to
>>> 50 new commands have been added to SCSI, so the block protocol is
>>> pretty slow moving.
>>
>> That also means that virtio-blk cannot give guests access to the full
>> range of features that they might want to use.  Not all OSes are Linux, not
>> all OSes limit themselves to the features of the Linux block protocol.
>
> So you're trying to solve the non-linux guest problem?  My observation
> from Windows has been that windows device queues mostly behave
> reasonably similarly to Linux ... that's not exactly, but similarly
> enough that we can translate the requests.

That's not the case, actually.  I don't know how Windows device queues 
work, but Windows storage drivers can only hook themselves at the SCSI 
layer.  A Windows storage driver cannot distinguish a read that came 
from the disk driver, from a READ that came from userspace via 
passthrough (not unlike a Linux driver for a SCSI host).

For this reason, Windows virtio-blk devices do not have the equivalent 
of SG_IO.  If you send a READ command via SCSI passthrough, it becomes a 
regular read.  If you send an INQUIRY, you get artificial data that the 
Windows virtio-blk device makes up.

[snip]

> OK, so I think the problem boils down to two components:
>
>       1. virtio-blk isn't developing fast enough.  This looks to be a
>          fairly easily fixable problem

Agreed, the immediate shortcomings are fixable, though the slowness is 
inherent in virtio-blk.  It can even be considered a feature, because it 
is a consequence of virtio-blk's simplicity.

>       2. Discovery in virtio-blk isn't done properly.  Again, this looks
>          to be easily fixable.

No, this is not a problem.  virtio-blk does discovery very well.  The 
problems are all with the SG_IO interface:

1. When you create a virtio-blk device on say /dev/sdb, you have more 
flexibility than just passing /dev/sdb through to the guest.  But if you 
use this flexibility, you have no choice but to disable SG_IO altogether 
(or leave it enabled, and hope the guest doesn't corrupt its own data 
inadvertently).

2. SG_IO is limited to Linux guests, so that non-Linux guests are 
limited in practice to the feature set of the Linux block layer.

3. Even on Linux, SG_IO is not reliably a part of the userspace ABI for 
virtio disks.  That's because it may work or not depending on how 
storage has been configured.

4. SG_IO on virtio-blk does not cover non-block SCSI devices.

> Once you fix the above, most of what you're asking for, which is mainly
> SCSI encapsulation for discovery and error handling in the guest for no
> reason I can discern, becomes irrelevant.

SCSI encapsulation is not an end by itself.  It just lets you reuse work 
on an existing spec rather than making up one.

Paolo


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2011-12-10 16:38 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-11-30 13:54 [PATCH 0/2] virtio-scsi driver Paolo Bonzini
2011-11-30 13:54 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
2011-12-01  6:33   ` Sasha Levin
2011-12-01  8:36     ` Paolo Bonzini
2011-12-01  8:55       ` Sasha Levin
2011-12-02  0:29       ` Rusty Russell
2011-12-02 23:07   ` Benjamin Herrenschmidt
2011-12-03 17:38     ` Paolo Bonzini
2011-12-03 22:22       ` Benjamin Herrenschmidt
2011-12-05 10:08         ` Paolo Bonzini
2011-11-30 13:54 ` [PATCH 2/2] virtio-scsi: add error handling Paolo Bonzini
  -- strict thread matches above, loose matches on Subject: below --
2011-12-05 17:29 [PATCH v2 0/2] virtio-scsi driver Paolo Bonzini
2011-12-05 17:29 ` [PATCH 1/2] virtio-scsi: first version Paolo Bonzini
2011-12-06 18:09   ` James Bottomley
2011-12-07  9:41     ` Paolo Bonzini
2011-12-07 14:35       ` James Bottomley
2011-12-08 13:09         ` Paolo Bonzini
2011-12-09 20:06           ` James Bottomley
2011-12-10 16:37             ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).