kvm.vger.kernel.org archive mirror
* [PATCH 0/3] virtio DMA API core stuff
@ 2015-10-28  1:17 Andy Lutomirski
  2015-10-28  1:17 ` [PATCH 1/3] virtio_net: Stop doing DMA from the stack Andy Lutomirski
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  1:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joerg Roedel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

This switches virtio to use the DMA API unconditionally.  I'm sure
it breaks things, but it seems to work on x86 using virtio-pci, with
and without Xen, and using both the modern 1.0 variant and the
legacy variant.

Andy Lutomirski (3):
  virtio_net: Stop doing DMA from the stack
  virtio_ring: Support DMA APIs
  virtio_pci: Use the DMA API

 drivers/net/virtio_net.c           |  53 +++++++----
 drivers/virtio/Kconfig             |   2 +-
 drivers/virtio/virtio_pci_common.h |   3 +-
 drivers/virtio/virtio_pci_legacy.c |  19 +++-
 drivers/virtio/virtio_pci_modern.c |  34 ++++---
 drivers/virtio/virtio_ring.c       | 179 ++++++++++++++++++++++++++++++-------
 tools/virtio/linux/dma-mapping.h   |  17 ++++
 7 files changed, 241 insertions(+), 66 deletions(-)
 create mode 100644 tools/virtio/linux/dma-mapping.h

-- 
2.4.3


* [PATCH 1/3] virtio_net: Stop doing DMA from the stack
  2015-10-28  1:17 [PATCH 0/3] virtio DMA API core stuff Andy Lutomirski
@ 2015-10-28  1:17 ` Andy Lutomirski
  2015-10-28  2:07   ` Joerg Roedel
  2015-10-28  1:17 ` [PATCH 2/3] virtio_ring: Support DMA APIs Andy Lutomirski
  2015-10-28  1:17 ` [PATCH 3/3] virtio_pci: Use the DMA API Andy Lutomirski
  2 siblings, 1 reply; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  1:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joerg Roedel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski, Andy Lutomirski

From: Andy Lutomirski <luto@amacapital.net>

Once virtio starts using the DMA API, we won't be able to safely DMA
from the stack.  virtio-net does a couple of config DMA requests
from small stack buffers -- switch to using dynamically-allocated
memory.

This should have no effect on any performance-critical code paths.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/net/virtio_net.c | 53 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d8838dedb7a4..4f10f8a58811 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -976,31 +976,43 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 				 struct scatterlist *out)
 {
 	struct scatterlist *sgs[4], hdr, stat;
-	struct virtio_net_ctrl_hdr ctrl;
-	virtio_net_ctrl_ack status = ~0;
+
+	struct {
+		struct virtio_net_ctrl_hdr ctrl;
+		virtio_net_ctrl_ack status;
+	} *buf;
+
 	unsigned out_num = 0, tmp;
+	bool ret;
 
 	/* Caller should know better */
 	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
 
-	ctrl.class = class;
-	ctrl.cmd = cmd;
+	buf = kmalloc(sizeof(*buf), GFP_ATOMIC);
+	if (!buf)
+		return false;
+	buf->status = ~0;
+
+	buf->ctrl.class = class;
+	buf->ctrl.cmd = cmd;
 	/* Add header */
-	sg_init_one(&hdr, &ctrl, sizeof(ctrl));
+	sg_init_one(&hdr, &buf->ctrl, sizeof(buf->ctrl));
 	sgs[out_num++] = &hdr;
 
 	if (out)
 		sgs[out_num++] = out;
 
 	/* Add return status. */
-	sg_init_one(&stat, &status, sizeof(status));
+	sg_init_one(&stat, &buf->status, sizeof(buf->status));
 	sgs[out_num] = &stat;
 
 	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
 	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
 
-	if (unlikely(!virtqueue_kick(vi->cvq)))
-		return status == VIRTIO_NET_OK;
+	if (unlikely(!virtqueue_kick(vi->cvq))) {
+		ret = (buf->status == VIRTIO_NET_OK);
+		goto out;
+	}
 
 	/* Spin for a response, the kick causes an ioport write, trapping
 	 * into the hypervisor, so the request should be handled immediately.
@@ -1009,7 +1021,11 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 	       !virtqueue_is_broken(vi->cvq))
 		cpu_relax();
 
-	return status == VIRTIO_NET_OK;
+	ret = (buf->status == VIRTIO_NET_OK);
+
+out:
+	kfree(buf);
+	return ret;
 }
 
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
@@ -1151,7 +1167,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg[2];
-	u8 promisc, allmulti;
+	u8 *cmdbyte;
 	struct virtio_net_ctrl_mac *mac_data;
 	struct netdev_hw_addr *ha;
 	int uc_count;
@@ -1163,22 +1179,25 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
 		return;
 
-	promisc = ((dev->flags & IFF_PROMISC) != 0);
-	allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
+	cmdbyte = kmalloc(sizeof(*cmdbyte), GFP_ATOMIC);
+	if (!cmdbyte)
+		return;
 
-	sg_init_one(sg, &promisc, sizeof(promisc));
+	sg_init_one(sg, cmdbyte, sizeof(*cmdbyte));
 
+	*cmdbyte = ((dev->flags & IFF_PROMISC) != 0);
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
 		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
-			 promisc ? "en" : "dis");
-
-	sg_init_one(sg, &allmulti, sizeof(allmulti));
+			 *cmdbyte ? "en" : "dis");
 
+	*cmdbyte = ((dev->flags & IFF_ALLMULTI) != 0);
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
 		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
-			 allmulti ? "en" : "dis");
+			 *cmdbyte ? "en" : "dis");
+
+	kfree(cmdbyte);
 
 	uc_count = netdev_uc_count(dev);
 	mc_count = netdev_mc_count(dev);
-- 
2.4.3


* [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  1:17 [PATCH 0/3] virtio DMA API core stuff Andy Lutomirski
  2015-10-28  1:17 ` [PATCH 1/3] virtio_net: Stop doing DMA from the stack Andy Lutomirski
@ 2015-10-28  1:17 ` Andy Lutomirski
  2015-10-28  2:06   ` Joerg Roedel
  2015-10-28  2:27   ` Christian Borntraeger
  2015-10-28  1:17 ` [PATCH 3/3] virtio_pci: Use the DMA API Andy Lutomirski
  2 siblings, 2 replies; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  1:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joerg Roedel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

From: Andy Lutomirski <luto@amacapital.net>

virtio_ring currently sends the device (usually a hypervisor)
physical addresses of its I/O buffers.  This is okay when DMA
addresses and physical addresses are the same thing, but this isn't
always the case.  For example, this never works on Xen guests, and
it is likely to fail if a physical "virtio" device ever ends up
behind an IOMMU or swiotlb.

The immediate use case for me is to enable virtio on Xen guests.
For that to work, we need vring to support DMA address translation
as well as a corresponding change to virtio_pci or to another
driver.

With this patch, if enabled, virtfs survives kmemleak and
CONFIG_DMA_API_DEBUG.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 drivers/virtio/Kconfig           |   2 +-
 drivers/virtio/virtio_ring.c     | 179 +++++++++++++++++++++++++++++++--------
 tools/virtio/linux/dma-mapping.h |  17 ++++
 3 files changed, 164 insertions(+), 34 deletions(-)
 create mode 100644 tools/virtio/linux/dma-mapping.h

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index cab9f3f63a38..77590320d44c 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -60,7 +60,7 @@ config VIRTIO_INPUT
 
  config VIRTIO_MMIO
 	tristate "Platform bus driver for memory mapped virtio devices"
-	depends on HAS_IOMEM
+	depends on HAS_IOMEM && HAS_DMA
  	select VIRTIO
  	---help---
  	 This drivers provides support for memory mapped virtio
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 096b857e7b75..911fbaa7a260 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/hrtimer.h>
 #include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
 
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
@@ -54,7 +55,14 @@
 #define END_USE(vq)
 #endif
 
-struct vring_virtqueue {
+struct vring_desc_state
+{
+	void *data;			/* Data for callback. */
+	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+};
+
+struct vring_virtqueue
+{
 	struct virtqueue vq;
 
 	/* Actual memory layout for this queue */
@@ -92,12 +100,71 @@ struct vring_virtqueue {
 	ktime_t last_add_time;
 #endif
 
-	/* Tokens for callbacks. */
-	void *data[];
+	/* Per-descriptor state. */
+	struct vring_desc_state desc_state[];
 };
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
+/*
+ * The DMA ops on various arches are rather gnarly right now, and
+ * making all of the arch DMA ops work on the vring device itself
+ * is a mess.  For now, we use the parent device for DMA ops.
+ */
+struct device *vring_dma_dev(const struct vring_virtqueue *vq)
+{
+	return vq->vq.vdev->dev.parent;
+}
+
+/* Map one sg entry. */
+static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
+				   struct scatterlist *sg,
+				   enum dma_data_direction direction)
+{
+	/*
+	 * We can't use dma_map_sg, because we don't use scatterlists in
+	 * the way it expects (we don't guarantee that the scatterlist
+	 * will exist for the lifetime of the mapping).
+	 */
+	return dma_map_page(vring_dma_dev(vq),
+			    sg_page(sg), sg->offset, sg->length,
+			    direction);
+}
+
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+				   void *cpu_addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	return dma_map_single(vring_dma_dev(vq),
+			      cpu_addr, size, direction);
+}
+
+static void vring_unmap_one(const struct vring_virtqueue *vq,
+			    struct vring_desc *desc)
+{
+	u16 flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+	if (flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vring_dma_dev(vq),
+				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
+				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 (flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vring_dma_dev(vq),
+			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
+			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       (flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
 static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 					 unsigned int total_sg, gfp_t gfp)
 {
@@ -131,7 +198,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
 	struct vring_desc *desc;
-	unsigned int i, n, avail, descs_used, uninitialized_var(prev);
+	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
 	int head;
 	bool indirect;
 
@@ -171,21 +238,25 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 
 	if (desc) {
 		/* Use a single buffer which doesn't continue */
+		indirect = true;
 		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
-		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc));
-		/* avoid kmemleak false positive (hidden by virt_to_phys) */
-		kmemleak_ignore(desc);
+		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, vring_map_single(
+			vq,
+			desc, total_sg * sizeof(struct vring_desc),
+			DMA_TO_DEVICE));
+		if (vring_mapping_error(vq, vq->vring.desc[head].addr))
+			goto unmap_free_indir;
+
 		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
 
 		/* Set up rest to use this indirect table. */
 		i = 0;
 		descs_used = 1;
-		indirect = true;
 	} else {
+		indirect = false;
 		desc = vq->vring.desc;
 		i = head;
 		descs_used = total_sg;
-		indirect = false;
 	}
 
 	if (vq->vq.num_free < descs_used) {
@@ -200,14 +271,13 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		return -ENOSPC;
 	}
 
-	/* We're about to use some buffers from the free list. */
-	vq->vq.num_free -= descs_used;
-
 	for (n = 0; n < out_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
 			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
-			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, vring_map_one_sg(vq, sg, DMA_TO_DEVICE));
 			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			if (vring_mapping_error(vq, desc[i].addr))
+				goto unmap_release;
 			prev = i;
 			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
 		}
@@ -215,8 +285,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	for (; n < (out_sgs + in_sgs); n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
 			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
-			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, vring_map_one_sg(vq, sg, DMA_FROM_DEVICE));
 			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			if (vring_mapping_error(vq, desc[i].addr))
+				goto unmap_release;
 			prev = i;
 			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
 		}
@@ -224,14 +296,19 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* Last one doesn't continue. */
 	desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
 
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
 	/* Update free pointer */
 	if (indirect)
 		vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
 	else
 		vq->free_head = i;
 
-	/* Set token. */
-	vq->data[head] = data;
+	/* Store token and indirect buffer state. */
+	vq->desc_state[head].data = data;
+	if (indirect)
+		vq->desc_state[head].indir_desc = desc;
 
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
@@ -253,6 +330,28 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		virtqueue_kick(_vq);
 
 	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one(vq, &desc[i]);
+		i = vq->vring.desc[i].next;
+	}
+
+	vq->vq.num_free += total_sg;
+
+	if (indirect)
+		vring_unmap_one(vq, desc);
+
+unmap_free_indir:
+	if (indirect)
+		kfree(desc);
+
+	return -EIO;
 }
 
 /**
@@ -423,27 +522,42 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
 
 static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
 {
-	unsigned int i;
+	unsigned int i, j;
+	u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
 
 	/* Clear data ptr. */
-	vq->data[head] = NULL;
+	vq->desc_state[head].data = NULL;
 
-	/* Put back on free list: find end */
+	/* Put back on free list: unmap first-level descriptors and find end */
 	i = head;
 
-	/* Free the indirect table */
-	if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
-		kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
-
-	while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
+	while (vq->vring.desc[i].flags & nextflag) {
+		vring_unmap_one(vq, &vq->vring.desc[i]);
 		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
 		vq->vq.num_free++;
 	}
 
+	vring_unmap_one(vq, &vq->vring.desc[i]);
 	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
 	vq->free_head = head;
+
 	/* Plus final descriptor */
 	vq->vq.num_free++;
+
+	/* Free the indirect table, if any, now that it's unmapped. */
+	if (vq->desc_state[head].indir_desc) {
+		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+		u32 len = vq->vring.desc[head].len;
+
+		BUG_ON(!(vq->vring.desc[head].flags & VRING_DESC_F_INDIRECT));
+		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_desc); j++)
+			vring_unmap_one(vq, &indir_desc[j]);
+
+		kfree(vq->desc_state[head].indir_desc);
+		vq->desc_state[head].indir_desc = NULL;
+	}
 }
 
 static inline bool more_used(const struct vring_virtqueue *vq)
@@ -498,13 +612,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 		BAD_RING(vq, "id %u out of range\n", i);
 		return NULL;
 	}
-	if (unlikely(!vq->data[i])) {
+	if (unlikely(!vq->desc_state[i].data)) {
 		BAD_RING(vq, "id %u is not a head!\n", i);
 		return NULL;
 	}
 
 	/* detach_buf clears data, so grab it now. */
-	ret = vq->data[i];
+	ret = vq->desc_state[i].data;
 	detach_buf(vq, i);
 	vq->last_used_idx++;
 	/* If we expect an interrupt for the next entry, tell host
@@ -665,10 +779,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 	START_USE(vq);
 
 	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->data[i])
+		if (!vq->desc_state[i].data)
 			continue;
 		/* detach_buf clears data, so grab it now. */
-		buf = vq->data[i];
+		buf = vq->desc_state[i].data;
 		detach_buf(vq, i);
 		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - 1);
 		END_USE(vq);
@@ -721,7 +835,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 		return NULL;
 	}
 
-	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
+		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
@@ -751,11 +866,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 
 	/* Put everything in free lists. */
 	vq->free_head = 0;
-	for (i = 0; i < num-1; i++) {
+	for (i = 0; i < num-1; i++)
 		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-		vq->data[i] = NULL;
-	}
-	vq->data[i] = NULL;
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h
new file mode 100644
index 000000000000..4f93af89ae16
--- /dev/null
+++ b/tools/virtio/linux/dma-mapping.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_DMA_MAPPING_H
+#define _LINUX_DMA_MAPPING_H
+
+#ifdef CONFIG_HAS_DMA
+# error Virtio userspace code does not support CONFIG_HAS_DMA
+#endif
+
+#define PCI_DMA_BUS_IS_PHYS 1
+
+enum dma_data_direction {
+	DMA_BIDIRECTIONAL = 0,
+	DMA_TO_DEVICE = 1,
+	DMA_FROM_DEVICE = 2,
+	DMA_NONE = 3,
+};
+
+#endif
-- 
2.4.3


* [PATCH 3/3] virtio_pci: Use the DMA API
  2015-10-28  1:17 [PATCH 0/3] virtio DMA API core stuff Andy Lutomirski
  2015-10-28  1:17 ` [PATCH 1/3] virtio_net: Stop doing DMA from the stack Andy Lutomirski
  2015-10-28  1:17 ` [PATCH 2/3] virtio_ring: Support DMA APIs Andy Lutomirski
@ 2015-10-28  1:17 ` Andy Lutomirski
  2015-10-28  2:15   ` Joerg Roedel
  2 siblings, 1 reply; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  1:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joerg Roedel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski, Andy Lutomirski

From: Andy Lutomirski <luto@amacapital.net>

This fixes virtio-pci on platforms and busses that have IOMMUs.  This
will break the experimental QEMU Q35 IOMMU support until QEMU is
fixed.  In exchange, it fixes physical virtio hardware as well as
virtio-pci running under Xen.

We should clean up the virtqueue API to do its own allocation and
teach virtqueue_get_avail and virtqueue_get_used to return DMA
addresses directly.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/virtio/virtio_pci_common.h |  3 ++-
 drivers/virtio/virtio_pci_legacy.c | 19 +++++++++++++++----
 drivers/virtio/virtio_pci_modern.c | 34 ++++++++++++++++++++++++----------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index b976d968e793..cd6196b513ad 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -38,8 +38,9 @@ struct virtio_pci_vq_info {
 	/* the number of entries in the queue */
 	int num;
 
-	/* the virtual address of the ring queue */
+	/* the ring queue */
 	void *queue;
+	dma_addr_t queue_dma_addr;      /* bus address */
 
 	/* the list node for the virtqueues list */
 	struct list_head node;
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 48bc9797e530..b5293e5f2af4 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -135,12 +135,14 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	info->msix_vector = msix_vec;
 
 	size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
-	info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
+	info->queue = dma_zalloc_coherent(&vp_dev->pci_dev->dev, size,
+					  &info->queue_dma_addr,
+					  GFP_KERNEL);
 	if (info->queue == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	/* activate the queue */
-	iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+	iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
@@ -169,7 +171,8 @@ out_assign:
 	vring_del_virtqueue(vq);
 out_activate_queue:
 	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
-	free_pages_exact(info->queue, size);
+	dma_free_coherent(&vp_dev->pci_dev->dev, size,
+			  info->queue, info->queue_dma_addr);
 	return ERR_PTR(err);
 }
 
@@ -194,7 +197,8 @@ static void del_vq(struct virtio_pci_vq_info *info)
 	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
-	free_pages_exact(info->queue, size);
+	dma_free_coherent(&vp_dev->pci_dev->dev, size,
+			  info->queue, info->queue_dma_addr);
 }
 
 static const struct virtio_config_ops virtio_pci_config_ops = {
@@ -227,6 +231,13 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
 		return -ENODEV;
 	}
 
+	rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (rc)
+		rc = dma_set_mask_and_coherent(&pci_dev->dev,
+						DMA_BIT_MASK(32));
+	if (rc)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
+
 	rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
 	if (rc)
 		return rc;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 8e5cf194cc0b..fbe0bd1c4881 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -293,14 +293,16 @@ static size_t vring_pci_size(u16 num)
 	return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES));
 }
 
-static void *alloc_virtqueue_pages(int *num)
+static void *alloc_virtqueue_pages(struct virtio_pci_device *vp_dev,
+				   int *num, dma_addr_t *dma_addr)
 {
 	void *pages;
 
 	/* TODO: allocate each queue chunk individually */
 	for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) {
-		pages = alloc_pages_exact(vring_pci_size(*num),
-					  GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
+		pages = dma_zalloc_coherent(
+			&vp_dev->pci_dev->dev, vring_pci_size(*num),
+			dma_addr, GFP_KERNEL|__GFP_NOWARN);
 		if (pages)
 			return pages;
 	}
@@ -309,7 +311,9 @@ static void *alloc_virtqueue_pages(int *num)
 		return NULL;
 
 	/* Try to get a single page. You are my only hope! */
-	return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO);
+	return dma_zalloc_coherent(
+		&vp_dev->pci_dev->dev, vring_pci_size(*num),
+		dma_addr, GFP_KERNEL);
 }
 
 static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
@@ -346,7 +350,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	info->num = num;
 	info->msix_vector = msix_vec;
 
-	info->queue = alloc_virtqueue_pages(&info->num);
+	info->queue = alloc_virtqueue_pages(vp_dev, &info->num,
+					    &info->queue_dma_addr);
 	if (info->queue == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -361,11 +366,11 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 
 	/* activate the queue */
 	vp_iowrite16(num, &cfg->queue_size);
-	vp_iowrite64_twopart(virt_to_phys(info->queue),
+	vp_iowrite64_twopart(info->queue_dma_addr,
 			     &cfg->queue_desc_lo, &cfg->queue_desc_hi);
-	vp_iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)),
+	vp_iowrite64_twopart(info->queue_dma_addr + ((char *)virtqueue_get_avail(vq) - (char *)info->queue),
 			     &cfg->queue_avail_lo, &cfg->queue_avail_hi);
-	vp_iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)),
+	vp_iowrite64_twopart(info->queue_dma_addr + ((char *)virtqueue_get_used(vq) - (char *)info->queue),
 			     &cfg->queue_used_lo, &cfg->queue_used_hi);
 
 	if (vp_dev->notify_base) {
@@ -411,7 +416,8 @@ err_assign_vector:
 err_map_notify:
 	vring_del_virtqueue(vq);
 err_new_queue:
-	free_pages_exact(info->queue, vring_pci_size(info->num));
+	dma_free_coherent(&vp_dev->pci_dev->dev, vring_pci_size(info->num),
+			  info->queue, info->queue_dma_addr);
 	return ERR_PTR(err);
 }
 
@@ -457,7 +463,8 @@ static void del_vq(struct virtio_pci_vq_info *info)
 
 	vring_del_virtqueue(vq);
 
-	free_pages_exact(info->queue, vring_pci_size(info->num));
+	dma_free_coherent(&vp_dev->pci_dev->dev, vring_pci_size(info->num),
+			  info->queue, info->queue_dma_addr);
 }
 
 static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
@@ -641,6 +648,13 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 		return -EINVAL;
 	}
 
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err)
+		err = dma_set_mask_and_coherent(&pci_dev->dev,
+						DMA_BIT_MASK(32));
+	if (err)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
+
 	/* Device capability is only mandatory for devices that have
 	 * device-specific configuration.
 	 */
-- 
2.4.3


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  1:17 ` [PATCH 2/3] virtio_ring: Support DMA APIs Andy Lutomirski
@ 2015-10-28  2:06   ` Joerg Roedel
  2015-10-28  2:13     ` Andy Lutomirski
  2015-10-28  2:27   ` Christian Borntraeger
  1 sibling, 1 reply; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-kernel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

Hi Andy,

On Tue, Oct 27, 2015 at 06:17:09PM -0700, Andy Lutomirski wrote:
> From: Andy Lutomirski <luto@amacapital.net>
> 
> virtio_ring currently sends the device (usually a hypervisor)
> physical addresses of its I/O buffers.  This is okay when DMA
> addresses and physical addresses are the same thing, but this isn't
> always the case.  For example, this never works on Xen guests, and
> it is likely to fail if a physical "virtio" device ever ends up
> behind an IOMMU or swiotlb.

The overall code looks good, but I haven't seen any dma_sync* calls.
When swiotlb=force is in use this would break.

> +		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, vring_map_single(
> +			vq,
> +			desc, total_sg * sizeof(struct vring_desc),
> +			DMA_TO_DEVICE));

Nit: I think readability could be improved here by using a temporary
variable for the return value of vring_map_single().


	Joerg


* Re: [PATCH 1/3] virtio_net: Stop doing DMA from the stack
  2015-10-28  1:17 ` [PATCH 1/3] virtio_net: Stop doing DMA from the stack Andy Lutomirski
@ 2015-10-28  2:07   ` Joerg Roedel
  2015-10-28  2:14     ` Andy Lutomirski
  0 siblings, 1 reply; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-kernel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

On Tue, Oct 27, 2015 at 06:17:08PM -0700, Andy Lutomirski wrote:
> From: Andy Lutomirski <luto@amacapital.net>
> 
> Once virtio starts using the DMA API, we won't be able to safely DMA
> from the stack.  virtio-net does a couple of config DMA requests
> from small stack buffers -- switch to using dynamically-allocated
> memory.
> 
> This should have no effect on any performance-critical code paths.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  drivers/net/virtio_net.c | 53 ++++++++++++++++++++++++++++++++----------------
>  1 file changed, 36 insertions(+), 17 deletions(-)

Reviewed-by: Joerg Roedel <jroedel@suse.de>


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  2:06   ` Joerg Roedel
@ 2015-10-28  2:13     ` Andy Lutomirski
  2015-10-28  2:21       ` Joerg Roedel
  0 siblings, 1 reply; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  2:13 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org,
	Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, Benjamin Herrenschmidt, KVM,
	David Woodhouse, Martin Schwidefsky, linux-s390

On Tue, Oct 27, 2015 at 7:06 PM, Joerg Roedel <jroedel@suse.de> wrote:
> Hi Andy,
>
> On Tue, Oct 27, 2015 at 06:17:09PM -0700, Andy Lutomirski wrote:
>> From: Andy Lutomirski <luto@amacapital.net>
>>
>> virtio_ring currently sends the device (usually a hypervisor)
>> physical addresses of its I/O buffers.  This is okay when DMA
>> addresses and physical addresses are the same thing, but this isn't
>> always the case.  For example, this never works on Xen guests, and
>> it is likely to fail if a physical "virtio" device ever ends up
>> behind an IOMMU or swiotlb.
>
> The overall code looks good, but I haven't seen any dma_sync* calls.
> When swiotlb=force is in use this would break.
>
>> +             vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, vring_map_single(
>> +                     vq,
>> +                     desc, total_sg * sizeof(struct vring_desc),
>> +                     DMA_TO_DEVICE));
>

Are you talking about a dma_sync call on the descriptor ring itself?
Isn't dma_alloc_coherent supposed to make that unnecessary?  I should
move the allocation into the virtqueue code.

The docs suggest that I might need to "flush the processor's write
buffers before telling devices to read that memory".  I'm not sure how
to do that.

> Nit: I think readability could be improved here by using a temporary
> variable for the return value of vring_map_single().
>

I'll do something like that for v2.

--Andy


* Re: [PATCH 1/3] virtio_net: Stop doing DMA from the stack
  2015-10-28  2:07   ` Joerg Roedel
@ 2015-10-28  2:14     ` Andy Lutomirski
  0 siblings, 0 replies; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  2:14 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org,
	Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, Benjamin Herrenschmidt, KVM,
	David Woodhouse, Martin Schwidefsky, linux-s390

On Tue, Oct 27, 2015 at 7:07 PM, Joerg Roedel <jroedel@suse.de> wrote:
> On Tue, Oct 27, 2015 at 06:17:08PM -0700, Andy Lutomirski wrote:
>> From: Andy Lutomirski <luto@amacapital.net>
>>
>> Once virtio starts using the DMA API, we won't be able to safely DMA
>> from the stack.  virtio-net does a couple of config DMA requests
>> from small stack buffers -- switch to using dynamically-allocated
>> memory.
>>
>> This should have no effect on any performance-critical code paths.
>>
>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>> ---
>>  drivers/net/virtio_net.c | 53 ++++++++++++++++++++++++++++++++----------------
>>  1 file changed, 36 insertions(+), 17 deletions(-)
>
> Reviewed-by: Joerg Roedel <jroedel@suse.de>

Should I just send this to netdev now for 4.4?  We could get this one
out of the way, maybe.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC


* Re: [PATCH 3/3] virtio_pci: Use the DMA API
  2015-10-28  1:17 ` [PATCH 3/3] virtio_pci: Use the DMA API Andy Lutomirski
@ 2015-10-28  2:15   ` Joerg Roedel
  2015-10-28  2:22     ` David Woodhouse
  2015-10-28  2:25     ` Joerg Roedel
  0 siblings, 2 replies; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-kernel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

On Tue, Oct 27, 2015 at 06:17:10PM -0700, Andy Lutomirski wrote:
> From: Andy Lutomirski <luto@amacapital.net>
> 
> This fixes virtio-pci on platforms and busses that have IOMMUs.  This
> will break the experimental QEMU Q35 IOMMU support until QEMU is
> fixed.  In exchange, it fixes physical virtio hardware as well as
> virtio-pci running under Xen.
> 
> We should clean up the virtqueue API to do its own allocation and
> teach virtqueue_get_avail and virtqueue_get_used to return DMA
> addresses directly.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  drivers/virtio/virtio_pci_common.h |  3 ++-
>  drivers/virtio/virtio_pci_legacy.c | 19 +++++++++++++++----
>  drivers/virtio/virtio_pci_modern.c | 34 ++++++++++++++++++++++++----------
>  3 files changed, 41 insertions(+), 15 deletions(-)

Same here, you need to call the dma_sync* functions when passing data
from/to the virtio-device.

I think a good test for that is to boot a virtio kvm-guest with
swiotlb=force and see if it still works.



	Joerg


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  2:13     ` Andy Lutomirski
@ 2015-10-28  2:21       ` Joerg Roedel
  2015-10-28  5:12         ` Andy Lutomirski
  0 siblings, 1 reply; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org,
	Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, Benjamin Herrenschmidt, KVM,
	David Woodhouse, Martin Schwidefsky, linux-s390

On Tue, Oct 27, 2015 at 07:13:56PM -0700, Andy Lutomirski wrote:
> On Tue, Oct 27, 2015 at 7:06 PM, Joerg Roedel <jroedel@suse.de> wrote:
> > Hi Andy,
> >
> > On Tue, Oct 27, 2015 at 06:17:09PM -0700, Andy Lutomirski wrote:
> >> From: Andy Lutomirski <luto@amacapital.net>
> >>
> >> virtio_ring currently sends the device (usually a hypervisor)
> >> physical addresses of its I/O buffers.  This is okay when DMA
> >> addresses and physical addresses are the same thing, but this isn't
> >> always the case.  For example, this never works on Xen guests, and
> >> it is likely to fail if a physical "virtio" device ever ends up
> >> behind an IOMMU or swiotlb.
> >
> > The overall code looks good, but I haven't seen any dma_sync* calls.
> > When swiotlb=force is in use this would break.
> >
> >> +             vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, vring_map_single(
> >> +                     vq,
> >> +                     desc, total_sg * sizeof(struct vring_desc),
> >> +                     DMA_TO_DEVICE));
> >
> 
> Are you talking about a dma_sync call on the descriptor ring itself?
> Isn't dma_alloc_coherent supposed to make that unnecessary?  I should
> move the allocation into the virtqueue code.
> 
> The docs suggest that I might need to "flush the processor's write
> buffers before telling devices to read that memory".  I'm not sure how
> to do that.

The write buffers should be flushed by the dma-api functions if
necessary.  For dma_alloc_coherent allocations you don't need to call
dma_sync*, but you do for the map_single/map_page/map_sg ones, as these
might be bounce-buffered.
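
For illustration, a minimal sketch of that distinction (dev, buf, len and
ring_size are hypothetical names, not taken from the patch):

	/* Coherent allocation: no dma_sync* needed. */
	dma_addr_t ring_dma;
	void *ring = dma_alloc_coherent(dev, ring_size, &ring_dma, GFP_KERNEL);

	/* Streaming mapping: may be bounce-buffered (e.g. swiotlb=force),
	 * so it must be synced around each device access. */
	dma_addr_t addr = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;
	/* ... device DMAs into addr ... */
	dma_sync_single_for_cpu(dev, addr, len, DMA_FROM_DEVICE);
	/* ... CPU can now safely read buf ... */
	dma_unmap_single(dev, addr, len, DMA_FROM_DEVICE);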


	Joerg


* Re: [PATCH 3/3] virtio_pci: Use the DMA API
  2015-10-28  2:15   ` Joerg Roedel
@ 2015-10-28  2:22     ` David Woodhouse
  2015-10-28  2:50       ` Joerg Roedel
  2015-10-28  2:25     ` Joerg Roedel
  1 sibling, 1 reply; 16+ messages in thread
From: David Woodhouse @ 2015-10-28  2:22 UTC (permalink / raw)
  To: Joerg Roedel, Andy Lutomirski
  Cc: linux-kernel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, Martin Schwidefsky,
	linux-s390, Andy Lutomirski


On Wed, 2015-10-28 at 11:15 +0900, Joerg Roedel wrote:
> On Tue, Oct 27, 2015 at 06:17:10PM -0700, Andy Lutomirski wrote:
> > From: Andy Lutomirski <luto@amacapital.net>
> > 
> > This fixes virtio-pci on platforms and busses that have IOMMUs.  This
> > will break the experimental QEMU Q35 IOMMU support until QEMU is
> > fixed.  In exchange, it fixes physical virtio hardware as well as
> > virtio-pci running under Xen.
> > 
> > We should clean up the virtqueue API to do its own allocation and
> > teach virtqueue_get_avail and virtqueue_get_used to return DMA
> > addresses directly.
> > 
> > Signed-off-by: Andy Lutomirski <luto@kernel.org>
> > ---
> >  drivers/virtio/virtio_pci_common.h |  3 ++-
> >  drivers/virtio/virtio_pci_legacy.c | 19 +++++++++++++++----
> >  drivers/virtio/virtio_pci_modern.c | 34 ++++++++++++++++++++++++----------
> >  3 files changed, 41 insertions(+), 15 deletions(-)
> 
> Same here, you need to call the dma_sync* functions when passing data
> from/to the virtio-device.
> 
> I think a good test for that is to boot a virtio kvm-guest with
> swiotlb=force and see if it still works.

That's useful but doesn't cover the cases where dma_wmb() is needed,
right? 

We should make sure we're handling descriptors properly as we would for
real hardware, and ensuring that addresses etc. are written to the
descriptor (and a write barrier occurs) *before* the bit is set which
tells the 'hardware' that it owns that descriptor.

AFAICT we're currently setting the flags *first* in virtqueue_add(),
to say nothing of the missing write barrier between the two.
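
For illustration, the ordering being asked for, sketched for a generic
hardware-style descriptor (DESC_F_HW_OWNED and the field names are
hypothetical, not the vring layout):

	desc->addr = cpu_to_le64(dma_addr);
	desc->len  = cpu_to_le32(len);
	/* Make addr/len visible before the device can claim the descriptor. */
	dma_wmb();
	desc->flags = cpu_to_le16(flags | DESC_F_HW_OWNED);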

-- 
dwmw2





* Re: [PATCH 3/3] virtio_pci: Use the DMA API
  2015-10-28  2:15   ` Joerg Roedel
  2015-10-28  2:22     ` David Woodhouse
@ 2015-10-28  2:25     ` Joerg Roedel
  1 sibling, 0 replies; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:25 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-kernel, Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, benh, KVM, dwmw2,
	Martin Schwidefsky, linux-s390, Andy Lutomirski

On Wed, Oct 28, 2015 at 11:15:30AM +0900, Joerg Roedel wrote:
> Same here, you need to call the dma_sync* functions when passing data
> from/to the virtio-device.

Okay, forget about this comment. This patch only converts to
dma_coherent allocations, which don't need synchronization.

> I think a good test for that is to boot a virtio kvm-guest with
> swiotlb=force and see if it still works.

But this still holds: it's a good way to test whether your changes work
with bounce-buffering.  Together with DMA_API_DEBUG you can also see
whether the dma_directions you specified are right.
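
As a concrete example of that test setup (the image names are made up;
swiotlb=force and CONFIG_DMA_API_DEBUG are the knobs mentioned above):

	# guest kernel config
	CONFIG_DMA_API_DEBUG=y

	# boot the guest with bounce-buffering forced
	qemu-system-x86_64 -enable-kvm -kernel bzImage \
		-drive file=disk.img,if=virtio \
		-append "root=/dev/vda swiotlb=force"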



	Joerg


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  1:17 ` [PATCH 2/3] virtio_ring: Support DMA APIs Andy Lutomirski
  2015-10-28  2:06   ` Joerg Roedel
@ 2015-10-28  2:27   ` Christian Borntraeger
  2015-10-28  5:09     ` Andy Lutomirski
  1 sibling, 1 reply; 16+ messages in thread
From: Christian Borntraeger @ 2015-10-28  2:27 UTC (permalink / raw)
  To: Andy Lutomirski, linux-kernel
  Cc: Joerg Roedel, Cornelia Huck, Sebastian Ott, Paolo Bonzini,
	Christoph Hellwig, benh, KVM, dwmw2, Martin Schwidefsky,
	linux-s390, Andy Lutomirski

On 28.10.2015 at 10:17, Andy Lutomirski wrote:
> @@ -423,27 +522,42 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
> 
>  static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>  {
> -	unsigned int i;
> +	unsigned int i, j;
> +	u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> 
>  	/* Clear data ptr. */
> -	vq->data[head] = NULL;
> +	vq->desc_state[head].data = NULL;
> 
> -	/* Put back on free list: find end */
> +	/* Put back on free list: unmap first-level descriptors and find end */
>  	i = head;
> 
> -	/* Free the indirect table */
> -	if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
> -		kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
> -
> -	while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
> +	while (vq->vring.desc[i].flags & nextflag) {
> +		vring_unmap_one(vq, &vq->vring.desc[i]);
>  		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>  		vq->vq.num_free++;
>  	}
> 
> +	vring_unmap_one(vq, &vq->vring.desc[i]);
>  	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>  	vq->free_head = head;
> +
>  	/* Plus final descriptor */
>  	vq->vq.num_free++;
> +
> +	/* Free the indirect table, if any, now that it's unmapped. */
> +	if (vq->desc_state[head].indir_desc) {
> +		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
> +		u32 len = vq->vring.desc[head].len;
> +
> +		BUG_ON(!(vq->vring.desc[head].flags & VRING_DESC_F_INDIRECT));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +			vring_unmap_one(vq, &indir_desc[j]);
> +
> +		kfree(vq->desc_state[head].indir_desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	}
>  }

something seems to be broken with indirect descriptors with that change

[    1.885451] ------------[ cut here ]------------
[    1.885454] kernel BUG at drivers/virtio/virtio_ring.c:552!
[    1.885487] illegal operation: 0001 ilc:1 [#1] SMP 
[    1.885489] Modules linked in:
[    1.885491] CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.3.0-rc3+ #250
[    1.885493] task: 000000000be49988 ti: 000000000be54000 task.ti: 000000000be54000
[    1.885514] Krnl PSW : 0404c00180000000 000000000059b9d2 (detach_buf+0x1ca/0x1d0)
[    1.885519]            R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 EA:3
Krnl GPRS: 0000000000000000 0000000030000000 000000000c0c8c00 000000000a89c000
[    1.885522]            000000000059b8fa 0000000000000001 0000000000000000 0000000000000000
[    1.885523]            0000000000000000 0000000000000000 0000000000000100 000000000a89c000
[    1.885525]            000000000c082000 000000000c082000 000000000059b8fa 000000000c7fbc88
[    1.885531] Krnl Code: 000000000059b9c6: a7f4ff40		brc	15,59b846
           000000000059b9ca: a7f40001		brc	15,59b9cc
          #000000000059b9ce: a7f40001		brc	15,59b9d0
          >000000000059b9d2: 0707		bcr	0,%r7
           000000000059b9d4: 0707		bcr	0,%r7
           000000000059b9d6: 0707		bcr	0,%r7
           000000000059b9d8: c00400000000	brcl	0,59b9d8
           000000000059b9de: ebcff0780024	stmg	%r12,%r15,120(%r15)
[    1.885542] Call Trace:
[    1.885544] ([<000000000059b8fa>] detach_buf+0xf2/0x1d0)
[    1.885546]  [<000000000059bbb4>] virtqueue_get_buf+0xcc/0x218
[    1.885549]  [<00000000005dd8fe>] virtblk_done+0xa6/0x120
[    1.885550]  [<000000000059b66e>] vring_interrupt+0x7e/0x108
[    1.885552]  [<00000000006c21a4>] virtio_airq_handler+0x7c/0x120
[    1.885554]  [<0000000000657e54>] do_airq_interrupt+0xa4/0xc8
[    1.885558]  [<00000000001b79a0>] handle_irq_event_percpu+0x60/0x1f0
[    1.885560]  [<00000000001bbbea>] handle_percpu_irq+0x72/0xa0
[    1.885561]  [<00000000001b6fa4>] generic_handle_irq+0x4c/0x78
[    1.885564]  [<000000000010cc7c>] do_IRQ+0x64/0x88
[    1.885566] Btrfs loaded
[    1.885600]  [<000000000080e30a>] io_int_handler+0x10a/0x218
[    1.885603]  [<0000000000186b9e>] wake_up_new_task+0x1be/0x260
[    1.885604] ([<0000000000186b6a>] wake_up_new_task+0x18a/0x260)
[    1.885606]  [<0000000000156182>] _do_fork+0xfa/0x338
[    1.885608]  [<000000000015644e>] kernel_thread+0x4e/0x60
[    1.885609]  [<0000000000179202>] kthreadd+0x1a2/0x238
[    1.885611]  [<000000000080e056>] kernel_thread_starter+0x6/0xc
[    1.885613]  [<000000000080e050>] kernel_thread_starter+0x0/0xc
[    1.885614] Last Breaking-Event-Address:
[    1.885615]  [<000000000059b9ce>] detach_buf+0x1c6/0x1d0
[    1.885617]  
[    1.885618] Kernel panic - not syncing: Fatal exception in interrupt


* Re: [PATCH 3/3] virtio_pci: Use the DMA API
  2015-10-28  2:22     ` David Woodhouse
@ 2015-10-28  2:50       ` Joerg Roedel
  0 siblings, 0 replies; 16+ messages in thread
From: Joerg Roedel @ 2015-10-28  2:50 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Andy Lutomirski, linux-kernel, Christian Borntraeger,
	Cornelia Huck, Sebastian Ott, Paolo Bonzini, Christoph Hellwig,
	benh, KVM, Martin Schwidefsky, linux-s390, Andy Lutomirski

On Wed, Oct 28, 2015 at 11:22:52AM +0900, David Woodhouse wrote:
> On Wed, 2015-10-28 at 11:15 +0900, Joerg Roedel wrote:
> > I think a good test for that is to boot a virtio kvm-guest with
> > swiotlb=force and see if it still works.
> 
> That's useful but doesn't cover the cases where dma_wmb() is needed,
> right? 
> 
> We should make sure we're handling descriptors properly as we would for
> real hardware, and ensuring that addresses etc. are written to the
> descriptor (and a write barrier occurs) *before* the bit is set which
> tells the 'hardware' that it owns that descriptor.

Hmm, good question. The virtio code has virtio_rmb/wmb and should
already call them in the right places.

The virtio barriers default to rmb()/wmb() unless weak_barriers is set,
in which case they call dma_rmb()/dma_wmb(), which are just compiler
barriers on x86.

And weak_barriers is set by the virtio drivers when creating the queue,
and the drivers should know what they are doing.  That said, I think
the virtio drivers should already get this right.
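
For illustration, a minimal sketch of the selection described above (the
same shape as the kernel's virtio_rmb/virtio_wmb helpers, not a verbatim
copy of the source):

	static inline void virtio_wmb(bool weak_barriers)
	{
		if (weak_barriers)
			dma_wmb();	/* compiler barrier only on x86 */
		else
			wmb();
	}

	static inline void virtio_rmb(bool weak_barriers)
	{
		if (weak_barriers)
			dma_rmb();
		else
			rmb();
	}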


	Joerg


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  2:27   ` Christian Borntraeger
@ 2015-10-28  5:09     ` Andy Lutomirski
  0 siblings, 0 replies; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  5:09 UTC (permalink / raw)
  To: Christian Borntraeger
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org, Joerg Roedel,
	Cornelia Huck, Sebastian Ott, Paolo Bonzini, Christoph Hellwig,
	Benjamin Herrenschmidt, KVM, David Woodhouse, Martin Schwidefsky,
	linux-s390

On Tue, Oct 27, 2015 at 7:27 PM, Christian Borntraeger
<borntraeger@de.ibm.com> wrote:
> On 28.10.2015 at 10:17, Andy Lutomirski wrote:
> @@ -423,27 +522,42 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
>>
>>  static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>>  {
>> -     unsigned int i;
>> +     unsigned int i, j;
>> +     u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>>
>>       /* Clear data ptr. */
>> -     vq->data[head] = NULL;
>> +     vq->desc_state[head].data = NULL;
>>
>> -     /* Put back on free list: find end */
>> +     /* Put back on free list: unmap first-level descriptors and find end */
>>       i = head;
>>
>> -     /* Free the indirect table */
>> -     if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
>> -             kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
>> -
>> -     while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
>> +     while (vq->vring.desc[i].flags & nextflag) {
>> +             vring_unmap_one(vq, &vq->vring.desc[i]);
>>               i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>>               vq->vq.num_free++;
>>       }
>>
>> +     vring_unmap_one(vq, &vq->vring.desc[i]);
>>       vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>>       vq->free_head = head;
>> +
>>       /* Plus final descriptor */
>>       vq->vq.num_free++;
>> +
>> +     /* Free the indirect table, if any, now that it's unmapped. */
>> +     if (vq->desc_state[head].indir_desc) {
>> +             struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
>> +             u32 len = vq->vring.desc[head].len;
>> +
>> +             BUG_ON(!(vq->vring.desc[head].flags & VRING_DESC_F_INDIRECT));
>> +             BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>> +
>> +             for (j = 0; j < len / sizeof(struct vring_desc); j++)
>> +                     vring_unmap_one(vq, &indir_desc[j]);
>> +
>> +             kfree(vq->desc_state[head].indir_desc);
>> +             vq->desc_state[head].indir_desc = NULL;
>> +     }
>>  }
>
> something seems to be broken with indirect descriptors with that change

You're big endian, right?  I had an endian handling bug, and I'll fix it in v2.
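
For context, a hedged guess at the endian issue (the actual v2 fix is not
shown in this thread): the new detach_buf() hunk reads
vq->vring.desc[head].len and tests .flags without the
virtio32_to_cpu()/cpu_to_virtio16() conversions used elsewhere in the
file, which breaks whenever those conversions are not no-ops (e.g. a
big-endian guest driving a little-endian virtio 1.0 device).  A sketch of
the conversions that hunk would need:

	u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);

	BUG_ON(!(vq->vring.desc[head].flags &
		 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));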

--Andy


* Re: [PATCH 2/3] virtio_ring: Support DMA APIs
  2015-10-28  2:21       ` Joerg Roedel
@ 2015-10-28  5:12         ` Andy Lutomirski
  0 siblings, 0 replies; 16+ messages in thread
From: Andy Lutomirski @ 2015-10-28  5:12 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: Andy Lutomirski, linux-kernel@vger.kernel.org,
	Christian Borntraeger, Cornelia Huck, Sebastian Ott,
	Paolo Bonzini, Christoph Hellwig, Benjamin Herrenschmidt, KVM,
	David Woodhouse, Martin Schwidefsky, linux-s390

On Tue, Oct 27, 2015 at 7:21 PM, Joerg Roedel <jroedel@suse.de> wrote:
> On Tue, Oct 27, 2015 at 07:13:56PM -0700, Andy Lutomirski wrote:
>> On Tue, Oct 27, 2015 at 7:06 PM, Joerg Roedel <jroedel@suse.de> wrote:
>> > Hi Andy,
>> >
>> > On Tue, Oct 27, 2015 at 06:17:09PM -0700, Andy Lutomirski wrote:
>> >> From: Andy Lutomirski <luto@amacapital.net>
>> >>
>> >> virtio_ring currently sends the device (usually a hypervisor)
>> >> physical addresses of its I/O buffers.  This is okay when DMA
>> >> addresses and physical addresses are the same thing, but this isn't
>> >> always the case.  For example, this never works on Xen guests, and
>> >> it is likely to fail if a physical "virtio" device ever ends up
>> >> behind an IOMMU or swiotlb.
>> >
>> > The overall code looks good, but I haven't seen any dma_sync* calls.
>> > When swiotlb=force is in use this would break.
>> >
>> >> +             vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, vring_map_single(
>> >> +                     vq,
>> >> +                     desc, total_sg * sizeof(struct vring_desc),
>> >> +                     DMA_TO_DEVICE));
>> >
>>
>> Are you talking about a dma_sync call on the descriptor ring itself?
>> Isn't dma_alloc_coherent supposed to make that unnecessary?  I should
>> move the allocation into the virtqueue code.
>>
>> The docs suggest that I might need to "flush the processor's write
>> buffers before telling devices to read that memory".  I'm not sure how
>> to do that.
>
> The write buffers should be flushed by the dma-api functions if
> necessary.  For dma_alloc_coherent allocations you don't need to call
> dma_sync*, but for the map_single/map_page/map_sg ones, as these might
> be bounce-buffered.

I think that all the necessary barriers are already there.  I had a
nasty bug that swiotlb=force exposed, though, which I've fixed.

--Andy

