* [PATCH 1/3] virtio_ring: Remove sg_next indirection
2014-08-26 21:16 [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Andy Lutomirski
@ 2014-08-26 21:17 ` Andy Lutomirski
2014-09-01 0:58 ` Rusty Russell
2014-08-26 21:17 ` [PATCH 2/3] virtio_ring: Use DMA APIs Andy Lutomirski
` (3 subsequent siblings)
4 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-26 21:17 UTC (permalink / raw)
To: Rusty Russell, Michael S. Tsirkin
Cc: linux-s390, Konrad Rzeszutek Wilk, virtualization, linux390,
Andy Lutomirski
The only unusual thing about virtio's use of scatterlists is that
two of the APIs accept scatterlists that might not be terminated.
Using function pointers to handle this case is overkill; for_each_sg
can do it.
There's a small subtlety here: for_each_sg assumes that the provided
count is correct, but, because of the way that virtio_ring handles
multiple scatterlists at once, the count is only an upper bound if
there is more than one scatterlist. This is easily solved by
checking the sg pointer for NULL on each iteration.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
drivers/virtio/virtio_ring.c | 46 +++++++++++++++++---------------------------
1 file changed, 18 insertions(+), 28 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 4d08f45a9c29..d356a701c9c2 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -99,25 +99,9 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
-static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
- unsigned int *count)
-{
- return sg_next(sg);
-}
-
-static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
- unsigned int *count)
-{
- if (--(*count) == 0)
- return NULL;
- return sg + 1;
-}
-
/* Set up an indirect table of descriptors and add it to the queue. */
static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
unsigned int total_sg,
unsigned int total_out,
unsigned int total_in,
@@ -128,7 +112,7 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct vring_desc *desc;
unsigned head;
struct scatterlist *sg;
- int i, n;
+ int i, j, n;
/*
* We require lowmem mappings for the descriptors because
@@ -144,7 +128,9 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
/* Transfer entries from the sg lists into the indirect page */
i = 0;
for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ for_each_sg(sgs[n], sg, total_out, j) {
+ if (!sg)
+ break;
desc[i].flags = VRING_DESC_F_NEXT;
desc[i].addr = sg_phys(sg);
desc[i].len = sg->length;
@@ -153,7 +139,9 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
}
}
for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ for_each_sg(sgs[n], sg, total_in, j) {
+ if (!sg)
+ break;
desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
desc[i].addr = sg_phys(sg);
desc[i].len = sg->length;
@@ -186,8 +174,6 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
static inline int virtqueue_add(struct virtqueue *_vq,
struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
unsigned int total_out,
unsigned int total_in,
unsigned int out_sgs,
@@ -197,7 +183,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
{
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
- unsigned int i, n, avail, uninitialized_var(prev), total_sg;
+ unsigned int i, j, n, avail, uninitialized_var(prev), total_sg;
int head;
START_USE(vq);
@@ -227,7 +213,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
+ head = vring_add_indirect(vq, sgs, total_sg, total_out,
total_in,
out_sgs, in_sgs, gfp);
if (likely(head >= 0))
@@ -254,7 +240,9 @@ static inline int virtqueue_add(struct virtqueue *_vq,
head = i = vq->free_head;
for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ for_each_sg(sgs[n], sg, total_out, j) {
+ if (!sg)
+ break;
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
@@ -263,7 +251,9 @@ static inline int virtqueue_add(struct virtqueue *_vq,
}
}
for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ for_each_sg(sgs[n], sg, total_in, j) {
+ if (!sg)
+ break;
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
@@ -337,7 +327,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_in++;
}
- return virtqueue_add(_vq, sgs, sg_next_chained,
+ return virtqueue_add(_vq, sgs,
total_out, total_in, out_sgs, in_sgs, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
@@ -360,7 +350,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq,
void *data,
gfp_t gfp)
{
- return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+ return virtqueue_add(vq, &sg, num, 0, 1, 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
@@ -382,7 +372,7 @@ int virtqueue_add_inbuf(struct virtqueue *vq,
void *data,
gfp_t gfp)
{
- return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+ return virtqueue_add(vq, &sg, 0, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
--
1.9.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 1/3] virtio_ring: Remove sg_next indirection
2014-08-26 21:17 ` [PATCH 1/3] virtio_ring: Remove sg_next indirection Andy Lutomirski
@ 2014-09-01 0:58 ` Rusty Russell
2014-09-01 1:42 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Rusty Russell @ 2014-09-01 0:58 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-s390, Konrad Rzeszutek Wilk, virtualization, linux390,
Andy Lutomirski
Andy Lutomirski <luto@amacapital.net> writes:
> The only unusual thing about virtio's use of scatterlists is that
> two of the APIs accept scatterlists that might not be terminated.
> Using function pointers to handle this case is overkill; for_each_sg
> can do it.
>
> There's a small subtlely here: for_each_sg assumes that the provided
> count is correct, but, because of the way that virtio_ring handles
> multiple scatterlists at once, the count is only an upper bound if
> there is more than one scatterlist. This is easily solved by
> checking the sg pointer for NULL on each iteration.
Hi Andy,
(Sorry for the delayed response; I was still catching up from
my week at KS.)
Unfortunately the reason we dance through so many hoops here is that
it has a measurable performance impact :( Those indirect calls get inlined.
There's only one place which actually uses a weirdly-formed sg now,
and that's virtio_net. It's pretty trivial to fix.
However, vring_bench drops 15% when we do this. There's a larger
question as to how much difference that makes in Real Life, of course.
I'll measure that today.
Here are my two patches, back-to-back (it came out of an earlier
concern about reducing stack usage, hence the stack measurements).
Cheers,
Rusty.
Subject: virtio_net: pass well-formed sg to virtqueue_add_inbuf()
This is the only place which doesn't hand virtqueue_add_inbuf or
virtqueue_add_outbuf a well-formed, well-terminated sg. Fix it,
so we can make virtio_add_* simpler.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8a852b5f215f..63299b04cdf2 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -590,6 +590,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
offset = sizeof(struct padded_vnet_hdr);
sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
+ sg_mark_end(&rq->sg[MAX_SKB_FRAGS + 2 - 1]);
+
/* chain first in list head */
first->private = (unsigned long)list;
err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
Subject: virtio_ring: assume sgs are always well-formed.
We used to have several callers which just used arrays. They're
gone, so we can use sg_next() everywhere, simplifying the code.
Before:
gcc 4.8.2: virtio_blk: stack used = 392
gcc 4.6.4: virtio_blk: stack used = 528
After:
gcc 4.8.2: virtio_blk: stack used = 392
gcc 4.6.4: virtio_blk: stack used = 480
vring_bench before:
936153354-967745359(9.44739e+08+/-6.1e+06)ns
vring_bench after:
1061485790-1104800648(1.08254e+09+/-6.6e+06)ns
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 8e531578065f..a6cdb52ae9b2 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -107,28 +107,10 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
-static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
- unsigned int *count)
-{
- return sg_next(sg);
-}
-
-static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
- unsigned int *count)
-{
- if (--(*count) == 0)
- return NULL;
- return sg + 1;
-}
-
/* Set up an indirect table of descriptors and add it to the queue. */
static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
unsigned int total_sg,
- unsigned int total_out,
- unsigned int total_in,
unsigned int out_sgs,
unsigned int in_sgs,
gfp_t gfp)
@@ -155,7 +137,7 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
/* Transfer entries from the sg lists into the indirect page */
i = 0;
for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
desc[i].flags = VRING_DESC_F_NEXT;
desc[i].addr = sg_phys(sg);
desc[i].len = sg->length;
@@ -164,7 +146,7 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
}
}
for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
desc[i].addr = sg_phys(sg);
desc[i].len = sg->length;
@@ -197,10 +179,7 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
static inline int virtqueue_add(struct virtqueue *_vq,
struct scatterlist *sgs[],
- struct scatterlist *(*next)
- (struct scatterlist *, unsigned int *),
- unsigned int total_out,
- unsigned int total_in,
+ unsigned int total_sg,
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
@@ -208,7 +187,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
{
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
- unsigned int i, n, avail, uninitialized_var(prev), total_sg;
+ unsigned int i, n, avail, uninitialized_var(prev);
int head;
START_USE(vq);
@@ -233,13 +212,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
}
#endif
- total_sg = total_in + total_out;
-
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
- total_in,
+ head = vring_add_indirect(vq, sgs, total_sg,
out_sgs, in_sgs, gfp);
if (likely(head >= 0))
goto add_head;
@@ -265,7 +241,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
head = i = vq->free_head;
for (n = 0; n < out_sgs; n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
@@ -274,7 +250,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
}
}
for (; n < (out_sgs + in_sgs); n++) {
- for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
@@ -335,29 +311,23 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
void *data,
gfp_t gfp)
{
- unsigned int i, total_out, total_in;
+ unsigned int i, total_sg = 0;
/* Count them first. */
- for (i = total_out = total_in = 0; i < out_sgs; i++) {
- struct scatterlist *sg;
- for (sg = sgs[i]; sg; sg = sg_next(sg))
- total_out++;
- }
- for (; i < out_sgs + in_sgs; i++) {
+ for (i = 0; i < out_sgs + in_sgs; i++) {
struct scatterlist *sg;
for (sg = sgs[i]; sg; sg = sg_next(sg))
- total_in++;
+ total_sg++;
}
- return virtqueue_add(_vq, sgs, sg_next_chained,
- total_out, total_in, out_sgs, in_sgs, data, gfp);
+ return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
/**
* virtqueue_add_outbuf - expose output buffers to other end
* @vq: the struct virtqueue we're talking about.
- * @sgs: array of scatterlists (need not be terminated!)
- * @num: the number of scatterlists readable by other side
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg readable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
@@ -367,19 +337,19 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
* Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
*/
int virtqueue_add_outbuf(struct virtqueue *vq,
- struct scatterlist sg[], unsigned int num,
+ struct scatterlist *sg, unsigned int num,
void *data,
gfp_t gfp)
{
- return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+ return virtqueue_add(vq, &sg, num, 1, 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
/**
* virtqueue_add_inbuf - expose input buffers to other end
* @vq: the struct virtqueue we're talking about.
- * @sgs: array of scatterlists (need not be terminated!)
- * @num: the number of scatterlists writable by other side
+ * @sg: scatterlist (must be well-formed and terminated!)
+ * @num: the number of entries in @sg writable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
@@ -389,11 +359,11 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
* Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
*/
int virtqueue_add_inbuf(struct virtqueue *vq,
- struct scatterlist sg[], unsigned int num,
+ struct scatterlist *sg, unsigned int num,
void *data,
gfp_t gfp)
{
- return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+ return virtqueue_add(vq, &sg, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 1/3] virtio_ring: Remove sg_next indirection
2014-09-01 0:58 ` Rusty Russell
@ 2014-09-01 1:42 ` Andy Lutomirski
2014-09-01 6:59 ` Michael S. Tsirkin
0 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-09-01 1:42 UTC (permalink / raw)
To: Rusty Russell
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Michael S. Tsirkin, Linux Virtualization, linux390@de.ibm.com
On Sun, Aug 31, 2014 at 5:58 PM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> Andy Lutomirski <luto@amacapital.net> writes:
>> The only unusual thing about virtio's use of scatterlists is that
>> two of the APIs accept scatterlists that might not be terminated.
>> Using function pointers to handle this case is overkill; for_each_sg
>> can do it.
>>
>> There's a small subtlely here: for_each_sg assumes that the provided
>> count is correct, but, because of the way that virtio_ring handles
>> multiple scatterlists at once, the count is only an upper bound if
>> there is more than one scatterlist. This is easily solved by
>> checking the sg pointer for NULL on each iteration.
>
> Hi Andy,
>
> (Sorry for the delayed response; I was still catching up from
> my week at KS.)
No problem. In the grand scheme of maintainer response time, I think
you're near the top. :)
>
> Unfortunately the reason we dance through so many hoops here is that
> it has a measurable performance impact :( Those indirect calls get inlined.
gcc inlines that? That must nearly double the size of the object file. :-/
>
> There's only one place which actually uses a weirdly-formed sg now,
> and that's virtio_net. It's pretty trivial to fix.
>
> However, vring_bench drops 15% when we do this. There's a larger
> question as to how much difference that makes in Real Life, of course.
> I'll measure that today.
Weird. sg_next shouldn't be nearly that slow. Weird.
>
> Here are my two patches, back-to-back (it cam out of of an earlier
> concern about reducing stack usage, hence the stack measurements).
>
I like your version better than mine, except that I suspect that your
version will blow up for the same reason that my v2 patches blow up:
you probably need the skb_to_sgvec_nomark fix, too.
IOW, what happens if you apply patches 1-4 from my v3 series and then
apply your patches on top of that?
There'll be a hit on some virtio_pci machines due to use of the DMA
API. I would argue that, if this is measurable, the right fix is to
prod the DMA API maintainers, whoever they are, to fix it. The DMA
API really ought to be very fast on identity-mapped devices, but I don't
know whether it is in practice.
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/3] virtio_ring: Remove sg_next indirection
2014-09-01 1:42 ` Andy Lutomirski
@ 2014-09-01 6:59 ` Michael S. Tsirkin
2014-09-01 17:15 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2014-09-01 6:59 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Linux Virtualization, linux390@de.ibm.com
On Sun, Aug 31, 2014 at 06:42:38PM -0700, Andy Lutomirski wrote:
> On Sun, Aug 31, 2014 at 5:58 PM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> > Andy Lutomirski <luto@amacapital.net> writes:
> >> The only unusual thing about virtio's use of scatterlists is that
> >> two of the APIs accept scatterlists that might not be terminated.
> >> Using function pointers to handle this case is overkill; for_each_sg
> >> can do it.
> >>
> >> There's a small subtlely here: for_each_sg assumes that the provided
> >> count is correct, but, because of the way that virtio_ring handles
> >> multiple scatterlists at once, the count is only an upper bound if
> >> there is more than one scatterlist. This is easily solved by
> >> checking the sg pointer for NULL on each iteration.
> >
> > Hi Andy,
> >
> > (Sorry for the delayed response; I was still catching up from
> > my week at KS.)
>
> No problem. In the grand scheme of maintainer response time, I think
> you're near the top. :)
>
> >
> > Unfortunately the reason we dance through so many hoops here is that
> > it has a measurable performance impact :( Those indirect calls get inlined.
>
> gcc inlines that? That must nearly double the size of the object file. :-/
>
> >
> > There's only one place which actually uses a weirdly-formed sg now,
> > and that's virtio_net. It's pretty trivial to fix.
This path in virtio net is also unused on modern hypervisors, so we probably
don't care how well does it perform: why not apply it anyway?
It's the virtio_ring changes that we need to worry about.
> > However, vring_bench drops 15% when we do this. There's a larger
> > question as to how much difference that makes in Real Life, of course.
> > I'll measure that today.
>
> Weird. sg_next shouldn't be nearly that slow. Weird.
I think that's down to the fact that it's out of line,
so it prevents inlining of the caller.
> >
> > Here are my two patches, back-to-back (it cam out of of an earlier
> > concern about reducing stack usage, hence the stack measurements).
> >
>
> I like your version better than mine, except that I suspect that your
> version will blow up for the same reason that my v2 patches blow up:
> you probably need the skb_to_sgvec_nomark fix, too.
>
> IOW, what happens if you apply patches 1-4 from my v3 series and then
> apply your patches on top of that?
>
> There'll be a hit on some virtio_pci machines due to use of the DMA
> API. I would argue that, if this is measurable, the right fix is to
> prod the DMA API maintainers, whoever they are, to fix it. The DMA
> API really out to be very fast on identity-mapped devices, but I don't
> know whether it is in practice.
>
> --Andy
Right but we'd have to fix that before applying the patches
to avoid performance regressions.
--
MST
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/3] virtio_ring: Remove sg_next indirection
2014-09-01 6:59 ` Michael S. Tsirkin
@ 2014-09-01 17:15 ` Andy Lutomirski
0 siblings, 0 replies; 28+ messages in thread
From: Andy Lutomirski @ 2014-09-01 17:15 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Linux Virtualization, linux390@de.ibm.com
On Sep 1, 2014 12:00 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>
> On Sun, Aug 31, 2014 at 06:42:38PM -0700, Andy Lutomirski wrote:
> > On Sun, Aug 31, 2014 at 5:58 PM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> > > Andy Lutomirski <luto@amacapital.net> writes:
> > >> The only unusual thing about virtio's use of scatterlists is that
> > >> two of the APIs accept scatterlists that might not be terminated.
> > >> Using function pointers to handle this case is overkill; for_each_sg
> > >> can do it.
> > >>
> > >> There's a small subtlely here: for_each_sg assumes that the provided
> > >> count is correct, but, because of the way that virtio_ring handles
> > >> multiple scatterlists at once, the count is only an upper bound if
> > >> there is more than one scatterlist. This is easily solved by
> > >> checking the sg pointer for NULL on each iteration.
> > >
> > > Hi Andy,
> > >
> > > (Sorry for the delayed response; I was still catching up from
> > > my week at KS.)
> >
> > No problem. In the grand scheme of maintainer response time, I think
> > you're near the top. :)
> >
> > >
> > > Unfortunately the reason we dance through so many hoops here is that
> > > it has a measurable performance impact :( Those indirect calls get inlined.
> >
> > gcc inlines that? That must nearly double the size of the object file. :-/
> >
> > >
> > > There's only one place which actually uses a weirdly-formed sg now,
> > > and that's virtio_net. It's pretty trivial to fix.
>
> This path in virtio net is also unused on modern hypervisors, so we probably
> don't care how well does it perform: why not apply it anyway?
> It's the virtio_ring changes that we need to worry about.
>
> > > However, vring_bench drops 15% when we do this. There's a larger
> > > question as to how much difference that makes in Real Life, of course.
> > > I'll measure that today.
> >
> > Weird. sg_next shouldn't be nearly that slow. Weird.
>
> I think that's down to the fact that it's out of line,
> so it prevents inlining of the caller.
>
> > >
> > > Here are my two patches, back-to-back (it cam out of of an earlier
> > > concern about reducing stack usage, hence the stack measurements).
> > >
> >
> > I like your version better than mine, except that I suspect that your
> > version will blow up for the same reason that my v2 patches blow up:
> > you probably need the skb_to_sgvec_nomark fix, too.
> >
> > IOW, what happens if you apply patches 1-4 from my v3 series and then
> > apply your patches on top of that?
> >
> > There'll be a hit on some virtio_pci machines due to use of the DMA
> > API. I would argue that, if this is measurable, the right fix is to
> > prod the DMA API maintainers, whoever they are, to fix it. The DMA
> > API really out to be very fast on identity-mapped devices, but I don't
> > know whether it is in practice.
> >
> > --Andy
>
> Right but we'd have to fix that before applying the patches
> to avoid performance regressions.
My pathetic attempt to benchmark it came up with no difference.
What's the right benchmark? I'm having trouble compiling anything in
tools/virtio -- my patch breaks that code, but it seems like it's
already broken. The next version of my patches will at least not make
the problem worse.
I think that we can eliminate any possibility of a meaningful
performance hit due to the streaming part of the mappings by checking
PCI_DMA_BUS_IS_PHYS. I'll test and send out a version of the patches
that does that.
For what it's worth, there's another possible performance hit: on some
architectures, coherent memory could be uncached. This might not
matter for anyone, though. I think that x86 uses normal memory for
coherent mappings. ARM is unlikely to make serious use of virtio_pci,
since AFAIK virtio_mmio is strongly preferred on ARM. s390 should be
unaffected for much the same reason. Are there other architectures on
which this could be a problem?
It's possible that the weak barrier flag could be used to disable the
use of coherent memory, but it might require some DMA API trickery.
In particular, the only API I see is DMA_BIDIRECTIONAL, and we'll end
up with ugly code in virtio_pci to get it working.
Ultimately, I think that hypervisors need a way to tell their guests
that certain PCI devices are emulated and therefore fully coherent
regardless our normal architectural considerations.
--Andy
>
> --
> MST
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 2/3] virtio_ring: Use DMA APIs
2014-08-26 21:16 [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Andy Lutomirski
2014-08-26 21:17 ` [PATCH 1/3] virtio_ring: Remove sg_next indirection Andy Lutomirski
@ 2014-08-26 21:17 ` Andy Lutomirski
2014-08-27 7:29 ` Christian Borntraeger
2014-08-26 21:17 ` [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues Andy Lutomirski
` (2 subsequent siblings)
4 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-26 21:17 UTC (permalink / raw)
To: Rusty Russell, Michael S. Tsirkin
Cc: linux-s390, Konrad Rzeszutek Wilk, virtualization, linux390,
Andy Lutomirski
virtio_ring currently sends the device (usually a hypervisor)
physical addresses of its I/O buffers. This is okay when DMA
addresses and physical addresses are the same thing, but this isn't
always the case. For example, this never works on Xen guests, and
it is likely to fail if a physical "virtio" device ever ends up
behind an IOMMU or swiotlb.
The immediate use case for me is to enable virtio on Xen guests.
For that to work, we need this fix as well as a corresponding
fix to virtio_pci or to another driver.
With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
virtio-net warns (correctly) about DMA from the stack in
virtnet_set_rx_mode.
This breaks s390's defconfig. The default configuration for s390
does virtio through a KVM-specific interface, but that configuration
does not support DMA. I could modify this patch to stub out the DMA
API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
much nicer to make s390 support DMA unconditionally.
It's actually unclear to me whether this can be fixed without either
s390 arch support or a special case for s390 in virtio_ring: on
brief inspection of s390's DMA code, it seems to assume that
everything goes through a PCI IOMMU, which is presumably not the
case for virtio.
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
drivers/virtio/Kconfig | 1 +
drivers/virtio/virtio_ring.c | 116 +++++++++++++++++++++++++++++++++++--------
2 files changed, 95 insertions(+), 22 deletions(-)
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c6683f2e396c..b6f3acc61153 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -1,5 +1,6 @@
config VIRTIO
tristate
+ depends on HAS_DMA
---help---
This option is selected by any driver which implements the virtio
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d356a701c9c2..6a78e2fd8e63 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,6 +24,7 @@
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
@@ -54,6 +55,12 @@
#define END_USE(vq)
#endif
+struct vring_desc_state
+{
+ void *data; /* Data for callback. */
+ struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+};
+
struct vring_virtqueue
{
struct virtqueue vq;
@@ -93,12 +100,45 @@ struct vring_virtqueue
ktime_t last_add_time;
#endif
- /* Tokens for callbacks. */
- void *data[];
+ /* Per-descriptor state. */
+ struct vring_desc_state desc_state[];
};
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+/* Helper to map one sg entry, since we can't use dma_map_sg. */
+static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
+ struct scatterlist *sg,
+ enum dma_data_direction direction)
+{
+ return dma_map_page(vq->vq.vdev->dev.parent,
+ sg_page(sg), sg->offset, sg->length, direction);
+}
+
+static void unmap_one(struct vring_virtqueue *vq, struct vring_desc *desc)
+{
+ if (desc->flags & VRING_DESC_F_INDIRECT) {
+ dma_unmap_single(vq->vq.vdev->dev.parent,
+ desc->addr, desc->len,
+ (desc->flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ } else {
+ dma_unmap_page(vq->vq.vdev->dev.parent,
+ desc->addr, desc->len,
+ (desc->flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+}
+
+static void unmap_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
+ int total)
+{
+ int i;
+
+ for (i = 0; i < total; i++)
+ unmap_one(vq, &desc[i]);
+}
+
/* Set up an indirect table of descriptors and add it to the queue. */
static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct scatterlist *sgs[],
@@ -132,7 +172,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
if (!sg)
break;
desc[i].flags = VRING_DESC_F_NEXT;
- desc[i].addr = sg_phys(sg);
+ desc[i].addr =
+ dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
+ if (dma_mapping_error(vq->vq.vdev->dev.parent,
+ desc[i].addr))
+ goto unmap_free;
desc[i].len = sg->length;
desc[i].next = i+1;
i++;
@@ -143,7 +187,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
if (!sg)
break;
desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- desc[i].addr = sg_phys(sg);
+ desc[i].addr =
+ dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+ if (dma_mapping_error(vq->vq.vdev->dev.parent,
+ desc[i].addr))
+ goto unmap_free;
desc[i].len = sg->length;
desc[i].next = i+1;
i++;
@@ -161,15 +209,27 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
/* Use a single buffer which doesn't continue */
head = vq->free_head;
vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
- vq->vring.desc[head].addr = virt_to_phys(desc);
- /* kmemleak gives a false positive, as it's hidden by virt_to_phys */
- kmemleak_ignore(desc);
+ vq->vring.desc[head].addr =
+ dma_map_single(vq->vq.vdev->dev.parent,
+ desc, i * sizeof(struct vring_desc),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(vq->vq.vdev->dev.parent,
+ vq->vring.desc[head].addr))
+ goto unmap_free;
vq->vring.desc[head].len = i * sizeof(struct vring_desc);
/* Update free pointer */
vq->free_head = vq->vring.desc[head].next;
+ /* Save the indirect block */
+ vq->desc_state[head].indir_desc = desc;
+
return head;
+
+unmap_free:
+ unmap_indirect(vq, desc, i);
+ kfree(desc);
+ return -ENOMEM;
}
static inline int virtqueue_add(struct virtqueue *_vq,
@@ -244,7 +304,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
if (!sg)
break;
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
- vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].addr =
+ dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
@@ -255,7 +316,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
if (!sg)
break;
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].addr =
+ dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
@@ -269,7 +331,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
add_head:
/* Set token. */
- vq->data[head] = data;
+ vq->desc_state[head].data = data;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
@@ -470,22 +532,33 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
unsigned int i;
/* Clear data ptr. */
- vq->data[head] = NULL;
+ vq->desc_state[head].data = NULL;
/* Put back on free list: find end */
i = head;
/* Free the indirect table */
- if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
- kfree(phys_to_virt(vq->vring.desc[i].addr));
+ if (vq->desc_state[head].indir_desc) {
+ u32 len = vq->vring.desc[i].len;
+
+ BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
+ BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+ unmap_indirect(vq, vq->desc_state[head].indir_desc,
+ len / sizeof(struct vring_desc));
+ kfree(vq->desc_state[head].indir_desc);
+ vq->desc_state[head].indir_desc = NULL;
+ }
while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
+ unmap_one(vq, &vq->vring.desc[i]);
i = vq->vring.desc[i].next;
vq->vq.num_free++;
}
+ unmap_one(vq, &vq->vring.desc[i]);
vq->vring.desc[i].next = vq->free_head;
vq->free_head = head;
+
/* Plus final descriptor */
vq->vq.num_free++;
}
@@ -542,13 +615,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
BAD_RING(vq, "id %u out of range\n", i);
return NULL;
}
- if (unlikely(!vq->data[i])) {
+ if (unlikely(!vq->desc_state[i].data)) {
BAD_RING(vq, "id %u is not a head!\n", i);
return NULL;
}
/* detach_buf clears data, so grab it now. */
- ret = vq->data[i];
+ ret = vq->desc_state[i].data;
detach_buf(vq, i);
vq->last_used_idx++;
/* If we expect an interrupt for the next entry, tell host
@@ -709,10 +782,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
START_USE(vq);
for (i = 0; i < vq->vring.num; i++) {
- if (!vq->data[i])
+ if (!vq->desc_state[i].data)
continue;
/* detach_buf clears data, so grab it now. */
- buf = vq->data[i];
+ buf = vq->desc_state[i].data;
detach_buf(vq, i);
vq->vring.avail->idx--;
END_USE(vq);
@@ -765,7 +838,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
return NULL;
}
- vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+ vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
+ GFP_KERNEL);
if (!vq)
return NULL;
@@ -795,11 +869,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
/* Put everything in free lists. */
vq->free_head = 0;
- for (i = 0; i < num-1; i++) {
+ for (i = 0; i < num-1; i++)
vq->vring.desc[i].next = i+1;
- vq->data[i] = NULL;
- }
- vq->data[i] = NULL;
+ memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
return &vq->vq;
}
--
1.9.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 2/3] virtio_ring: Use DMA APIs
2014-08-26 21:17 ` [PATCH 2/3] virtio_ring: Use DMA APIs Andy Lutomirski
@ 2014-08-27 7:29 ` Christian Borntraeger
2014-08-27 17:35 ` Konrad Rzeszutek Wilk
0 siblings, 1 reply; 28+ messages in thread
From: Christian Borntraeger @ 2014-08-27 7:29 UTC (permalink / raw)
To: Andy Lutomirski, Rusty Russell, Michael S. Tsirkin
Cc: linux390, virtualization, Konrad Rzeszutek Wilk, linux-s390
On 26/08/14 23:17, Andy Lutomirski wrote:
> virtio_ring currently sends the device (usually a hypervisor)
> physical addresses of its I/O buffers. This is okay when DMA
> addresses and physical addresses are the same thing, but this isn't
> always the case. For example, this never works on Xen guests, and
> it is likely to fail if a physical "virtio" device ever ends up
> behind an IOMMU or swiotlb.
>
> The immediate use case for me is to enable virtio on Xen guests.
> For that to work, we need this fix as well as a corresponding
> fix to virtio_pci or to another driver.
>
> With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
> virtio-net warns (correctly) about DMA from the stack in
> virtnet_set_rx_mode.
>
> This breaks s390's defconfig. The default configuration for s390
> does virtio through a KVM-specific interface, but that configuration
> does not support DMA. I could modify this patch to stub out the DMA
> API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
> much nicer to make s390 support DMA unconditionally.
s390 has no DMA per se. Newest systems have a PCI-like I/O attach in
addition to the classic channel I/O, and for that we enable the DMA code
just for that transport to be able to reuse some of the existing PCI
drivers. (Only some, because we differ in some aspects from how PCI
looks.) But the architecture itself (and the virtio interface) does
not provide the DMA interface as you know it:
In essence this patch is a no-go for s390.
I am also asking myself whether this makes virtio-ring more expensive.
Christian
>
> It's actually unclear to me whether this can be fixed without either
> s390 arch support or a special case for s390 in virtio_ring: on
> brief inspection of s390's DMA code, it seems to assume that
> everything goes through a PCI IOMMU, which is presumably not the
> case for virtio.
>
> Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
> drivers/virtio/Kconfig | 1 +
> drivers/virtio/virtio_ring.c | 116 +++++++++++++++++++++++++++++++++++--------
> 2 files changed, 95 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index c6683f2e396c..b6f3acc61153 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -1,5 +1,6 @@
> config VIRTIO
> tristate
> + depends on HAS_DMA
> ---help---
> This option is selected by any driver which implements the virtio
> bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index d356a701c9c2..6a78e2fd8e63 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -24,6 +24,7 @@
> #include <linux/module.h>
> #include <linux/hrtimer.h>
> #include <linux/kmemleak.h>
> +#include <linux/dma-mapping.h>
>
> #ifdef DEBUG
> /* For development, we want to crash whenever the ring is screwed. */
> @@ -54,6 +55,12 @@
> #define END_USE(vq)
> #endif
>
> +struct vring_desc_state
> +{
> + void *data; /* Data for callback. */
> + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> +};
> +
> struct vring_virtqueue
> {
> struct virtqueue vq;
> @@ -93,12 +100,45 @@ struct vring_virtqueue
> ktime_t last_add_time;
> #endif
>
> - /* Tokens for callbacks. */
> - void *data[];
> + /* Per-descriptor state. */
> + struct vring_desc_state desc_state[];
> };
>
> #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
>
> +/* Helper to map one sg entry, since we can't use dma_map_sg. */
> +static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
> + struct scatterlist *sg,
> + enum dma_data_direction direction)
> +{
> + return dma_map_page(vq->vq.vdev->dev.parent,
> + sg_page(sg), sg->offset, sg->length, direction);
> +}
> +
> +static void unmap_one(struct vring_virtqueue *vq, struct vring_desc *desc)
> +{
> + if (desc->flags & VRING_DESC_F_INDIRECT) {
> + dma_unmap_single(vq->vq.vdev->dev.parent,
> + desc->addr, desc->len,
> + (desc->flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + } else {
> + dma_unmap_page(vq->vq.vdev->dev.parent,
> + desc->addr, desc->len,
> + (desc->flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + }
> +}
> +
> +static void unmap_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
> + int total)
> +{
> + int i;
> +
> + for (i = 0; i < total; i++)
> + unmap_one(vq, &desc[i]);
> +}
> +
> /* Set up an indirect table of descriptors and add it to the queue. */
> static inline int vring_add_indirect(struct vring_virtqueue *vq,
> struct scatterlist *sgs[],
> @@ -132,7 +172,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> if (!sg)
> break;
> desc[i].flags = VRING_DESC_F_NEXT;
> - desc[i].addr = sg_phys(sg);
> + desc[i].addr =
> + dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> + desc[i].addr))
> + goto unmap_free;
> desc[i].len = sg->length;
> desc[i].next = i+1;
> i++;
> @@ -143,7 +187,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> if (!sg)
> break;
> desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> - desc[i].addr = sg_phys(sg);
> + desc[i].addr =
> + dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> + desc[i].addr))
> + goto unmap_free;
> desc[i].len = sg->length;
> desc[i].next = i+1;
> i++;
> @@ -161,15 +209,27 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> /* Use a single buffer which doesn't continue */
> head = vq->free_head;
> vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
> - vq->vring.desc[head].addr = virt_to_phys(desc);
> - /* kmemleak gives a false positive, as it's hidden by virt_to_phys */
> - kmemleak_ignore(desc);
> + vq->vring.desc[head].addr =
> + dma_map_single(vq->vq.vdev->dev.parent,
> + desc, i * sizeof(struct vring_desc),
> + DMA_TO_DEVICE);
> + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> + vq->vring.desc[head].addr))
> + goto unmap_free;
> vq->vring.desc[head].len = i * sizeof(struct vring_desc);
>
> /* Update free pointer */
> vq->free_head = vq->vring.desc[head].next;
>
> + /* Save the indirect block */
> + vq->desc_state[head].indir_desc = desc;
> +
> return head;
> +
> +unmap_free:
> + unmap_indirect(vq, desc, i);
> + kfree(desc);
> + return -ENOMEM;
> }
>
> static inline int virtqueue_add(struct virtqueue *_vq,
> @@ -244,7 +304,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> if (!sg)
> break;
> vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
> - vq->vring.desc[i].addr = sg_phys(sg);
> + vq->vring.desc[i].addr =
> + dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> vq->vring.desc[i].len = sg->length;
> prev = i;
> i = vq->vring.desc[i].next;
> @@ -255,7 +316,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> if (!sg)
> break;
> vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> - vq->vring.desc[i].addr = sg_phys(sg);
> + vq->vring.desc[i].addr =
> + dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> vq->vring.desc[i].len = sg->length;
> prev = i;
> i = vq->vring.desc[i].next;
> @@ -269,7 +331,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>
> add_head:
> /* Set token. */
> - vq->data[head] = data;
> + vq->desc_state[head].data = data;
>
> /* Put entry in available array (but don't update avail->idx until they
> * do sync). */
> @@ -470,22 +532,33 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
> unsigned int i;
>
> /* Clear data ptr. */
> - vq->data[head] = NULL;
> + vq->desc_state[head].data = NULL;
>
> /* Put back on free list: find end */
> i = head;
>
> /* Free the indirect table */
> - if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> - kfree(phys_to_virt(vq->vring.desc[i].addr));
> + if (vq->desc_state[head].indir_desc) {
> + u32 len = vq->vring.desc[i].len;
> +
> + BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
> + BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> + unmap_indirect(vq, vq->desc_state[head].indir_desc,
> + len / sizeof(struct vring_desc));
> + kfree(vq->desc_state[head].indir_desc);
> + vq->desc_state[head].indir_desc = NULL;
> + }
>
> while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
> + unmap_one(vq, &vq->vring.desc[i]);
> i = vq->vring.desc[i].next;
> vq->vq.num_free++;
> }
>
> + unmap_one(vq, &vq->vring.desc[i]);
> vq->vring.desc[i].next = vq->free_head;
> vq->free_head = head;
> +
> /* Plus final descriptor */
> vq->vq.num_free++;
> }
> @@ -542,13 +615,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> BAD_RING(vq, "id %u out of range\n", i);
> return NULL;
> }
> - if (unlikely(!vq->data[i])) {
> + if (unlikely(!vq->desc_state[i].data)) {
> BAD_RING(vq, "id %u is not a head!\n", i);
> return NULL;
> }
>
> /* detach_buf clears data, so grab it now. */
> - ret = vq->data[i];
> + ret = vq->desc_state[i].data;
> detach_buf(vq, i);
> vq->last_used_idx++;
> /* If we expect an interrupt for the next entry, tell host
> @@ -709,10 +782,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> START_USE(vq);
>
> for (i = 0; i < vq->vring.num; i++) {
> - if (!vq->data[i])
> + if (!vq->desc_state[i].data)
> continue;
> /* detach_buf clears data, so grab it now. */
> - buf = vq->data[i];
> + buf = vq->desc_state[i].data;
> detach_buf(vq, i);
> vq->vring.avail->idx--;
> END_USE(vq);
> @@ -765,7 +838,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> return NULL;
> }
>
> - vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
> + vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> + GFP_KERNEL);
> if (!vq)
> return NULL;
>
> @@ -795,11 +869,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>
> /* Put everything in free lists. */
> vq->free_head = 0;
> - for (i = 0; i < num-1; i++) {
> + for (i = 0; i < num-1; i++)
> vq->vring.desc[i].next = i+1;
> - vq->data[i] = NULL;
> - }
> - vq->data[i] = NULL;
> + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>
> return &vq->vq;
> }
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/3] virtio_ring: Use DMA APIs
2014-08-27 7:29 ` Christian Borntraeger
@ 2014-08-27 17:35 ` Konrad Rzeszutek Wilk
2014-08-27 17:39 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-08-27 17:35 UTC (permalink / raw)
To: Christian Borntraeger
Cc: linux-s390, Michael S. Tsirkin, Andy Lutomirski, linux390,
virtualization
On Wed, Aug 27, 2014 at 09:29:36AM +0200, Christian Borntraeger wrote:
> On 26/08/14 23:17, Andy Lutomirski wrote:
> > virtio_ring currently sends the device (usually a hypervisor)
> > physical addresses of its I/O buffers. This is okay when DMA
> > addresses and physical addresses are the same thing, but this isn't
> > always the case. For example, this never works on Xen guests, and
> > it is likely to fail if a physical "virtio" device ever ends up
> > behind an IOMMU or swiotlb.
> >
> > The immediate use case for me is to enable virtio on Xen guests.
> > For that to work, we need this fix as well as a corresponding
> > fix to virtio_pci or to another driver.
> >
> > With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
> > virtio-net warns (correctly) about DMA from the stack in
> > virtnet_set_rx_mode.
> >
> > This breaks s390's defconfig. The default configuration for s390
> > does virtio through a KVM-specific interface, but that configuration
> > does not support DMA. I could modify this patch to stub out the DMA
> > API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
> > much nicer to make s390 support DMA unconditionally.
>
> s390 has no DMA per se. Newest systems have a PCI-like I/O attach in
> addition to the classic channel I/O, and for that we enable the DMA code
> just for that transport to be able to reuse some of the existing PCI
> drivers. (only some because, we differ in some aspects from how PCI
> looks like) But the architecture itself (and the virtio interface) does
> not provide the DMA interface as you know it:
Don't most of those DMA_API end up then being nops? As in, we do
have in the dma-api file the #ifdef case when a platform does not do
DMA which ends up with all functions stubbed out.
>
> In essence this patch is a no-go for s390.
Is that due to possible compilation?
>
> I am also asking myself whether this makes virtio-ring more expensive.
>
> Christian
>
>
>
> >
> > It's actually unclear to me whether this can be fixed without either
> > s390 arch support or a special case for s390 in virtio_ring: on
> > brief inspection of s390's DMA code, it seems to assume that
> > everything goes through a PCI IOMMU, which is presumably not the
> > case for virtio.
>
>
> >
> > Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
> > Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> > ---
> > drivers/virtio/Kconfig | 1 +
> > drivers/virtio/virtio_ring.c | 116 +++++++++++++++++++++++++++++++++++--------
> > 2 files changed, 95 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index c6683f2e396c..b6f3acc61153 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -1,5 +1,6 @@
> > config VIRTIO
> > tristate
> > + depends on HAS_DMA
> > ---help---
> > This option is selected by any driver which implements the virtio
> > bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index d356a701c9c2..6a78e2fd8e63 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -24,6 +24,7 @@
> > #include <linux/module.h>
> > #include <linux/hrtimer.h>
> > #include <linux/kmemleak.h>
> > +#include <linux/dma-mapping.h>
> >
> > #ifdef DEBUG
> > /* For development, we want to crash whenever the ring is screwed. */
> > @@ -54,6 +55,12 @@
> > #define END_USE(vq)
> > #endif
> >
> > +struct vring_desc_state
> > +{
> > + void *data; /* Data for callback. */
> > + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> > +};
> > +
> > struct vring_virtqueue
> > {
> > struct virtqueue vq;
> > @@ -93,12 +100,45 @@ struct vring_virtqueue
> > ktime_t last_add_time;
> > #endif
> >
> > - /* Tokens for callbacks. */
> > - void *data[];
> > + /* Per-descriptor state. */
> > + struct vring_desc_state desc_state[];
> > };
> >
> > #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
> >
> > +/* Helper to map one sg entry, since we can't use dma_map_sg. */
> > +static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
> > + struct scatterlist *sg,
> > + enum dma_data_direction direction)
> > +{
> > + return dma_map_page(vq->vq.vdev->dev.parent,
> > + sg_page(sg), sg->offset, sg->length, direction);
> > +}
> > +
> > +static void unmap_one(struct vring_virtqueue *vq, struct vring_desc *desc)
> > +{
> > + if (desc->flags & VRING_DESC_F_INDIRECT) {
> > + dma_unmap_single(vq->vq.vdev->dev.parent,
> > + desc->addr, desc->len,
> > + (desc->flags & VRING_DESC_F_WRITE) ?
> > + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > + } else {
> > + dma_unmap_page(vq->vq.vdev->dev.parent,
> > + desc->addr, desc->len,
> > + (desc->flags & VRING_DESC_F_WRITE) ?
> > + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > + }
> > +}
> > +
> > +static void unmap_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
> > + int total)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < total; i++)
> > + unmap_one(vq, &desc[i]);
> > +}
> > +
> > /* Set up an indirect table of descriptors and add it to the queue. */
> > static inline int vring_add_indirect(struct vring_virtqueue *vq,
> > struct scatterlist *sgs[],
> > @@ -132,7 +172,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> > if (!sg)
> > break;
> > desc[i].flags = VRING_DESC_F_NEXT;
> > - desc[i].addr = sg_phys(sg);
> > + desc[i].addr =
> > + dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> > + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > + desc[i].addr))
> > + goto unmap_free;
> > desc[i].len = sg->length;
> > desc[i].next = i+1;
> > i++;
> > @@ -143,7 +187,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> > if (!sg)
> > break;
> > desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > - desc[i].addr = sg_phys(sg);
> > + desc[i].addr =
> > + dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > + desc[i].addr))
> > + goto unmap_free;
> > desc[i].len = sg->length;
> > desc[i].next = i+1;
> > i++;
> > @@ -161,15 +209,27 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> > /* Use a single buffer which doesn't continue */
> > head = vq->free_head;
> > vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
> > - vq->vring.desc[head].addr = virt_to_phys(desc);
> > - /* kmemleak gives a false positive, as it's hidden by virt_to_phys */
> > - kmemleak_ignore(desc);
> > + vq->vring.desc[head].addr =
> > + dma_map_single(vq->vq.vdev->dev.parent,
> > + desc, i * sizeof(struct vring_desc),
> > + DMA_TO_DEVICE);
> > + if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > + vq->vring.desc[head].addr))
> > + goto unmap_free;
> > vq->vring.desc[head].len = i * sizeof(struct vring_desc);
> >
> > /* Update free pointer */
> > vq->free_head = vq->vring.desc[head].next;
> >
> > + /* Save the indirect block */
> > + vq->desc_state[head].indir_desc = desc;
> > +
> > return head;
> > +
> > +unmap_free:
> > + unmap_indirect(vq, desc, i);
> > + kfree(desc);
> > + return -ENOMEM;
> > }
> >
> > static inline int virtqueue_add(struct virtqueue *_vq,
> > @@ -244,7 +304,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> > if (!sg)
> > break;
> > vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
> > - vq->vring.desc[i].addr = sg_phys(sg);
> > + vq->vring.desc[i].addr =
> > + dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> > vq->vring.desc[i].len = sg->length;
> > prev = i;
> > i = vq->vring.desc[i].next;
> > @@ -255,7 +316,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> > if (!sg)
> > break;
> > vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > - vq->vring.desc[i].addr = sg_phys(sg);
> > + vq->vring.desc[i].addr =
> > + dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > vq->vring.desc[i].len = sg->length;
> > prev = i;
> > i = vq->vring.desc[i].next;
> > @@ -269,7 +331,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >
> > add_head:
> > /* Set token. */
> > - vq->data[head] = data;
> > + vq->desc_state[head].data = data;
> >
> > /* Put entry in available array (but don't update avail->idx until they
> > * do sync). */
> > @@ -470,22 +532,33 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
> > unsigned int i;
> >
> > /* Clear data ptr. */
> > - vq->data[head] = NULL;
> > + vq->desc_state[head].data = NULL;
> >
> > /* Put back on free list: find end */
> > i = head;
> >
> > /* Free the indirect table */
> > - if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> > - kfree(phys_to_virt(vq->vring.desc[i].addr));
> > + if (vq->desc_state[head].indir_desc) {
> > + u32 len = vq->vring.desc[i].len;
> > +
> > + BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
> > + BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> > + unmap_indirect(vq, vq->desc_state[head].indir_desc,
> > + len / sizeof(struct vring_desc));
> > + kfree(vq->desc_state[head].indir_desc);
> > + vq->desc_state[head].indir_desc = NULL;
> > + }
> >
> > while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
> > + unmap_one(vq, &vq->vring.desc[i]);
> > i = vq->vring.desc[i].next;
> > vq->vq.num_free++;
> > }
> >
> > + unmap_one(vq, &vq->vring.desc[i]);
> > vq->vring.desc[i].next = vq->free_head;
> > vq->free_head = head;
> > +
> > /* Plus final descriptor */
> > vq->vq.num_free++;
> > }
> > @@ -542,13 +615,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> > BAD_RING(vq, "id %u out of range\n", i);
> > return NULL;
> > }
> > - if (unlikely(!vq->data[i])) {
> > + if (unlikely(!vq->desc_state[i].data)) {
> > BAD_RING(vq, "id %u is not a head!\n", i);
> > return NULL;
> > }
> >
> > /* detach_buf clears data, so grab it now. */
> > - ret = vq->data[i];
> > + ret = vq->desc_state[i].data;
> > detach_buf(vq, i);
> > vq->last_used_idx++;
> > /* If we expect an interrupt for the next entry, tell host
> > @@ -709,10 +782,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> > START_USE(vq);
> >
> > for (i = 0; i < vq->vring.num; i++) {
> > - if (!vq->data[i])
> > + if (!vq->desc_state[i].data)
> > continue;
> > /* detach_buf clears data, so grab it now. */
> > - buf = vq->data[i];
> > + buf = vq->desc_state[i].data;
> > detach_buf(vq, i);
> > vq->vring.avail->idx--;
> > END_USE(vq);
> > @@ -765,7 +838,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> > return NULL;
> > }
> >
> > - vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
> > + vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> > + GFP_KERNEL);
> > if (!vq)
> > return NULL;
> >
> > @@ -795,11 +869,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> >
> > /* Put everything in free lists. */
> > vq->free_head = 0;
> > - for (i = 0; i < num-1; i++) {
> > + for (i = 0; i < num-1; i++)
> > vq->vring.desc[i].next = i+1;
> > - vq->data[i] = NULL;
> > - }
> > - vq->data[i] = NULL;
> > + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
> >
> > return &vq->vq;
> > }
> >
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/3] virtio_ring: Use DMA APIs
2014-08-27 17:35 ` Konrad Rzeszutek Wilk
@ 2014-08-27 17:39 ` Andy Lutomirski
0 siblings, 0 replies; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 17:39 UTC (permalink / raw)
To: Konrad Rzeszutek Wilk
Cc: linux-s390@vger.kernel.org, Michael S. Tsirkin,
Linux Virtualization, Christian Borntraeger, linux390@de.ibm.com
On Wed, Aug 27, 2014 at 10:35 AM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
> On Wed, Aug 27, 2014 at 09:29:36AM +0200, Christian Borntraeger wrote:
>> On 26/08/14 23:17, Andy Lutomirski wrote:
>> > virtio_ring currently sends the device (usually a hypervisor)
>> > physical addresses of its I/O buffers. This is okay when DMA
>> > addresses and physical addresses are the same thing, but this isn't
>> > always the case. For example, this never works on Xen guests, and
>> > it is likely to fail if a physical "virtio" device ever ends up
>> > behind an IOMMU or swiotlb.
>> >
>> > The immediate use case for me is to enable virtio on Xen guests.
>> > For that to work, we need this fix as well as a corresponding
>> > fix to virtio_pci or to another driver.
>> >
>> > With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
>> > virtio-net warns (correctly) about DMA from the stack in
>> > virtnet_set_rx_mode.
>> >
>> > This breaks s390's defconfig. The default configuration for s390
>> > does virtio through a KVM-specific interface, but that configuration
>> > does not support DMA. I could modify this patch to stub out the DMA
>> > API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
>> > much nicer to make s390 support DMA unconditionally.
>>
>> s390 has no DMA per se. Newest systems have a PCI-like I/O attach in
>> addition to the classic channel I/O, and for that we enable the DMA code
>> just for that transport to be able to reuse some of the existing PCI
>> drivers. (only some because, we differ in some aspects from how PCI
>> looks like) But the architecture itself (and the virtio interface) does
>> not provide the DMA interface as you know it:
>
> Don't most of those DMA_API end up then being nops? As in, we do
> have in the dma-api file the #ifdef case when a platform does not do
> DMA which ends up with all functions stubbed out.
>
This doesn't work on s390. It fails to link.
This was my initial thought, too -- it would be nice if s390 set up
no-op DMA ops for its virtio devices. But this might be overkill if
virtio is the *only* device on s390 that does anything that looks like
DMA.
Anyway, v2 (coming soon) adds a switch in virtio, so s390 should
continue to work.
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues
2014-08-26 21:16 [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Andy Lutomirski
2014-08-26 21:17 ` [PATCH 1/3] virtio_ring: Remove sg_next indirection Andy Lutomirski
2014-08-26 21:17 ` [PATCH 2/3] virtio_ring: Use DMA APIs Andy Lutomirski
@ 2014-08-26 21:17 ` Andy Lutomirski
2014-08-27 17:32 ` Konrad Rzeszutek Wilk
2014-08-27 6:46 ` [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Stefan Hajnoczi
2014-08-27 7:54 ` Cornelia Huck
4 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-26 21:17 UTC (permalink / raw)
To: Rusty Russell, Michael S. Tsirkin
Cc: linux-s390, Konrad Rzeszutek Wilk, virtualization, linux390,
Andy Lutomirski
A virtqueue is a coherent DMA mapping. Use the DMA API for it.
This fixes virtio_pci on Xen.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
drivers/virtio/virtio_pci.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1463c6b120..19039c5bec24 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -80,8 +80,9 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
- /* the virtual address of the ring queue */
- void *queue;
+ /* the ring queue */
+ void *queue; /* virtual address */
+ dma_addr_t queue_dma_addr; /* bus address */
/* the list node for the virtqueues list */
struct list_head node;
@@ -417,15 +418,16 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
info->num = num;
info->msix_vector = msix_vec;
- size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
- info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
+ size = vring_size(num, VIRTIO_PCI_VRING_ALIGN);
+ info->queue = dma_zalloc_coherent(vdev->dev.parent, size,
+ &info->queue_dma_addr, GFP_KERNEL);
if (info->queue == NULL) {
err = -ENOMEM;
goto out_info;
}
/* activate the queue */
- iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+ iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
/* create the vring */
@@ -462,7 +464,8 @@ out_assign:
vring_del_virtqueue(vq);
out_activate_queue:
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- free_pages_exact(info->queue, size);
+ dma_free_coherent(vdev->dev.parent, size,
+ info->queue, info->queue_dma_addr);
out_info:
kfree(info);
return ERR_PTR(err);
@@ -493,7 +496,8 @@ static void vp_del_vq(struct virtqueue *vq)
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
- free_pages_exact(info->queue, size);
+ dma_free_coherent(vq->vdev->dev.parent, size,
+ info->queue, info->queue_dma_addr);
kfree(info);
}
@@ -712,6 +716,13 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
if (err)
goto out;
+ /*
+ * We support 64-bit DMA. If this fails (e.g. some bridge
+ * or PV code doesn't or DAC is disabled), then we're okay
+ * with 32-bit DMA.
+ */
+ dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+
err = pci_request_regions(pci_dev, "virtio-pci");
if (err)
goto out_enable_device;
--
1.9.3
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues
2014-08-26 21:17 ` [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues Andy Lutomirski
@ 2014-08-27 17:32 ` Konrad Rzeszutek Wilk
2014-08-27 17:35 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-08-27 17:32 UTC (permalink / raw)
To: Andy Lutomirski; +Cc: linux-s390, Michael S. Tsirkin, virtualization, linux390
On Tue, Aug 26, 2014 at 02:17:02PM -0700, Andy Lutomirski wrote:
> A virtqueue is a coherent DMA mapping. Use the DMA API for it.
> This fixes virtio_pci on Xen.
>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
> drivers/virtio/virtio_pci.c | 25 ++++++++++++++++++-------
> 1 file changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
> index 3d1463c6b120..19039c5bec24 100644
> --- a/drivers/virtio/virtio_pci.c
> +++ b/drivers/virtio/virtio_pci.c
> @@ -80,8 +80,9 @@ struct virtio_pci_vq_info
> /* the number of entries in the queue */
> int num;
>
> - /* the virtual address of the ring queue */
> - void *queue;
> + /* the ring queue */
> + void *queue; /* virtual address */
> + dma_addr_t queue_dma_addr; /* bus address */
>
> /* the list node for the virtqueues list */
> struct list_head node;
> @@ -417,15 +418,16 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
> info->num = num;
> info->msix_vector = msix_vec;
>
> - size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
> - info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
> + size = vring_size(num, VIRTIO_PCI_VRING_ALIGN);
> + info->queue = dma_zalloc_coherent(vdev->dev.parent, size,
> + &info->queue_dma_addr, GFP_KERNEL);
> if (info->queue == NULL) {
> err = -ENOMEM;
> goto out_info;
> }
>
> /* activate the queue */
> - iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
> + iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
> vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
>
> /* create the vring */
> @@ -462,7 +464,8 @@ out_assign:
> vring_del_virtqueue(vq);
> out_activate_queue:
> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
> - free_pages_exact(info->queue, size);
> + dma_free_coherent(vdev->dev.parent, size,
> + info->queue, info->queue_dma_addr);
> out_info:
> kfree(info);
> return ERR_PTR(err);
> @@ -493,7 +496,8 @@ static void vp_del_vq(struct virtqueue *vq)
> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
>
> size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
> - free_pages_exact(info->queue, size);
> + dma_free_coherent(vq->vdev->dev.parent, size,
> + info->queue, info->queue_dma_addr);
> kfree(info);
> }
>
> @@ -712,6 +716,13 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
> if (err)
> goto out;
>
> + /*
> + * We support 64-bit DMA. If this fails (e.g. some bridge
> + * or PV code doesn't or DAC is disabled), then we're okay
> + * with 32-bit DMA.
<scratches his head>
I am having a hard time parsing that. Could you expand a bit the
faulting use-case please?
> + */
> + dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
The usual process is:
ret = dma_set_mask_and_coherent(..)
if (ret)
ret = dma_set_mask_and_coherent(.., DMA_BIT_MASK(32))
if (ret)
pr_warn("We are truly screwed. Good luck!\n");
> +
> err = pci_request_regions(pci_dev, "virtio-pci");
> if (err)
> goto out_enable_device;
> --
> 1.9.3
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues
2014-08-27 17:32 ` Konrad Rzeszutek Wilk
@ 2014-08-27 17:35 ` Andy Lutomirski
2014-08-27 18:52 ` Konrad Rzeszutek Wilk
0 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 17:35 UTC (permalink / raw)
To: Konrad Rzeszutek Wilk
Cc: linux-s390@vger.kernel.org, Michael S. Tsirkin,
Linux Virtualization, linux390@de.ibm.com
On Wed, Aug 27, 2014 at 10:32 AM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
> On Tue, Aug 26, 2014 at 02:17:02PM -0700, Andy Lutomirski wrote:
>> A virtqueue is a coherent DMA mapping. Use the DMA API for it.
>> This fixes virtio_pci on Xen.
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>> drivers/virtio/virtio_pci.c | 25 ++++++++++++++++++-------
>> 1 file changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
>> index 3d1463c6b120..19039c5bec24 100644
>> --- a/drivers/virtio/virtio_pci.c
>> +++ b/drivers/virtio/virtio_pci.c
>> @@ -80,8 +80,9 @@ struct virtio_pci_vq_info
>> /* the number of entries in the queue */
>> int num;
>>
>> - /* the virtual address of the ring queue */
>> - void *queue;
>> + /* the ring queue */
>> + void *queue; /* virtual address */
>> + dma_addr_t queue_dma_addr; /* bus address */
>>
>> /* the list node for the virtqueues list */
>> struct list_head node;
>> @@ -417,15 +418,16 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
>> info->num = num;
>> info->msix_vector = msix_vec;
>>
>> - size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
>> - info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
>> + size = vring_size(num, VIRTIO_PCI_VRING_ALIGN);
>> + info->queue = dma_zalloc_coherent(vdev->dev.parent, size,
>> + &info->queue_dma_addr, GFP_KERNEL);
>> if (info->queue == NULL) {
>> err = -ENOMEM;
>> goto out_info;
>> }
>>
>> /* activate the queue */
>> - iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
>> + iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
>> vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
>>
>> /* create the vring */
>> @@ -462,7 +464,8 @@ out_assign:
>> vring_del_virtqueue(vq);
>> out_activate_queue:
>> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
>> - free_pages_exact(info->queue, size);
>> + dma_free_coherent(vdev->dev.parent, size,
>> + info->queue, info->queue_dma_addr);
>> out_info:
>> kfree(info);
>> return ERR_PTR(err);
>> @@ -493,7 +496,8 @@ static void vp_del_vq(struct virtqueue *vq)
>> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
>>
>> size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
>> - free_pages_exact(info->queue, size);
>> + dma_free_coherent(vq->vdev->dev.parent, size,
>> + info->queue, info->queue_dma_addr);
>> kfree(info);
>> }
>>
>> @@ -712,6 +716,13 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
>> if (err)
>> goto out;
>>
>> + /*
>> + * We support 64-bit DMA. If this fails (e.g. some bridge
>> + * or PV code doesn't or DAC is disabled), then we're okay
>> + * with 32-bit DMA.
>
> <scratches his head>
>
> I am having a hard time parsing that. Could you expand a bit the
> faulting use-case please?
>
>> + */
>> + dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
>
> The usual process is:
>
> ret = dma_set_mask_and_coherent(..)
> if (ret)
> ret = dma_set_mask_and_coherent(.., DMA_BIT_MASK(32))
>
> if (ret)
> pr_warn("We are truly screwed. Good luck!\n");
>
I assumed that, if dma_set_mask_and_coherent(..., DMA_BIT_MASK(64))
fails, then we can still do DMA, just not 64-bit DMA. This driver
should be fine with that -- it'll just be a bit slower.
If that's not a safe assumption, I can change it.
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues
2014-08-27 17:35 ` Andy Lutomirski
@ 2014-08-27 18:52 ` Konrad Rzeszutek Wilk
0 siblings, 0 replies; 28+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-08-27 18:52 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390@vger.kernel.org, Michael S. Tsirkin,
Linux Virtualization, linux390@de.ibm.com
On Wed, Aug 27, 2014 at 10:35:10AM -0700, Andy Lutomirski wrote:
> On Wed, Aug 27, 2014 at 10:32 AM, Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com> wrote:
> > On Tue, Aug 26, 2014 at 02:17:02PM -0700, Andy Lutomirski wrote:
> >> A virtqueue is a coherent DMA mapping. Use the DMA API for it.
> >> This fixes virtio_pci on Xen.
> >>
> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> >> ---
> >> drivers/virtio/virtio_pci.c | 25 ++++++++++++++++++-------
> >> 1 file changed, 18 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
> >> index 3d1463c6b120..19039c5bec24 100644
> >> --- a/drivers/virtio/virtio_pci.c
> >> +++ b/drivers/virtio/virtio_pci.c
> >> @@ -80,8 +80,9 @@ struct virtio_pci_vq_info
> >> /* the number of entries in the queue */
> >> int num;
> >>
> >> - /* the virtual address of the ring queue */
> >> - void *queue;
> >> + /* the ring queue */
> >> + void *queue; /* virtual address */
> >> + dma_addr_t queue_dma_addr; /* bus address */
> >>
> >> /* the list node for the virtqueues list */
> >> struct list_head node;
> >> @@ -417,15 +418,16 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
> >> info->num = num;
> >> info->msix_vector = msix_vec;
> >>
> >> - size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
> >> - info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
> >> + size = vring_size(num, VIRTIO_PCI_VRING_ALIGN);
> >> + info->queue = dma_zalloc_coherent(vdev->dev.parent, size,
> >> + &info->queue_dma_addr, GFP_KERNEL);
> >> if (info->queue == NULL) {
> >> err = -ENOMEM;
> >> goto out_info;
> >> }
> >>
> >> /* activate the queue */
> >> - iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
> >> + iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
> >> vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
> >>
> >> /* create the vring */
> >> @@ -462,7 +464,8 @@ out_assign:
> >> vring_del_virtqueue(vq);
> >> out_activate_queue:
> >> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
> >> - free_pages_exact(info->queue, size);
> >> + dma_free_coherent(vdev->dev.parent, size,
> >> + info->queue, info->queue_dma_addr);
> >> out_info:
> >> kfree(info);
> >> return ERR_PTR(err);
> >> @@ -493,7 +496,8 @@ static void vp_del_vq(struct virtqueue *vq)
> >> iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
> >>
> >> size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
> >> - free_pages_exact(info->queue, size);
> >> + dma_free_coherent(vq->vdev->dev.parent, size,
> >> + info->queue, info->queue_dma_addr);
> >> kfree(info);
> >> }
> >>
> >> @@ -712,6 +716,13 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
> >> if (err)
> >> goto out;
> >>
> >> + /*
> >> + * We support 64-bit DMA. If this fails (e.g. some bridge
> >> + * or PV code doesn't or DAC is disabled), then we're okay
> >> + * with 32-bit DMA.
> >
> > <scratches his head>
> >
> > I am having a hard time parsing that. Could you expand a bit the
> > faulting use-case please?
> >
> >> + */
> >> + dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
> >
> > The usual process is:
> >
> > ret = dma_set_mask_and_coherent(..)
> > if (ret)
> > ret = dma_set_mask_and_coherent(.., DMA_BIT_MASK(32))
> >
> > if (ret)
> > pr_warn("We are truly screwed. Good luck!\n");
> >
>
> I assumed that, if dma_set_mask_and_coherent(..., DMA_BIT_MASK(64))
> fails, then we can still do DMA, just not 64-bit DMA. This driver
> should be fine with that -- it'll just be a bit slower.
Right. It should fall back to 32-bit (the default on PCI bus).
I was merely thinking of the error reporting might be useful
but on second thought - saying that the user is screwed - is
a bad idea.
>
> If that's not a safe assumption, I can change it.
>
> --Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-26 21:16 [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Andy Lutomirski
` (2 preceding siblings ...)
2014-08-26 21:17 ` [PATCH 3/3] virtio_pci: Use the DMA API for virtqueues Andy Lutomirski
@ 2014-08-27 6:46 ` Stefan Hajnoczi
2014-08-27 15:11 ` Andy Lutomirski
2014-08-27 17:27 ` Konrad Rzeszutek Wilk
2014-08-27 7:54 ` Cornelia Huck
4 siblings, 2 replies; 28+ messages in thread
From: Stefan Hajnoczi @ 2014-08-27 6:46 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390, Konrad Rzeszutek Wilk, Linux Virtualization,
Michael S. Tsirkin, linux390, virtio-dev
On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> There are two outstanding issues. virtio_net warns if DMA debugging
> is on because it does DMA from the stack. (The warning is correct.)
> This also is likely to do something unpleasant to s390.
> (Maintainers are cc'd -- I don't know what to do about it.)
This changes the semantics of vring and breaks existing guests when
bus address != physical address.
Can you use a transport feature bit to indicate that bus addresses are
used? That way both approaches can be supported.
Please also update the virtio specification:
https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
Stefan
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 6:46 ` [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Stefan Hajnoczi
@ 2014-08-27 15:11 ` Andy Lutomirski
2014-08-27 15:45 ` Michael S. Tsirkin
2014-08-27 17:27 ` Konrad Rzeszutek Wilk
1 sibling, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 15:11 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Benjamin Herrenschmidt, Linux Virtualization,
Christian Borntraeger, Michael S. Tsirkin, linux390@de.ibm.com,
virtio-dev
On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>
> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > There are two outstanding issues. virtio_net warns if DMA debugging
> > is on because it does DMA from the stack. (The warning is correct.)
> > This also is likely to do something unpleasant to s390.
> > (Maintainers are cc'd -- I don't know what to do about it.)
>
> This changes the semantics of vring and breaks existing guests when
> bus address != physical address.
>
> Can you use a transport feature bit to indicate that bus addresses are
> used? That way both approaches can be supported.
I can try to support both styles of addressing, but I don't think that
this can be negotiated between the device (i.e. host or physical
virtio-speaking device) and the guest. In the Xen case that I care
about (Linux on Xen on KVM), the host doesn't know about the
translation at all -- Xen is an intermediate layer that only the guest
is aware of. In this case, there are effectively two layers of
virtualization, and only the inner one (Xen) knows about the
translation despite the fact that the outer layer is the one
providing the virtio device.
I could change virtio_ring to use the DMA API only if requested by the
lower driver (virtio_pci, etc) and to have only virtio_pci enable that
feature. Will that work for all cases?
On s390, this should work just like the current code. On x86, I
think that if QEMU ever starts exposing an IOMMU attached to a
virtio-pci device, then QEMU should expect that IOMMU to be used. If
QEMU expects to see physical addresses, then it shouldn't advertise an
IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
be fine for everything that uses QEMU.
At least x86's implementation of the DMA ops for devices that aren't
behind an IOMMU should be very fast.
Are there any other weird cases for which this might be a problem?
>
> Please also update the virtio specification:
> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>
I'm not sure it will need an update. Perhaps a note in the PCI
section indicating that, if the host expects the guest to program an
IOMMU, then it should use the appropriate platform-specific mechanism
to expose that IOMMU.
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 15:11 ` Andy Lutomirski
@ 2014-08-27 15:45 ` Michael S. Tsirkin
2014-08-27 15:50 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2014-08-27 15:45 UTC (permalink / raw)
To: Andy Lutomirski
Cc: virtio-dev, linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Linux Virtualization, Christian Borntraeger,
Benjamin Herrenschmidt, linux390@de.ibm.com
On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
> >
> > On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > > There are two outstanding issues. virtio_net warns if DMA debugging
> > > is on because it does DMA from the stack. (The warning is correct.)
> > > This also is likely to do something unpleasant to s390.
> > > (Maintainers are cc'd -- I don't know what to do about it.)
> >
> > This changes the semantics of vring and breaks existing guests when
> > bus address != physical address.
> >
> > Can you use a transport feature bit to indicate that bus addresses are
> > used? That way both approaches can be supported.
>
> I can try to support both styles of addressing, but I don't think that
> this can be negotiated between the device (i.e. host or physical
> virtio-speaking device) and the guest. In the Xen case that I care
> about (Linux on Xen on KVM), the host doesn't know about the
> translation at all -- Xen is an intermediate layer that only the guest
> is aware of. In this case, there are effectively two layers of
> virtualization, and only the inner one (Xen) knows about the
> translation despite the fact that the the outer layer is the one
> providing the virtio device.
>
> I could change virtio_ring to use the DMA API only if requested by the
> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
> feature. Will that work for all cases?
>
> On s390, this shouldn't work just like the current code. On x86, I
> think that if QEMU ever starts exposing an IOMMU attached to a
> virtio-pci device, then QEMU should expect that IOMMU to be used. If
> QEMU expects to see physical addresses, then it shouldn't advertise an
> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
> be fine for everything that uses QEMU.
>
> At least x86's implementation of the DMA ops for devices that aren't
> behind an IOMMU should be very fast.
>
> Are there any other weird cases for which this might be a problem?
>
> >
> > Please also update the virtio specification:
> > https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
> >
>
> I'm not sure it will need an update. Perhaps a note in the PCI
> section indicating that, if the host expects the guest to program an
> IOMMU, then it should use the appropriate platform-specific mechanism
> to expose that IOMMU.
>
> --Andy
If there's no virtio mechanism to negotiate enabling/disabling
translations, then specification does not need an extension.
--
MST
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 15:45 ` Michael S. Tsirkin
@ 2014-08-27 15:50 ` Andy Lutomirski
2014-08-27 16:13 ` Christopher Covington
2014-08-27 20:52 ` Christian Borntraeger
0 siblings, 2 replies; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 15:50 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtio-dev, linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Linux Virtualization, Christian Borntraeger,
Benjamin Herrenschmidt, linux390@de.ibm.com
On Wed, Aug 27, 2014 at 8:45 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
>> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>> >
>> > On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> > > There are two outstanding issues. virtio_net warns if DMA debugging
>> > > is on because it does DMA from the stack. (The warning is correct.)
>> > > This also is likely to do something unpleasant to s390.
>> > > (Maintainers are cc'd -- I don't know what to do about it.)
>> >
>> > This changes the semantics of vring and breaks existing guests when
>> > bus address != physical address.
>> >
>> > Can you use a transport feature bit to indicate that bus addresses are
>> > used? That way both approaches can be supported.
>>
>> I can try to support both styles of addressing, but I don't think that
>> this can be negotiated between the device (i.e. host or physical
>> virtio-speaking device) and the guest. In the Xen case that I care
>> about (Linux on Xen on KVM), the host doesn't know about the
>> translation at all -- Xen is an intermediate layer that only the guest
>> is aware of. In this case, there are effectively two layers of
>> virtualization, and only the inner one (Xen) knows about the
>> translation despite the fact that the the outer layer is the one
>> providing the virtio device.
>>
>> I could change virtio_ring to use the DMA API only if requested by the
>> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
>> feature. Will that work for all cases?
>>
>> On s390, this shouldn't work just like the current code. On x86, I
>> think that if QEMU ever starts exposing an IOMMU attached to a
>> virtio-pci device, then QEMU should expect that IOMMU to be used. If
>> QEMU expects to see physical addresses, then it shouldn't advertise an
>> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
>> be fine for everything that uses QEMU.
>>
>> At least x86's implementation of the DMA ops for devices that aren't
>> behind an IOMMU should be very fast.
>>
>> Are there any other weird cases for which this might be a problem?
>>
>> >
>> > Please also update the virtio specification:
>> > https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>> >
>>
>> I'm not sure it will need an update. Perhaps a note in the PCI
>> section indicating that, if the host expects the guest to program an
>> IOMMU, then it should use the appropriate platform-specific mechanism
>> to expose that IOMMU.
>>
>> --Andy
>
> If there's no virtio mechanism to negotate enabling/disabling
> translations, then specification does not need an extension.
It wouldn't shock me if someone wants to negotiate this for
virtio_mmio some day, but that might be more of a device tree thing.
I have no idea how that works, but I think it can wait until someone
wants it.
I updated the patches, and I'll send them out after I try to test-boot
s390 under QEMU :)
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 15:50 ` Andy Lutomirski
@ 2014-08-27 16:13 ` Christopher Covington
2014-08-27 16:19 ` Andy Lutomirski
2014-08-27 20:52 ` Christian Borntraeger
1 sibling, 1 reply; 28+ messages in thread
From: Christopher Covington @ 2014-08-27 16:13 UTC (permalink / raw)
To: Andy Lutomirski
Cc: virtio-dev, linux-s390@vger.kernel.org, Benjamin Herrenschmidt,
Konrad Rzeszutek Wilk, Linux Virtualization,
Christian Borntraeger, Michael S. Tsirkin, linux390@de.ibm.com,
linux-arm-kernel@lists.infradead.org
On 08/27/2014 11:50 AM, Andy Lutomirski wrote:
> On Wed, Aug 27, 2014 at 8:45 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
>>> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>>>>
>>>> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> There are two outstanding issues. virtio_net warns if DMA debugging
>>>>> is on because it does DMA from the stack. (The warning is correct.)
>>>>> This also is likely to do something unpleasant to s390.
>>>>> (Maintainers are cc'd -- I don't know what to do about it.)
>>>>
>>>> This changes the semantics of vring and breaks existing guests when
>>>> bus address != physical address.
>>>>
>>>> Can you use a transport feature bit to indicate that bus addresses are
>>>> used? That way both approaches can be supported.
>>>
>>> I can try to support both styles of addressing, but I don't think that
>>> this can be negotiated between the device (i.e. host or physical
>>> virtio-speaking device) and the guest. In the Xen case that I care
>>> about (Linux on Xen on KVM), the host doesn't know about the
>>> translation at all -- Xen is an intermediate layer that only the guest
>>> is aware of. In this case, there are effectively two layers of
>>> virtualization, and only the inner one (Xen) knows about the
>>> translation despite the fact that the the outer layer is the one
>>> providing the virtio device.
>>>
>>> I could change virtio_ring to use the DMA API only if requested by the
>>> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
>>> feature. Will that work for all cases?
>>>
>>> On s390, this shouldn't work just like the current code. On x86, I
>>> think that if QEMU ever starts exposing an IOMMU attached to a
>>> virtio-pci device, then QEMU should expect that IOMMU to be used. If
>>> QEMU expects to see physical addresses, then it shouldn't advertise an
>>> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
>>> be fine for everything that uses QEMU.
>>>
>>> At least x86's implementation of the DMA ops for devices that aren't
>>> behind an IOMMU should be very fast.
>>>
>>> Are there any other weird cases for which this might be a problem?
>>>
>>>>
>>>> Please also update the virtio specification:
>>>> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>>>>
>>>
>>> I'm not sure it will need an update. Perhaps a note in the PCI
>>> section indicating that, if the host expects the guest to program an
>>> IOMMU, then it should use the appropriate platform-specific mechanism
>>> to expose that IOMMU.
>>>
>>> --Andy
>>
>> If there's no virtio mechanism to negotate enabling/disabling
>> translations, then specification does not need an extension.
>
> It wouldn't shock me if someone wants to negotiate this for
> virtio_mmio some day, but that might be more of a device tree thing.
> I have no idea how that works, but I think it can wait until someone
> wants it.
At one point I wanted VirtIO-MMIO to not fail miserably on ARM LPAE systems.
http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/241613.html
Christopher
--
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 16:13 ` Christopher Covington
@ 2014-08-27 16:19 ` Andy Lutomirski
2014-08-27 17:34 ` Christopher Covington
0 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 16:19 UTC (permalink / raw)
To: Christopher Covington
Cc: virtio-dev, linux-s390@vger.kernel.org, Benjamin Herrenschmidt,
Konrad Rzeszutek Wilk, Linux Virtualization,
Christian Borntraeger, Michael S. Tsirkin, linux390@de.ibm.com,
linux-arm-kernel@lists.infradead.org
On Wed, Aug 27, 2014 at 9:13 AM, Christopher Covington
<cov@codeaurora.org> wrote:
> On 08/27/2014 11:50 AM, Andy Lutomirski wrote:
>> On Wed, Aug 27, 2014 at 8:45 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
>>>> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>>>>>
>>>>> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>> There are two outstanding issues. virtio_net warns if DMA debugging
>>>>>> is on because it does DMA from the stack. (The warning is correct.)
>>>>>> This also is likely to do something unpleasant to s390.
>>>>>> (Maintainers are cc'd -- I don't know what to do about it.)
>>>>>
>>>>> This changes the semantics of vring and breaks existing guests when
>>>>> bus address != physical address.
>>>>>
>>>>> Can you use a transport feature bit to indicate that bus addresses are
>>>>> used? That way both approaches can be supported.
>>>>
>>>> I can try to support both styles of addressing, but I don't think that
>>>> this can be negotiated between the device (i.e. host or physical
>>>> virtio-speaking device) and the guest. In the Xen case that I care
>>>> about (Linux on Xen on KVM), the host doesn't know about the
>>>> translation at all -- Xen is an intermediate layer that only the guest
>>>> is aware of. In this case, there are effectively two layers of
>>>> virtualization, and only the inner one (Xen) knows about the
>>>> translation despite the fact that the the outer layer is the one
>>>> providing the virtio device.
>>>>
>>>> I could change virtio_ring to use the DMA API only if requested by the
>>>> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
>>>> feature. Will that work for all cases?
>>>>
>>>> On s390, this shouldn't work just like the current code. On x86, I
>>>> think that if QEMU ever starts exposing an IOMMU attached to a
>>>> virtio-pci device, then QEMU should expect that IOMMU to be used. If
>>>> QEMU expects to see physical addresses, then it shouldn't advertise an
>>>> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
>>>> be fine for everything that uses QEMU.
>>>>
>>>> At least x86's implementation of the DMA ops for devices that aren't
>>>> behind an IOMMU should be very fast.
>>>>
>>>> Are there any other weird cases for which this might be a problem?
>>>>
>>>>>
>>>>> Please also update the virtio specification:
>>>>> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>>>>>
>>>>
>>>> I'm not sure it will need an update. Perhaps a note in the PCI
>>>> section indicating that, if the host expects the guest to program an
>>>> IOMMU, then it should use the appropriate platform-specific mechanism
>>>> to expose that IOMMU.
>>>>
>>>> --Andy
>>>
>>> If there's no virtio mechanism to negotate enabling/disabling
>>> translations, then specification does not need an extension.
>>
>> It wouldn't shock me if someone wants to negotiate this for
>> virtio_mmio some day, but that might be more of a device tree thing.
>> I have no idea how that works, but I think it can wait until someone
>> wants it.
>
> At one point I wanted VirtIO-MMIO to not fail miserably on ARM LPAE systems.
>
> http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/241613.html
>
Since I know nothing about LPAE, I'll leave this to you :) But I'll
cc you on patch v2 soon, and please tell me whether my code makes
sense on ARM.
(My attempt to boot-test s390 failed because I have no clue what
command-line options to pass to QEMU. If anyone wants to give me some
pointers to get a working configuration with -kernel and some kind of
console, I can add support to virtme. Alas, I think that no one ever
bothered to implement 9p over virtio-ccw in QEMU. Why exactly does
the virtio stuff in QEMU require that you instantiate virtio-9p-pci
instead of just asking for an appropriate virtio device?)
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 16:19 ` Andy Lutomirski
@ 2014-08-27 17:34 ` Christopher Covington
2014-08-27 17:37 ` Andy Lutomirski
0 siblings, 1 reply; 28+ messages in thread
From: Christopher Covington @ 2014-08-27 17:34 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Benjamin Herrenschmidt, Michael S. Tsirkin, Linux Virtualization,
Christian Borntraeger, linux390@de.ibm.com,
linux-arm-kernel@lists.infradead.org
On 08/27/2014 12:19 PM, Andy Lutomirski wrote:
> On Wed, Aug 27, 2014 at 9:13 AM, Christopher Covington
> <cov@codeaurora.org> wrote:
>> On 08/27/2014 11:50 AM, Andy Lutomirski wrote:
>>> On Wed, Aug 27, 2014 at 8:45 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
>>>>> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>>>>>>
>>>>>> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>>>> There are two outstanding issues. virtio_net warns if DMA debugging
>>>>>>> is on because it does DMA from the stack. (The warning is correct.)
>>>>>>> This also is likely to do something unpleasant to s390.
>>>>>>> (Maintainers are cc'd -- I don't know what to do about it.)
>>>>>>
>>>>>> This changes the semantics of vring and breaks existing guests when
>>>>>> bus address != physical address.
>>>>>>
>>>>>> Can you use a transport feature bit to indicate that bus addresses are
>>>>>> used? That way both approaches can be supported.
>>>>>
>>>>> I can try to support both styles of addressing, but I don't think that
>>>>> this can be negotiated between the device (i.e. host or physical
>>>>> virtio-speaking device) and the guest. In the Xen case that I care
>>>>> about (Linux on Xen on KVM), the host doesn't know about the
>>>>> translation at all -- Xen is an intermediate layer that only the guest
>>>>> is aware of. In this case, there are effectively two layers of
>>>>> virtualization, and only the inner one (Xen) knows about the
>>>>> translation despite the fact that the outer layer is the one
>>>>> providing the virtio device.
>>>>>
>>>>> I could change virtio_ring to use the DMA API only if requested by the
>>>>> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
>>>>> feature. Will that work for all cases?
>>>>>
>>>>> On s390, this should work just like the current code. On x86, I
>>>>> think that if QEMU ever starts exposing an IOMMU attached to a
>>>>> virtio-pci device, then QEMU should expect that IOMMU to be used. If
>>>>> QEMU expects to see physical addresses, then it shouldn't advertise an
>>>>> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
>>>>> be fine for everything that uses QEMU.
>>>>>
>>>>> At least x86's implementation of the DMA ops for devices that aren't
>>>>> behind an IOMMU should be very fast.
>>>>>
>>>>> Are there any other weird cases for which this might be a problem?
>>>>>
>>>>>>
>>>>>> Please also update the virtio specification:
>>>>>> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>>>>>>
>>>>>
>>>>> I'm not sure it will need an update. Perhaps a note in the PCI
>>>>> section indicating that, if the host expects the guest to program an
>>>>> IOMMU, then it should use the appropriate platform-specific mechanism
>>>>> to expose that IOMMU.
>>>>>
>>>>> --Andy
>>>>
>>>> If there's no virtio mechanism to negotiate enabling/disabling
>>>> translations, then the specification does not need an extension.
>>>
>>> It wouldn't shock me if someone wants to negotiate this for
>>> virtio_mmio some day, but that might be more of a device tree thing.
>>> I have no idea how that works, but I think it can wait until someone
>>> wants it.
>>
>> At one point I wanted VirtIO-MMIO to not fail miserably on ARM LPAE systems.
>>
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/241613.html
>>
>
> Since I know nothing about LPAE, I'll leave this to you :) But I'll
> cc you on patch v2 soon, and please tell me whether my code makes
> sense on ARM.
>
> (My attempt to boot-test s390 failed because I have no clue what
> command-line options to pass to QEMU. If anyone wants to give me some
> pointers to get a working configuration with -kernel and some kind of
> console, I can add support to virtme. Alas, I think that no one ever
> bothered to implement 9p over virtio-ccw in QEMU. Why exactly does
> the virtio stuff in QEMU require that you instantiate virtio-9p-pci
> instead of just asking for an appropriate virtio device?)
Virtme looks interesting. If it's any use, here is my modest QEMU command line
collection.
$dir/bin/x86_64-linux-gnu/qemu-system-x86_64 \
-m 1024 \
-kernel $dir/boot/x86_64-linux-gnu/bzImage-x86_64 \
-append 'root=/dev/root init=/sbin/x86_64-linux-gnu/init rootfstype=9p
rootflags=trans=virtio,version=9p2000.L ro console=ttyS0' \
-serial stdio \
-fsdev local,id=root,path=$dir,security_model=none \
-device virtio-9p-pci,fsdev=root,mount_tag=/dev/root \
-monitor none
$dir/bin/x86_64-linux-gnu/qemu-system-arm \
-M virt \
-cpu cortex-a15 \
-m 1024 \
-kernel $dir/boot/arm-linux-gnueabihf/Image \
-append 'root=/dev/root init=/sbin/arm-linux-gnueabihf/init rootfstype=9p
rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
-serial stdio \
-fsdev local,id=root,path=$dir,security_model=none \
-device virtio-9p-device,fsdev=root,mount_tag=/dev/root \
-monitor none
$dir/bin/x86_64-linux-gnu/qemu-system-aarch64 \
-M virt \
-cpu cortex-a57 \
-m 1024 \
-kernel $dir/boot/aarch64-linux-gnu/Image \
-append 'root=/dev/root init=/sbin/aarch64-linux-gnu/init rootfstype=9p
rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
-serial stdio \
-fsdev local,id=root,path=$dir,security_model=none \
-device virtio-9p-device,fsdev=root,mount_tag=/dev/root -monitor none
Christopher
--
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 17:34 ` Christopher Covington
@ 2014-08-27 17:37 ` Andy Lutomirski
2014-08-27 17:50 ` Christopher Covington
0 siblings, 1 reply; 28+ messages in thread
From: Andy Lutomirski @ 2014-08-27 17:37 UTC (permalink / raw)
To: Christopher Covington
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Benjamin Herrenschmidt, Michael S. Tsirkin, Linux Virtualization,
Christian Borntraeger, linux390@de.ibm.com,
linux-arm-kernel@lists.infradead.org
On Wed, Aug 27, 2014 at 10:34 AM, Christopher Covington
<cov@codeaurora.org> wrote:
> On 08/27/2014 12:19 PM, Andy Lutomirski wrote:
>> On Wed, Aug 27, 2014 at 9:13 AM, Christopher Covington
>> <cov@codeaurora.org> wrote:
>
> Virtme looks interesting. If it's any use, here is my modest QEMU command line
> collection.
>
> $dir/bin/x86_64-linux-gnu/qemu-system-x86_64 \
> -m 1024 \
> -kernel $dir/boot/x86_64-linux-gnu/bzImage-x86_64 \
> -append 'root=/dev/root init=/sbin/x86_64-linux-gnu/init rootfstype=9p
> rootflags=trans=virtio,version=9p2000.L ro console=ttyS0' \
> -serial stdio \
> -fsdev local,id=root,path=$dir,security_model=none \
> -device virtio-9p-pci,fsdev=root,mount_tag=/dev/root \
> -monitor none
This is similar to what virtme does, except that virtme will give you
a much more functional system at the cost of a small fraction of a
second of additional boot time. And virtme sometimes supports modules
:)
>
> $dir/bin/x86_64-linux-gnu/qemu-system-arm \
> -M virt \
> -cpu cortex-a15 \
> -m 1024 \
> -kernel $dir/boot/arm-linux-gnueabihf/Image \
> -append 'root=/dev/root init=/sbin/arm-linux-gnueabihf/init rootfstype=9p
> rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
> -serial stdio \
> -fsdev local,id=root,path=$dir,security_model=none \
> -device virtio-9p-device,fsdev=root,mount_tag=/dev/root \
> -monitor none
Can you give me some hints as to how to configure a kernel for this?
I tried to get anything other than versatilepb working and gave up.
>
> $dir/bin/x86_64-linux-gnu/qemu-system-aarch64 \
> -M virt \
> -cpu cortex-a57 \
> -m 1024 \
> -kernel $dir/boot/aarch64-linux-gnu/Image \
> -append 'root=/dev/root init=/sbin/aarch64-linux-gnu/init rootfstype=9p
> rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
> -serial stdio \
> -fsdev local,id=root,path=$dir,security_model=none \
> -device virtio-9p-device,fsdev=root,mount_tag=/dev/root -monitor none
>
Ooh, thanks! I'll add support for this to virtme.
--Andy
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 17:37 ` Andy Lutomirski
@ 2014-08-27 17:50 ` Christopher Covington
0 siblings, 0 replies; 28+ messages in thread
From: Christopher Covington @ 2014-08-27 17:50 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Benjamin Herrenschmidt, Michael S. Tsirkin, Linux Virtualization,
Christian Borntraeger, linux390@de.ibm.com,
linux-arm-kernel@lists.infradead.org
On 08/27/2014 01:37 PM, Andy Lutomirski wrote:
> On Wed, Aug 27, 2014 at 10:34 AM, Christopher Covington
> <cov@codeaurora.org> wrote:
>> On 08/27/2014 12:19 PM, Andy Lutomirski wrote:
>>> On Wed, Aug 27, 2014 at 9:13 AM, Christopher Covington
>>> <cov@codeaurora.org> wrote:
>>
>> Virtme looks interesting. If it's any use, here is my modest QEMU command line
>> collection.
>>
>> $dir/bin/x86_64-linux-gnu/qemu-system-x86_64 \
>> -m 1024 \
>> -kernel $dir/boot/x86_64-linux-gnu/bzImage-x86_64 \
>> -append 'root=/dev/root init=/sbin/x86_64-linux-gnu/init rootfstype=9p
>> rootflags=trans=virtio,version=9p2000.L ro console=ttyS0' \
>> -serial stdio \
>> -fsdev local,id=root,path=$dir,security_model=none \
>> -device virtio-9p-pci,fsdev=root,mount_tag=/dev/root \
>> -monitor none
>
> This is similar to what virtme does, except that virtme will give you
> a much more functional system at the cost of a small fraction of a
> second of additional boot time. And virtme sometimes supports modules
> :)
>
>>
>> $dir/bin/x86_64-linux-gnu/qemu-system-arm \
>> -M virt \
>> -cpu cortex-a15 \
>> -m 1024 \
>> -kernel $dir/boot/arm-linux-gnueabihf/Image \
>> -append 'root=/dev/root init=/sbin/arm-linux-gnueabihf/init rootfstype=9p
>> rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
>> -serial stdio \
>> -fsdev local,id=root,path=$dir,security_model=none \
>> -device virtio-9p-device,fsdev=root,mount_tag=/dev/root \
>> -monitor none
>
> Can you give me some hints as to how to configure a kernel for this?
> I tried to get anything other than versatilepb working and gave up.
I think you can just use vexpress_defconfig. If you want to from there you can
unselect CONFIG_ARCH_VEXPRESS and set CONFIG_ARCH_VIRT=y, but I don't think
that's strictly necessary.
>> $dir/bin/x86_64-linux-gnu/qemu-system-aarch64 \
>> -M virt \
>> -cpu cortex-a57 \
>> -m 1024 \
>> -kernel $dir/boot/aarch64-linux-gnu/Image \
>> -append 'root=/dev/root init=/sbin/aarch64-linux-gnu/init rootfstype=9p
>> rootflags=trans=virtio,version=9p2000.L ro console=ttyAMA0' \
>> -serial stdio \
>> -fsdev local,id=root,path=$dir,security_model=none \
>> -device virtio-9p-device,fsdev=root,mount_tag=/dev/root -monitor none
>>
>
> Ooh, thanks! I'll add support for this to virtme.
Cheers,
Christopher
--
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 15:50 ` Andy Lutomirski
2014-08-27 16:13 ` Christopher Covington
@ 2014-08-27 20:52 ` Christian Borntraeger
1 sibling, 0 replies; 28+ messages in thread
From: Christian Borntraeger @ 2014-08-27 20:52 UTC (permalink / raw)
To: Andy Lutomirski, Michael S. Tsirkin
Cc: virtio-dev, linux-s390@vger.kernel.org, Konrad Rzeszutek Wilk,
Linux Virtualization, Benjamin Herrenschmidt, linux390@de.ibm.com
On 27/08/14 17:50, Andy Lutomirski wrote:
> On Wed, Aug 27, 2014 at 8:45 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Wed, Aug 27, 2014 at 08:11:15AM -0700, Andy Lutomirski wrote:
>>> On Aug 26, 2014 11:46 PM, "Stefan Hajnoczi" <stefanha@gmail.com> wrote:
>>>>
>>>> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> There are two outstanding issues. virtio_net warns if DMA debugging
>>>>> is on because it does DMA from the stack. (The warning is correct.)
>>>>> This also is likely to do something unpleasant to s390.
>>>>> (Maintainers are cc'd -- I don't know what to do about it.)
>>>>
>>>> This changes the semantics of vring and breaks existing guests when
>>>> bus address != physical address.
>>>>
>>>> Can you use a transport feature bit to indicate that bus addresses are
>>>> used? That way both approaches can be supported.
>>>
>>> I can try to support both styles of addressing, but I don't think that
>>> this can be negotiated between the device (i.e. host or physical
>>> virtio-speaking device) and the guest. In the Xen case that I care
>>> about (Linux on Xen on KVM), the host doesn't know about the
>>> translation at all -- Xen is an intermediate layer that only the guest
>>> is aware of. In this case, there are effectively two layers of
>>> virtualization, and only the inner one (Xen) knows about the
>>> translation despite the fact that the outer layer is the one
>>> providing the virtio device.
>>>
>>> I could change virtio_ring to use the DMA API only if requested by the
>>> lower driver (virtio_pci, etc) and to have only virtio_pci enable that
>>> feature. Will that work for all cases?
>>>
>>> On s390, this should work just like the current code. On x86, I
>>> think that if QEMU ever starts exposing an IOMMU attached to a
>>> virtio-pci device, then QEMU should expect that IOMMU to be used. If
>>> QEMU expects to see physical addresses, then it shouldn't advertise an
>>> IOMMU. Since QEMU doesn't currently support guest IOMMUs, this should
>>> be fine for everything that uses QEMU.
>>>
>>> At least x86's implementation of the DMA ops for devices that aren't
>>> behind an IOMMU should be very fast.
>>>
>>> Are there any other weird cases for which this might be a problem?
>>>
>>>>
>>>> Please also update the virtio specification:
>>>> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>>>>
>>>
>>> I'm not sure it will need an update. Perhaps a note in the PCI
>>> section indicating that, if the host expects the guest to program an
>>> IOMMU, then it should use the appropriate platform-specific mechanism
>>> to expose that IOMMU.
>>>
>>> --Andy
>>
>> If there's no virtio mechanism to negotiate enabling/disabling
>> translations, then the specification does not need an extension.
>
> It wouldn't shock me if someone wants to negotiate this for
> virtio_mmio some day, but that might be more of a device tree thing.
> I have no idea how that works, but I think it can wait until someone
> wants it.
>
> I updated the patches, and I'll send them out after I try to test-boot
> s390 under QEMU :)
The emulation of several parts of s390 including the ccw stuff is currently
broken in QEMU. I can test new version.
Christian
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 6:46 ` [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Stefan Hajnoczi
2014-08-27 15:11 ` Andy Lutomirski
@ 2014-08-27 17:27 ` Konrad Rzeszutek Wilk
2014-08-27 18:10 ` Stefan Hajnoczi
1 sibling, 1 reply; 28+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-08-27 17:27 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: linux-s390, Michael S. Tsirkin, Linux Virtualization, linux390,
Andy Lutomirski, virtio-dev
On Wed, Aug 27, 2014 at 07:46:46AM +0100, Stefan Hajnoczi wrote:
> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > There are two outstanding issues. virtio_net warns if DMA debugging
> > is on because it does DMA from the stack. (The warning is correct.)
> > This also is likely to do something unpleasant to s390.
> > (Maintainers are cc'd -- I don't know what to do about it.)
>
> This changes the semantics of vring and breaks existing guests when
> bus address != physical address.
Isn't that what this is supposed to fix? Right now the semantics of the
vring is that bus address == physical address.
>
> Can you use a transport feature bit to indicate that bus addresses are
> used? That way both approaches can be supported.
>
> Please also update the virtio specification:
> https://tools.oasis-open.org/version-control/browse/wsvn/virtio/
>
> Stefan
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 17:27 ` Konrad Rzeszutek Wilk
@ 2014-08-27 18:10 ` Stefan Hajnoczi
2014-08-27 18:58 ` Konrad Rzeszutek Wilk
0 siblings, 1 reply; 28+ messages in thread
From: Stefan Hajnoczi @ 2014-08-27 18:10 UTC (permalink / raw)
To: Konrad Rzeszutek Wilk
Cc: linux-s390, Michael S. Tsirkin, Linux Virtualization, linux390,
Andy Lutomirski, virtio-dev
On Wed, Aug 27, 2014 at 6:27 PM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
> On Wed, Aug 27, 2014 at 07:46:46AM +0100, Stefan Hajnoczi wrote:
>> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> > There are two outstanding issues. virtio_net warns if DMA debugging
>> > is on because it does DMA from the stack. (The warning is correct.)
>> > This also is likely to do something unpleasant to s390.
>> > (Maintainers are cc'd -- I don't know what to do about it.)
>>
>> This changes the semantics of vring and breaks existing guests when
>> bus address != physical address.
>
> Isn't that what this is supposed to fix? Right now the semantics of the
> vring is that bus address == physical address.
Today the assumption is that the device emulation code has access to
guest physical memory (no translation necessary).
Changing the semantics breaks today's guests. You need to make this
change in a way so that existing guests don't break.
Stefan
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-27 18:10 ` Stefan Hajnoczi
@ 2014-08-27 18:58 ` Konrad Rzeszutek Wilk
0 siblings, 0 replies; 28+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-08-27 18:58 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: linux-s390, Michael S. Tsirkin, Linux Virtualization, linux390,
Andy Lutomirski, virtio-dev
On Wed, Aug 27, 2014 at 07:10:20PM +0100, Stefan Hajnoczi wrote:
> On Wed, Aug 27, 2014 at 6:27 PM, Konrad Rzeszutek Wilk
> <konrad.wilk@oracle.com> wrote:
> > On Wed, Aug 27, 2014 at 07:46:46AM +0100, Stefan Hajnoczi wrote:
> >> On Tue, Aug 26, 2014 at 10:16 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> > There are two outstanding issues. virtio_net warns if DMA debugging
> >> > is on because it does DMA from the stack. (The warning is correct.)
> >> > This also is likely to do something unpleasant to s390.
> >> > (Maintainers are cc'd -- I don't know what to do about it.)
> >>
> >> This changes the semantics of vring and breaks existing guests when
> >> bus address != physical address.
> >
> > Isn't that what this is supposed to fix? Right now the semantics of the
> > vring is that bus address == physical address.
>
> Today the assumption is that the device emulation code has access to
> guest physical memory (no translation necessary).
Right.
>
> Changing the semantics breaks today's guests. You need to make this
> change in a way so that existing guests don't break.
Well, the DMA API will preserve that semantic as under KVM it will
return phys == bus.
But that is a side-effect of the DMA API implementation preserving
this semantic so I understand your point.
>
> Stefan
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API
2014-08-26 21:16 [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Andy Lutomirski
` (3 preceding siblings ...)
2014-08-27 6:46 ` [PATCH 0/3] virtio: Clean up scatterlists and use the DMA API Stefan Hajnoczi
@ 2014-08-27 7:54 ` Cornelia Huck
4 siblings, 0 replies; 28+ messages in thread
From: Cornelia Huck @ 2014-08-27 7:54 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-s390, Konrad Rzeszutek Wilk, virtualization,
Christian Borntraeger, Michael S. Tsirkin, linux390
On Tue, 26 Aug 2014 14:16:59 -0700
Andy Lutomirski <luto@amacapital.net> wrote:
> This fixes virtio on Xen guests as well as on any other platform on
> which physical addresses don't match bus addresses.
>
> This can be tested with:
>
> virtme-run --xen xen --kimg arch/x86/boot/bzImage --console
>
> using virtme from here:
>
> https://git.kernel.org/cgit/utils/kernel/virtme/virtme.git
>
> Without these patches, the guest hangs forever. With these patches,
> everything works.
>
> There are two outstanding issues. virtio_net warns if DMA debugging
> is on because it does DMA from the stack. (The warning is correct.)
> This also is likely to do something unpleasant to s390.
s/is likely to do something unpleasant to/breaks/
Sorry, this just won't work for s390, as Christian already explained.
> (Maintainers are cc'd -- I don't know what to do about it.)
>
> Andy Lutomirski (3):
> virtio_ring: Remove sg_next indirection
> virtio_ring: Use DMA APIs
> virtio_pci: Use the DMA API for virtqueues
>
> drivers/virtio/Kconfig | 1 +
> drivers/virtio/virtio_pci.c | 25 ++++++--
> drivers/virtio/virtio_ring.c | 150 ++++++++++++++++++++++++++++++-------------
> 3 files changed, 125 insertions(+), 51 deletions(-)
>
Aren't you also changing some basic assumptions with the conversion to
DMA apis? virtqueues currently aren't accessed via dma (or it wouldn't
work on s390); if you want (say) virtio-pci to use dma, shouldn't that
be something that is negotiated between guest and host?
^ permalink raw reply [flat|nested] 28+ messages in thread