* [RFC] Rewrite page fault interception in TTM drivers
From: Matthew Wilcox @ 2018-04-15 3:31 UTC (permalink / raw)
To: dri-devel, amd-gfx, virtualization
Three^W Two of the TTM drivers intercept the pagefault handler in a rather
un-Linuxy and racy way. If they really must intercept the fault call
(and it's not clear to me that they need to), I'd rather see them do it
like this.
The QXL driver seems least likely to need it; as the virtio driver has
exactly the same code in it, only commented out. The radeon driver takes
its own lock ... maybe there's a way to avoid that since no other driver
needs to take its own lock at this point?
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index ee2340e31f06..d2c76a3d6816 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -102,21 +102,23 @@ static void qxl_ttm_global_fini(struct qxl_device *qdev)
}
}
-static struct vm_operations_struct qxl_ttm_vm_ops;
-static const struct vm_operations_struct *ttm_vm_ops;
-
static int qxl_ttm_fault(struct vm_fault *vmf)
{
struct ttm_buffer_object *bo;
- int r;
bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data;
if (bo == NULL)
return VM_FAULT_NOPAGE;
- r = ttm_vm_ops->fault(vmf);
- return r;
+ return ttm_bo_vm_fault(vmf);
}
+static const struct vm_operations_struct qxl_ttm_vm_ops = {
+ .fault = qxl_ttm_fault,
+ .open = ttm_bo_vm_open,
+ .close = ttm_bo_vm_close,
+ .access = ttm_bo_vm_access,
+};
+
int qxl_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct drm_file *file_priv;
@@ -139,11 +141,6 @@ int qxl_mmap(struct file *filp, struct vm_area_struct *vma)
r = ttm_bo_mmap(filp, vma, &qdev->mman.bdev);
if (unlikely(r != 0))
return r;
- if (unlikely(ttm_vm_ops == NULL)) {
- ttm_vm_ops = vma->vm_ops;
- qxl_ttm_vm_ops = *ttm_vm_ops;
- qxl_ttm_vm_ops.fault = &qxl_ttm_fault;
- }
vma->vm_ops = &qxl_ttm_vm_ops;
return 0;
}
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 8689fcca051c..08cc0c5b9e94 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -944,9 +944,6 @@ void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size)
man->size = size >> PAGE_SHIFT;
}
-static struct vm_operations_struct radeon_ttm_vm_ops;
-static const struct vm_operations_struct *ttm_vm_ops = NULL;
-
static int radeon_ttm_fault(struct vm_fault *vmf)
{
struct ttm_buffer_object *bo;
@@ -959,11 +956,18 @@ static int radeon_ttm_fault(struct vm_fault *vmf)
}
rdev = radeon_get_rdev(bo->bdev);
down_read(&rdev->pm.mclk_lock);
- r = ttm_vm_ops->fault(vmf);
+ r = ttm_bo_vm_fault(vmf);
up_read(&rdev->pm.mclk_lock);
return r;
}
+static const struct vm_operations_struct radeon_ttm_vm_ops = {
+ .fault = radeon_ttm_fault,
+ .open = ttm_bo_vm_open,
+ .close = ttm_bo_vm_close,
+ .access = ttm_bo_vm_access,
+};
+
int radeon_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct drm_file *file_priv;
@@ -983,11 +987,6 @@ int radeon_mmap(struct file *filp, struct vm_area_struct *vma)
if (unlikely(r != 0)) {
return r;
}
- if (unlikely(ttm_vm_ops == NULL)) {
- ttm_vm_ops = vma->vm_ops;
- radeon_ttm_vm_ops = *ttm_vm_ops;
- radeon_ttm_vm_ops.fault = &radeon_ttm_fault;
- }
vma->vm_ops = &radeon_ttm_vm_ops;
return 0;
}
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 8eba95b3c737..f59a8f41aae0 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -104,7 +104,7 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
+ page_offset;
}
-static int ttm_bo_vm_fault(struct vm_fault *vmf)
+int ttm_bo_vm_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
@@ -294,8 +294,9 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
ttm_bo_unreserve(bo);
return ret;
}
+EXPORT_SYMBOL_GPL(ttm_bo_vm_fault);
-static void ttm_bo_vm_open(struct vm_area_struct *vma)
+void ttm_bo_vm_open(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo =
(struct ttm_buffer_object *)vma->vm_private_data;
@@ -304,14 +305,16 @@ static void ttm_bo_vm_open(struct vm_area_struct *vma)
(void)ttm_bo_reference(bo);
}
+EXPORT_SYMBOL_GPL(ttm_bo_vm_open);
-static void ttm_bo_vm_close(struct vm_area_struct *vma)
+void ttm_bo_vm_close(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)vma->vm_private_data;
ttm_bo_unref(&bo);
vma->vm_private_data = NULL;
}
+EXPORT_SYMBOL_GPL(ttm_bo_vm_close);
static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo,
unsigned long offset,
@@ -352,7 +355,7 @@ static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo,
return len;
}
-static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
+int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
unsigned long offset = (addr) - vma->vm_start;
@@ -389,6 +392,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
+EXPORT_SYMBOL_GPL(ttm_bo_vm_access);
static const struct vm_operations_struct ttm_bo_vm_ops = {
.fault = ttm_bo_vm_fault,
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index c67977aa1a0e..5935fbfc07f5 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -713,6 +713,12 @@ void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot);
void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot);
+int ttm_bo_vm_fault(struct vm_fault *vmf);
+void ttm_bo_vm_open(struct vm_area_struct *vma);
+void ttm_bo_vm_close(struct vm_area_struct *vma);
+int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write);
+
/**
* ttm_bo_io
*
^ permalink raw reply related
* Re: [RFC v2] virtio: support packed ring
From: Tiwei Bie @ 2018-04-14 11:22 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180413181808-mutt-send-email-mst@kernel.org>
On Fri, Apr 13, 2018 at 06:22:45PM +0300, Michael S. Tsirkin wrote:
> On Sun, Apr 01, 2018 at 10:12:16PM +0800, Tiwei Bie wrote:
> > +static inline bool more_used(const struct vring_virtqueue *vq)
> > +{
> > + return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> > +}
> > +
> > +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> > + void **ctx)
> > +{
> > + struct vring_virtqueue *vq = to_vvq(_vq);
> > + void *ret;
> > + unsigned int i;
> > + u16 last_used;
> > +
> > + START_USE(vq);
> > +
> > + if (unlikely(vq->broken)) {
> > + END_USE(vq);
> > + return NULL;
> > + }
> > +
> > + if (!more_used(vq)) {
> > + pr_debug("No more buffers in queue\n");
> > + END_USE(vq);
> > + return NULL;
> > + }
>
> So virtqueue_get_buf_ctx_split should only call more_used_split.
Yeah, you're right! Will fix this in the next version.
>
> to avoid such issues I think we should lay out the code like this:
>
> XXX_split
>
> XXX_packed
>
> XXX wrappers
I'll do it. Thanks for the suggestion!
>
> > +/* The standard layout
>
> I'd drop standard here.
Got it. I'll drop the word "standard".
>
> > for the packed ring is a continuous chunk of memory
> > + * which looks like this.
> > + *
> > + * struct vring_packed
> > + * {
>
> Can the opening bracket go on the prev line pls?
Sure.
>
> > + * // The actual descriptors (16 bytes each)
> > + * struct vring_packed_desc desc[num];
> > + *
> > + * // Padding to the next align boundary.
> > + * char pad[];
> > + *
> > + * // Driver Event Suppression
> > + * struct vring_packed_desc_event driver;
> > + *
> > + * // Device Event Suppression
> > + * struct vring_packed_desc_event device;
>
> Maybe that's how our driver does it but it's not based on spec
> so I don't think this belongs in the header.
I will move it to the place where vring_packed_init()
is defined.
>
> > + * };
> > + */
> > +
> > +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> > +{
> > + return ((sizeof(struct vring_packed_desc) * num + align - 1)
> > + & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> > +}
> > +
>
> Cant say this API makes sense for me.
Hmm, do you have any suggestion? Also move it out of this header?
Thanks for the review! :)
Best regards,
Tiwei Bie
>
>
> > #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> > --
> > 2.11.0
^ permalink raw reply
* Re: [PATCH] virtio_balloon: add array of stat names
From: Jonathan Helman @ 2018-04-13 23:43 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, virtualization
In-Reply-To: <20180414010537-mutt-send-email-mst@kernel.org>
On 04/13/2018 03:07 PM, Michael S. Tsirkin wrote:
> On Fri, Apr 13, 2018 at 11:53:31AM -0700, Jonathan Helman wrote:
>>
>>
>> On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
>>> Jason Wang points out that it's vary hard for users to build an array of
>>
>> s/vary/very
>>
>>> stat names. The naive thing is to use VIRTIO_BALLOON_S_NR but that
>>> breaks if we add more stats.
>>>
>>> Let's add an array of reasonably readable names.
>>
>> Thanks for doing this, this is goodness. Clients of this interface will now
>> only require a modification to their internal copy of the header file (if
>> they have one), rather than updating their string array as well.
>>
>>>
>>> Fixes: 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
>>
>> I think this is a tad bit confusing since the only way you'd know why this
>> patch "fixes" 6c64fe7f2 is by reading the LKML archives and finding Jason's
>> comment.
>
> So
>
> ... add more stats as recently by commit 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
>
>
Sure. Thanks.
Jon
>
>
>>> Cc: Jason Wang <jasowang@redhat.com>
>>> Cc: Jonathan Helman <jonathan.helman@oracle.com>, > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>
>> Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
>>
>>> ---
>>> include/uapi/linux/virtio_balloon.h | 15 +++++++++++++++
>>> 1 file changed, 15 insertions(+)
>>>
>>> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
>>> index 9e02137..1477c17 100644
>>> --- a/include/uapi/linux/virtio_balloon.h
>>> +++ b/include/uapi/linux/virtio_balloon.h
>>> @@ -64,6 +64,21 @@ struct virtio_balloon_config {
>>> #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
>>> #define VIRTIO_BALLOON_S_NR 10
>>> +#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "swap-out", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "major-faults", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "minor-faults", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "free-memory", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "total-memory", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
>>> + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ > +}
>>> +
>>> +#define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
>>> +
>>> /*
>>> * Memory statistics structure.
>>> * Driver fills an array of these structures and passes to device.
>>>
^ permalink raw reply
* Re: [PATCH] virtio_balloon: add array of stat names
From: Michael S. Tsirkin @ 2018-04-13 22:07 UTC (permalink / raw)
To: Jonathan Helman; +Cc: linux-kernel, virtualization
In-Reply-To: <8ac55da5-3c2b-a546-7597-ec6c82ff9590@oracle.com>
On Fri, Apr 13, 2018 at 11:53:31AM -0700, Jonathan Helman wrote:
>
>
> On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
> > Jason Wang points out that it's vary hard for users to build an array of
>
> s/vary/very
>
> > stat names. The naive thing is to use VIRTIO_BALLOON_S_NR but that
> > breaks if we add more stats.
> >
> > Let's add an array of reasonably readable names.
>
> Thanks for doing this, this is goodness. Clients of this interface will now
> only require a modification to their internal copy of the header file (if
> they have one), rather than updating their string array as well.
>
> >
> > Fixes: 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
>
> I think this is a tad bit confusing since the only way you'd know why this
> patch "fixes" 6c64fe7f2 is by reading the LKML archives and finding Jason's
> comment.
So
... add more stats as recently by commit 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
> > Cc: Jason Wang <jasowang@redhat.com>
> > Cc: Jonathan Helman <jonathan.helman@oracle.com>, > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
>
> > ---
> > include/uapi/linux/virtio_balloon.h | 15 +++++++++++++++
> > 1 file changed, 15 insertions(+)
> >
> > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > index 9e02137..1477c17 100644
> > --- a/include/uapi/linux/virtio_balloon.h
> > +++ b/include/uapi/linux/virtio_balloon.h
> > @@ -64,6 +64,21 @@ struct virtio_balloon_config {
> > #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
> > #define VIRTIO_BALLOON_S_NR 10
> > +#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
> > + VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "swap-out", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "major-faults", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "minor-faults", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "free-memory", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "total-memory", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
> > + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ > +}
> > +
> > +#define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
> > +
> > /*
> > * Memory statistics structure.
> > * Driver fills an array of these structures and passes to device.
> >
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Jonathan Helman @ 2018-04-13 22:03 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Mihai Carabas, virtio-dev, linux-kernel, virtualization
In-Reply-To: <20180414005108-mutt-send-email-mst@kernel.org>
On 04/13/2018 02:51 PM, Michael S. Tsirkin wrote:
> On Fri, Apr 13, 2018 at 10:10:57AM -0700, Jonathan Helman wrote:
>>
>>
>> On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
>>> On Fri, Apr 13, 2018 at 03:01:11PM +0800, Jason Wang wrote:
>>>>
>>>>
>>>> On 2018年04月12日 08:24, Jonathan Helman wrote:
>>>>>
>>>>>
>>>>> On 04/10/2018 08:12 PM, Jason Wang wrote:
>>>>>>
>>>>>>
>>>>>> On 2018年04月10日 05:11, Jonathan Helman wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 03/22/2018 07:38 PM, Jason Wang wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
>>>>>>>>> On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
>>>>>>>>>> On 2018年03月20日 12:26, Jonathan Helman wrote:
>>>>>>>>>>>> On Mar 19, 2018, at 7:31 PM, Jason
>>>>>>>>>>>> Wang<jasowang@redhat.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 2018年03月20日 06:14, Jonathan Helman wrote:
>>>>>>>>>>>>> Export the number of successful and failed hugetlb page
>>>>>>>>>>>>> allocations via the virtio balloon driver. These 2 counts
>>>>>>>>>>>>> come directly from the vm_events HTLB_BUDDY_PGALLOC and
>>>>>>>>>>>>> HTLB_BUDDY_PGALLOC_FAIL.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
>>>>>>>>>>>> Reviewed-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>>>> Thanks.
>>>>>>>>>>>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>> drivers/virtio/virtio_balloon.c | 6 ++++++
>>>>>>>>>>>>> include/uapi/linux/virtio_balloon.h | 4 +++-
>>>>>>>>>>>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>> a/drivers/virtio/virtio_balloon.c
>>>>>>>>>>>>> b/drivers/virtio/virtio_balloon.c
>>>>>>>>>>>>> index dfe5684..6b237e3 100644
>>>>>>>>>>>>> --- a/drivers/virtio/virtio_balloon.c
>>>>>>>>>>>>> +++ b/drivers/virtio/virtio_balloon.c
>>>>>>>>>>>>> @@ -272,6 +272,12 @@ static unsigned int
>>>>>>>>>>>>> update_balloon_stats(struct
>>>>>>>>>>>>> virtio_balloon *vb)
>>>>>>>>>>>>> pages_to_bytes(events[PSWPOUT]));
>>>>>>>>>>>>> update_stat(vb, idx++,
>>>>>>>>>>>>> VIRTIO_BALLOON_S_MAJFLT,
>>>>>>>>>>>>> events[PGMAJFAULT]);
>>>>>>>>>>>>> update_stat(vb, idx++,
>>>>>>>>>>>>> VIRTIO_BALLOON_S_MINFLT,
>>>>>>>>>>>>> events[PGFAULT]);
>>>>>>>>>>>>> +#ifdef CONFIG_HUGETLB_PAGE
>>>>>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
>>>>>>>>>>>>> + events[HTLB_BUDDY_PGALLOC]);
>>>>>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
>>>>>>>>>>>>> + events[HTLB_BUDDY_PGALLOC_FAIL]);
>>>>>>>>>>>>> +#endif
>>>>>>>>>>>>> #endif
>>>>>>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
>>>>>>>>>>>>> pages_to_bytes(i.freeram));
>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>> a/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>>>> b/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>>>> index 4e8b830..40297a3 100644
>>>>>>>>>>>>> --- a/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>>>> +++ b/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>>>> @@ -53,7 +53,9 @@ struct virtio_balloon_config {
>>>>>>>>>>>>> #define VIRTIO_BALLOON_S_MEMTOT 5
>>>>>>>>>>>>> /* Total amount of memory */
>>>>>>>>>>>>> #define VIRTIO_BALLOON_S_AVAIL 6
>>>>>>>>>>>>> /* Available memory as in /proc */
>>>>>>>>>>>>> #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
>>>>>>>>>>>>> -#define VIRTIO_BALLOON_S_NR 8
>>>>>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGALLOC
>>>>>>>>>>>>> 8 /* Hugetlb page allocations */
>>>>>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGFAIL
>>>>>>>>>>>>> 9 /* Hugetlb page allocation failures
>>>>>>>>>>>>> */
>>>>>>>>>>>>> +#define VIRTIO_BALLOON_S_NR 10
>>>>>>>>>>>>> /*
>>>>>>>>>>>>> * Memory statistics structure.
>>>>>>>>>>>> Not for this patch, but it looks to me that
>>>>>>>>>>>> exporting such nr through uapi is fragile.
>>>>>>>>>>> Sorry, can you explain what you mean here?
>>>>>>>>>>>
>>>>>>>>>>> Jon
>>>>>>>>>> Spec said "Within an output buffer submitted to the
>>>>>>>>>> statsq, the device MUST
>>>>>>>>>> ignore entries with tag values that it does not
>>>>>>>>>> recognize". So exporting
>>>>>>>>>> VIRTIO_BALLOON_S_NR seems useless and device
>>>>>>>>>> implementation can not depend
>>>>>>>>>> on such number in uapi.
>>>>>>>>>>
>>>>>>>>>> Thanks
>>>>>>>>> Suggestions? I don't like to break build for people ...
>>>>>>>>>
>>>>>>>>
>>>>>>>> Didn't have a good idea. But maybe we should keep
>>>>>>>> VIRTIO_BALLOON_S_NR unchanged, and add a comment here.
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>
>>>>>>> I think Jason's comment is for a future patch. Didn't see this
>>>>>>> patch get applied, so wondering if it could be.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Jon
>>>>>>
>>>>>> Hi Jon:
>>>>>>
>>>>>> Have you tested new driver with old qemu?
>>>>>
>>>>> Yes, this testing scenario looks good. Thanks.
>>>>>
>>>>> Jon
>>>>
>>>> Hi Jon:
>>>>
>>>> I mean e.g compiling qemu with new linux headers. E.g current qemu has:
>>>>
>>>> static const char *balloon_stat_names[] = {
>>>> [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
>>>> [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
>>>> [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
>>>> [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
>>>> [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
>>>> [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
>>>> [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
>>>> [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
>>>> [VIRTIO_BALLOON_S_NR] = NULL
>>>> };
>>>>
>>>> I'm afraid it will be broken if VIRTIO_BALLOON_S_NR is 10.
>>>>
>>>> Thanks
>>>
>>> Well it is handy for sizing arrays and this isn't the first time we did
>>> this:
>>>
>>> commit 4d32029b8ddb7be4d1699c6d8e1675ff5476d149
>>> Author: Tomáš Golembiovský <tgolembi@redhat.com>
>>> Date: Sun Nov 12 13:05:38 2017 +0100
>>>
>>> virtio_balloon: include disk/file caches memory statistics
>>>
>>> commit 5057dcd0f1aaad57e07e728ba20a99e205c6b9de
>>> Author: Igor Redko <redkoi@virtuozzo.com>
>>> Date: Thu Mar 17 14:19:08 2016 -0700
>>>
>>> virtio_balloon: export 'available' memory to balloon statistics
>>>
>>> how about we give QEMU a hand and just put the list of names in
>>> the header?
>>>
>>> I posted a patch like that, pls review.
>>>
>>
>> Sorry, maybe I'm missing something. We have an internal copy of the header
>> file in qemu under include/standard-headers/linux/virtio_balloon.h, which
>> hw/virtio/virtio-balloon.c includes (through including
>> hw/virtio/virtio-balloon.h). I thought qemu would use internal header files
>> so that it could be compiled on any Linux kernel version. So when would we
>> ever have the issue Jason is discussing?
>>
>> Thanks,
>> Jon
>
> Whenever we update the headers.
>
Ah got it. Updating the headers without updating that string array
breaks things. Thanks.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Michael S. Tsirkin @ 2018-04-13 21:51 UTC (permalink / raw)
To: Jonathan Helman; +Cc: Mihai Carabas, virtio-dev, linux-kernel, virtualization
In-Reply-To: <38e16983-4551-a621-b399-0de84d58bf15@oracle.com>
On Fri, Apr 13, 2018 at 10:10:57AM -0700, Jonathan Helman wrote:
>
>
> On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
> > On Fri, Apr 13, 2018 at 03:01:11PM +0800, Jason Wang wrote:
> > >
> > >
> > > On 2018年04月12日 08:24, Jonathan Helman wrote:
> > > >
> > > >
> > > > On 04/10/2018 08:12 PM, Jason Wang wrote:
> > > > >
> > > > >
> > > > > On 2018年04月10日 05:11, Jonathan Helman wrote:
> > > > > >
> > > > > >
> > > > > > On 03/22/2018 07:38 PM, Jason Wang wrote:
> > > > > > >
> > > > > > >
> > > > > > > On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
> > > > > > > > On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
> > > > > > > > > On 2018年03月20日 12:26, Jonathan Helman wrote:
> > > > > > > > > > > On Mar 19, 2018, at 7:31 PM, Jason
> > > > > > > > > > > Wang<jasowang@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > On 2018年03月20日 06:14, Jonathan Helman wrote:
> > > > > > > > > > > > Export the number of successful and failed hugetlb page
> > > > > > > > > > > > allocations via the virtio balloon driver. These 2 counts
> > > > > > > > > > > > come directly from the vm_events HTLB_BUDDY_PGALLOC and
> > > > > > > > > > > > HTLB_BUDDY_PGALLOC_FAIL.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
> > > > > > > > > > > Reviewed-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > > drivers/virtio/virtio_balloon.c | 6 ++++++
> > > > > > > > > > > > include/uapi/linux/virtio_balloon.h | 4 +++-
> > > > > > > > > > > > 2 files changed, 9 insertions(+), 1 deletion(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git
> > > > > > > > > > > > a/drivers/virtio/virtio_balloon.c
> > > > > > > > > > > > b/drivers/virtio/virtio_balloon.c
> > > > > > > > > > > > index dfe5684..6b237e3 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio_balloon.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio_balloon.c
> > > > > > > > > > > > @@ -272,6 +272,12 @@ static unsigned int
> > > > > > > > > > > > update_balloon_stats(struct
> > > > > > > > > > > > virtio_balloon *vb)
> > > > > > > > > > > > pages_to_bytes(events[PSWPOUT]));
> > > > > > > > > > > > update_stat(vb, idx++,
> > > > > > > > > > > > VIRTIO_BALLOON_S_MAJFLT,
> > > > > > > > > > > > events[PGMAJFAULT]);
> > > > > > > > > > > > update_stat(vb, idx++,
> > > > > > > > > > > > VIRTIO_BALLOON_S_MINFLT,
> > > > > > > > > > > > events[PGFAULT]);
> > > > > > > > > > > > +#ifdef CONFIG_HUGETLB_PAGE
> > > > > > > > > > > > + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
> > > > > > > > > > > > + events[HTLB_BUDDY_PGALLOC]);
> > > > > > > > > > > > + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
> > > > > > > > > > > > + events[HTLB_BUDDY_PGALLOC_FAIL]);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > > #endif
> > > > > > > > > > > > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
> > > > > > > > > > > > pages_to_bytes(i.freeram));
> > > > > > > > > > > > diff --git
> > > > > > > > > > > > a/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > > > b/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > > > index 4e8b830..40297a3 100644
> > > > > > > > > > > > --- a/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > > > +++ b/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > > > @@ -53,7 +53,9 @@ struct virtio_balloon_config {
> > > > > > > > > > > > #define VIRTIO_BALLOON_S_MEMTOT 5
> > > > > > > > > > > > /* Total amount of memory */
> > > > > > > > > > > > #define VIRTIO_BALLOON_S_AVAIL 6
> > > > > > > > > > > > /* Available memory as in /proc */
> > > > > > > > > > > > #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
> > > > > > > > > > > > -#define VIRTIO_BALLOON_S_NR 8
> > > > > > > > > > > > +#define VIRTIO_BALLOON_S_HTLB_PGALLOC
> > > > > > > > > > > > 8 /* Hugetlb page allocations */
> > > > > > > > > > > > +#define VIRTIO_BALLOON_S_HTLB_PGFAIL
> > > > > > > > > > > > 9 /* Hugetlb page allocation failures
> > > > > > > > > > > > */
> > > > > > > > > > > > +#define VIRTIO_BALLOON_S_NR 10
> > > > > > > > > > > > /*
> > > > > > > > > > > > * Memory statistics structure.
> > > > > > > > > > > Not for this patch, but it looks to me that
> > > > > > > > > > > exporting such nr through uapi is fragile.
> > > > > > > > > > Sorry, can you explain what you mean here?
> > > > > > > > > >
> > > > > > > > > > Jon
> > > > > > > > > Spec said "Within an output buffer submitted to the
> > > > > > > > > statsq, the device MUST
> > > > > > > > > ignore entries with tag values that it does not
> > > > > > > > > recognize". So exporting
> > > > > > > > > VIRTIO_BALLOON_S_NR seems useless and device
> > > > > > > > > implementation can not depend
> > > > > > > > > on such number in uapi.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > Suggestions? I don't like to break build for people ...
> > > > > > > >
> > > > > > >
> > > > > > > Didn't have a good idea. But maybe we should keep
> > > > > > > VIRTIO_BALLOON_S_NR unchanged, and add a comment here.
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > I think Jason's comment is for a future patch. Didn't see this
> > > > > > patch get applied, so wondering if it could be.
> > > > > >
> > > > > > Thanks,
> > > > > > Jon
> > > > >
> > > > > Hi Jon:
> > > > >
> > > > > Have you tested new driver with old qemu?
> > > >
> > > > Yes, this testing scenario looks good. Thanks.
> > > >
> > > > Jon
> > >
> > > Hi Jon:
> > >
> > > I mean e.g compiling qemu with new linux headers. E.g current qemu has:
> > >
> > > static const char *balloon_stat_names[] = {
> > > [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
> > > [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
> > > [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
> > > [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
> > > [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
> > > [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
> > > [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
> > > [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
> > > [VIRTIO_BALLOON_S_NR] = NULL
> > > };
> > >
> > > I'm afraid it will be broken if VIRTIO_BALLOON_S_NR is 10.
> > >
> > > Thanks
> >
> > Well it is handy for sizing arrays and this isn't the first time we did
> > this:
> >
> > commit 4d32029b8ddb7be4d1699c6d8e1675ff5476d149
> > Author: Tomáš Golembiovský <tgolembi@redhat.com>
> > Date: Sun Nov 12 13:05:38 2017 +0100
> >
> > virtio_balloon: include disk/file caches memory statistics
> >
> > commit 5057dcd0f1aaad57e07e728ba20a99e205c6b9de
> > Author: Igor Redko <redkoi@virtuozzo.com>
> > Date: Thu Mar 17 14:19:08 2016 -0700
> >
> > virtio_balloon: export 'available' memory to balloon statistics
> >
> > how about we give QEMU a hand and just put the list of names in
> > the header?
> >
> > I posted a patch like that, pls review.
> >
>
> Sorry, maybe I'm missing something. We have an internal copy of the header
> file in qemu under include/standard-headers/linux/virtio_balloon.h, which
> hw/virtio/virtio-balloon.c includes (through including
> hw/virtio/virtio-balloon.h). I thought qemu would use internal header files
> so that it could be compiled on any Linux kernel version. So when would we
> ever have the issue Jason is discussing?
>
> Thanks,
> Jon
Whenever we update the headers.
--
MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH] virtio_balloon: add array of stat names
From: Jonathan Helman @ 2018-04-13 18:53 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel; +Cc: virtualization
In-Reply-To: <1523627031-370966-1-git-send-email-mst@redhat.com>
On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
> Jason Wang points out that it's vary hard for users to build an array of
s/vary/very
> stat names. The naive thing is to use VIRTIO_BALLOON_S_NR but that
> breaks if we add more stats.
>
> Let's add an array of reasonably readable names.
Thanks for doing this, this is goodness. Clients of this interface will
now only require a modification to their internal copy of the header
file (if they have one), rather than updating their string array as well.
>
> Fixes: 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
I think this is a tad bit confusing since the only way you'd know why
this patch "fixes" 6c64fe7f2 is by reading the LKML archives and finding
Jason's comment.
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Jonathan Helman <jonathan.helman@oracle.com>, > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
> ---
> include/uapi/linux/virtio_balloon.h | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 9e02137..1477c17 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -64,6 +64,21 @@ struct virtio_balloon_config {
> #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
> #define VIRTIO_BALLOON_S_NR 10
>
> +#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
> + VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
> + VIRTIO_BALLOON_S_NAMES_prefix "swap-out", \
> + VIRTIO_BALLOON_S_NAMES_prefix "major-faults", \
> + VIRTIO_BALLOON_S_NAMES_prefix "minor-faults", \
> + VIRTIO_BALLOON_S_NAMES_prefix "free-memory", \
> + VIRTIO_BALLOON_S_NAMES_prefix "total-memory", \
> + VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \
> + VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
> + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
> + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ > +}
> +
> +#define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
> +
> /*
> * Memory statistics structure.
> * Driver fills an array of these structures and passes to device.
>
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Jonathan Helman @ 2018-04-13 17:10 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang
Cc: Mihai Carabas, virtio-dev, linux-kernel, virtualization
In-Reply-To: <20180413155836-mutt-send-email-mst@kernel.org>
On 04/13/2018 06:44 AM, Michael S. Tsirkin wrote:
> On Fri, Apr 13, 2018 at 03:01:11PM +0800, Jason Wang wrote:
>>
>>
>> On 2018年04月12日 08:24, Jonathan Helman wrote:
>>>
>>>
>>> On 04/10/2018 08:12 PM, Jason Wang wrote:
>>>>
>>>>
>>>> On 2018年04月10日 05:11, Jonathan Helman wrote:
>>>>>
>>>>>
>>>>> On 03/22/2018 07:38 PM, Jason Wang wrote:
>>>>>>
>>>>>>
>>>>>> On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
>>>>>>>> On 2018年03月20日 12:26, Jonathan Helman wrote:
>>>>>>>>>> On Mar 19, 2018, at 7:31 PM, Jason
>>>>>>>>>> Wang<jasowang@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 2018年03月20日 06:14, Jonathan Helman wrote:
>>>>>>>>>>> Export the number of successful and failed hugetlb page
>>>>>>>>>>> allocations via the virtio balloon driver. These 2 counts
>>>>>>>>>>> come directly from the vm_events HTLB_BUDDY_PGALLOC and
>>>>>>>>>>> HTLB_BUDDY_PGALLOC_FAIL.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
>>>>>>>>>> Reviewed-by: Jason Wang<jasowang@redhat.com>
>>>>>>>>> Thanks.
>>>>>>>>>
>>>>>>>>>>> ---
>>>>>>>>>>> drivers/virtio/virtio_balloon.c | 6 ++++++
>>>>>>>>>>> include/uapi/linux/virtio_balloon.h | 4 +++-
>>>>>>>>>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git
>>>>>>>>>>> a/drivers/virtio/virtio_balloon.c
>>>>>>>>>>> b/drivers/virtio/virtio_balloon.c
>>>>>>>>>>> index dfe5684..6b237e3 100644
>>>>>>>>>>> --- a/drivers/virtio/virtio_balloon.c
>>>>>>>>>>> +++ b/drivers/virtio/virtio_balloon.c
>>>>>>>>>>> @@ -272,6 +272,12 @@ static unsigned int
>>>>>>>>>>> update_balloon_stats(struct
>>>>>>>>>>> virtio_balloon *vb)
>>>>>>>>>>> pages_to_bytes(events[PSWPOUT]));
>>>>>>>>>>> update_stat(vb, idx++,
>>>>>>>>>>> VIRTIO_BALLOON_S_MAJFLT,
>>>>>>>>>>> events[PGMAJFAULT]);
>>>>>>>>>>> update_stat(vb, idx++,
>>>>>>>>>>> VIRTIO_BALLOON_S_MINFLT,
>>>>>>>>>>> events[PGFAULT]);
>>>>>>>>>>> +#ifdef CONFIG_HUGETLB_PAGE
>>>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
>>>>>>>>>>> + events[HTLB_BUDDY_PGALLOC]);
>>>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
>>>>>>>>>>> + events[HTLB_BUDDY_PGALLOC_FAIL]);
>>>>>>>>>>> +#endif
>>>>>>>>>>> #endif
>>>>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
>>>>>>>>>>> pages_to_bytes(i.freeram));
>>>>>>>>>>> diff --git
>>>>>>>>>>> a/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>> b/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>> index 4e8b830..40297a3 100644
>>>>>>>>>>> --- a/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>> +++ b/include/uapi/linux/virtio_balloon.h
>>>>>>>>>>> @@ -53,7 +53,9 @@ struct virtio_balloon_config {
>>>>>>>>>>> #define VIRTIO_BALLOON_S_MEMTOT 5
>>>>>>>>>>> /* Total amount of memory */
>>>>>>>>>>> #define VIRTIO_BALLOON_S_AVAIL 6
>>>>>>>>>>> /* Available memory as in /proc */
>>>>>>>>>>> #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
>>>>>>>>>>> -#define VIRTIO_BALLOON_S_NR 8
>>>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGALLOC
>>>>>>>>>>> 8 /* Hugetlb page allocations */
>>>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGFAIL
>>>>>>>>>>> 9 /* Hugetlb page allocation failures
>>>>>>>>>>> */
>>>>>>>>>>> +#define VIRTIO_BALLOON_S_NR 10
>>>>>>>>>>> /*
>>>>>>>>>>> * Memory statistics structure.
>>>>>>>>>> Not for this patch, but it looks to me that
>>>>>>>>>> exporting such nr through uapi is fragile.
>>>>>>>>> Sorry, can you explain what you mean here?
>>>>>>>>>
>>>>>>>>> Jon
>>>>>>>> Spec said "Within an output buffer submitted to the
>>>>>>>> statsq, the device MUST
>>>>>>>> ignore entries with tag values that it does not
>>>>>>>> recognize". So exporting
>>>>>>>> VIRTIO_BALLOON_S_NR seems useless and device
>>>>>>>> implementation can not depend
>>>>>>>> on such number in uapi.
>>>>>>>>
>>>>>>>> Thanks
>>>>>>> Suggestions? I don't like to break build for people ...
>>>>>>>
>>>>>>
>>>>>> Didn't have a good idea. But maybe we should keep
>>>>>> VIRTIO_BALLOON_S_NR unchanged, and add a comment here.
>>>>>>
>>>>>> Thanks
>>>>>
>>>>> I think Jason's comment is for a future patch. Didn't see this
>>>>> patch get applied, so wondering if it could be.
>>>>>
>>>>> Thanks,
>>>>> Jon
>>>>
>>>> Hi Jon:
>>>>
>>>> Have you tested new driver with old qemu?
>>>
>>> Yes, this testing scenario looks good. Thanks.
>>>
>>> Jon
>>
>> Hi Jon:
>>
>> I mean e.g compiling qemu with new linux headers. E.g current qemu has:
>>
>> static const char *balloon_stat_names[] = {
>> [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
>> [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
>> [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
>> [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
>> [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
>> [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
>> [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
>> [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
>> [VIRTIO_BALLOON_S_NR] = NULL
>> };
>>
>> I'm afraid it will be broken if VIRTIO_BALLOON_S_NR is 10.
>>
>> Thanks
>
> Well it is handy for sizing arrays and this isn't the first time we did
> this:
>
> commit 4d32029b8ddb7be4d1699c6d8e1675ff5476d149
> Author: Tomáš Golembiovský <tgolembi@redhat.com>
> Date: Sun Nov 12 13:05:38 2017 +0100
>
> virtio_balloon: include disk/file caches memory statistics
>
>
>
> commit 5057dcd0f1aaad57e07e728ba20a99e205c6b9de
> Author: Igor Redko <redkoi@virtuozzo.com>
> Date: Thu Mar 17 14:19:08 2016 -0700
>
> virtio_balloon: export 'available' memory to balloon statistics
>
>
> how about we give QEMU a hand and just put the list of names in
> the header?
>
> I posted a patch like that, pls review.
>
Sorry, maybe I'm missing something. We have an internal copy of the
header file in qemu under
include/standard-headers/linux/virtio_balloon.h, which
hw/virtio/virtio-balloon.c includes (through including
hw/virtio/virtio-balloon.h). I thought qemu would use internal header
files so that it could be compiled on any Linux kernel version. So when
would we ever have the issue Jason is discussing?
Thanks,
Jon
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net] virtio-net: add missing virtqueue kick when flushing packets
From: David Miller @ 2018-04-13 15:27 UTC (permalink / raw)
To: jasowang; +Cc: daniel, mst, netdev, linux-kernel, virtualization, ktaka
In-Reply-To: <1523602705-5155-1-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 13 Apr 2018 14:58:25 +0800
> We tends to batch submitting packets during XDP_TX. This requires to
> kick virtqueue after a batch, we tried to do it through
> xdp_do_flush_map() which only makes sense for devmap not XDP_TX. So
> explicitly kick the virtqueue in this case.
>
> Reported-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
> Tested-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Applied and queued up for -stable, thanks Jason.
^ permalink raw reply
* Re: [RFC v2] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-04-13 15:22 UTC (permalink / raw)
To: Tiwei Bie; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180401141216.8969-1-tiwei.bie@intel.com>
On Sun, Apr 01, 2018 at 10:12:16PM +0800, Tiwei Bie wrote:
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> + return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
> +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> + void **ctx)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + void *ret;
> + unsigned int i;
> + u16 last_used;
> +
> + START_USE(vq);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return NULL;
> + }
> +
> + if (!more_used(vq)) {
> + pr_debug("No more buffers in queue\n");
> + END_USE(vq);
> + return NULL;
> + }
So virtqueue_get_buf_ctx_split should only call more_used_split.
to avoid such issues I think we should lay out the code like this:
XXX_split
XXX_packed
XXX wrappers
> +/* The standard layout
I'd drop standard here.
> for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed
> + * {
Can the opening bracket go on the prev line pls?
> + * // The actual descriptors (16 bytes each)
> + * struct vring_packed_desc desc[num];
> + *
> + * // Padding to the next align boundary.
> + * char pad[];
> + *
> + * // Driver Event Suppression
> + * struct vring_packed_desc_event driver;
> + *
> + * // Device Event Suppression
> + * struct vring_packed_desc_event device;
Maybe that's how our driver does it but it's not based on spec
so I don't think this belongs in the header.
> + * };
> + */
> +
> +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> +{
> + return ((sizeof(struct vring_packed_desc) * num + align - 1)
> + & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +
Cant say this API makes sense for me.
> #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> --
> 2.11.0
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Michael S. Tsirkin @ 2018-04-13 13:44 UTC (permalink / raw)
To: Jason Wang; +Cc: Mihai Carabas, virtio-dev, virtualization, linux-kernel
In-Reply-To: <9b13b742-3970-e322-5a2f-dcfc75c540dc@redhat.com>
On Fri, Apr 13, 2018 at 03:01:11PM +0800, Jason Wang wrote:
>
>
> On 2018年04月12日 08:24, Jonathan Helman wrote:
> >
> >
> > On 04/10/2018 08:12 PM, Jason Wang wrote:
> > >
> > >
> > > On 2018年04月10日 05:11, Jonathan Helman wrote:
> > > >
> > > >
> > > > On 03/22/2018 07:38 PM, Jason Wang wrote:
> > > > >
> > > > >
> > > > > On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
> > > > > > On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
> > > > > > > On 2018年03月20日 12:26, Jonathan Helman wrote:
> > > > > > > > > On Mar 19, 2018, at 7:31 PM, Jason
> > > > > > > > > Wang<jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > On 2018年03月20日 06:14, Jonathan Helman wrote:
> > > > > > > > > > Export the number of successful and failed hugetlb page
> > > > > > > > > > allocations via the virtio balloon driver. These 2 counts
> > > > > > > > > > come directly from the vm_events HTLB_BUDDY_PGALLOC and
> > > > > > > > > > HTLB_BUDDY_PGALLOC_FAIL.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
> > > > > > > > > Reviewed-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > drivers/virtio/virtio_balloon.c | 6 ++++++
> > > > > > > > > > include/uapi/linux/virtio_balloon.h | 4 +++-
> > > > > > > > > > 2 files changed, 9 insertions(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git
> > > > > > > > > > a/drivers/virtio/virtio_balloon.c
> > > > > > > > > > b/drivers/virtio/virtio_balloon.c
> > > > > > > > > > index dfe5684..6b237e3 100644
> > > > > > > > > > --- a/drivers/virtio/virtio_balloon.c
> > > > > > > > > > +++ b/drivers/virtio/virtio_balloon.c
> > > > > > > > > > @@ -272,6 +272,12 @@ static unsigned int
> > > > > > > > > > update_balloon_stats(struct
> > > > > > > > > > virtio_balloon *vb)
> > > > > > > > > > pages_to_bytes(events[PSWPOUT]));
> > > > > > > > > > update_stat(vb, idx++,
> > > > > > > > > > VIRTIO_BALLOON_S_MAJFLT,
> > > > > > > > > > events[PGMAJFAULT]);
> > > > > > > > > > update_stat(vb, idx++,
> > > > > > > > > > VIRTIO_BALLOON_S_MINFLT,
> > > > > > > > > > events[PGFAULT]);
> > > > > > > > > > +#ifdef CONFIG_HUGETLB_PAGE
> > > > > > > > > > + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
> > > > > > > > > > + events[HTLB_BUDDY_PGALLOC]);
> > > > > > > > > > + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
> > > > > > > > > > + events[HTLB_BUDDY_PGALLOC_FAIL]);
> > > > > > > > > > +#endif
> > > > > > > > > > #endif
> > > > > > > > > > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
> > > > > > > > > > pages_to_bytes(i.freeram));
> > > > > > > > > > diff --git
> > > > > > > > > > a/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > b/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > index 4e8b830..40297a3 100644
> > > > > > > > > > --- a/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > +++ b/include/uapi/linux/virtio_balloon.h
> > > > > > > > > > @@ -53,7 +53,9 @@ struct virtio_balloon_config {
> > > > > > > > > > #define VIRTIO_BALLOON_S_MEMTOT 5
> > > > > > > > > > /* Total amount of memory */
> > > > > > > > > > #define VIRTIO_BALLOON_S_AVAIL 6
> > > > > > > > > > /* Available memory as in /proc */
> > > > > > > > > > #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
> > > > > > > > > > -#define VIRTIO_BALLOON_S_NR 8
> > > > > > > > > > +#define VIRTIO_BALLOON_S_HTLB_PGALLOC
> > > > > > > > > > 8 /* Hugetlb page allocations */
> > > > > > > > > > +#define VIRTIO_BALLOON_S_HTLB_PGFAIL
> > > > > > > > > > 9 /* Hugetlb page allocation failures
> > > > > > > > > > */
> > > > > > > > > > +#define VIRTIO_BALLOON_S_NR 10
> > > > > > > > > > /*
> > > > > > > > > > * Memory statistics structure.
> > > > > > > > > Not for this patch, but it looks to me that
> > > > > > > > > exporting such nr through uapi is fragile.
> > > > > > > > Sorry, can you explain what you mean here?
> > > > > > > >
> > > > > > > > Jon
> > > > > > > Spec said "Within an output buffer submitted to the
> > > > > > > statsq, the device MUST
> > > > > > > ignore entries with tag values that it does not
> > > > > > > recognize". So exporting
> > > > > > > VIRTIO_BALLOON_S_NR seems useless and device
> > > > > > > implementation can not depend
> > > > > > > on such number in uapi.
> > > > > > >
> > > > > > > Thanks
> > > > > > Suggestions? I don't like to break build for people ...
> > > > > >
> > > > >
> > > > > Didn't have a good idea. But maybe we should keep
> > > > > VIRTIO_BALLOON_S_NR unchanged, and add a comment here.
> > > > >
> > > > > Thanks
> > > >
> > > > I think Jason's comment is for a future patch. Didn't see this
> > > > patch get applied, so wondering if it could be.
> > > >
> > > > Thanks,
> > > > Jon
> > >
> > > Hi Jon:
> > >
> > > Have you tested new driver with old qemu?
> >
> > Yes, this testing scenario looks good. Thanks.
> >
> > Jon
>
> Hi Jon:
>
> I mean e.g compiling qemu with new linux headers. E.g current qemu has:
>
> static const char *balloon_stat_names[] = {
> [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
> [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
> [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
> [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
> [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
> [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
> [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
> [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
> [VIRTIO_BALLOON_S_NR] = NULL
> };
>
> I'm afraid it will be broken if VIRTIO_BALLOON_S_NR is 10.
>
> Thanks
Well it is handy for sizing arrays and this isn't the first time we did
this:
commit 4d32029b8ddb7be4d1699c6d8e1675ff5476d149
Author: Tomáš Golembiovský <tgolembi@redhat.com>
Date: Sun Nov 12 13:05:38 2017 +0100
virtio_balloon: include disk/file caches memory statistics
commit 5057dcd0f1aaad57e07e728ba20a99e205c6b9de
Author: Igor Redko <redkoi@virtuozzo.com>
Date: Thu Mar 17 14:19:08 2016 -0700
virtio_balloon: export 'available' memory to balloon statistics
how about we give QEMU a hand and just put the list of names in
the header?
I posted a patch like that, pls review.
--
MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH] virtio_balloon: add array of stat names
From: Michael S. Tsirkin @ 2018-04-13 13:44 UTC (permalink / raw)
To: linux-kernel; +Cc: virtualization
Jason Wang points out that it's vary hard for users to build an array of
stat names. The naive thing is to use VIRTIO_BALLOON_S_NR but that
breaks if we add more stats.
Let's add an array of reasonably readable names.
Fixes: 6c64fe7f2 ("virtio_balloon: export hugetlb page allocation counts")
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Helman <jonathan.helman@oracle.com>,
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
include/uapi/linux/virtio_balloon.h | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 9e02137..1477c17 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -64,6 +64,21 @@ struct virtio_balloon_config {
#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
#define VIRTIO_BALLOON_S_NR 10
+#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
+ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
+ VIRTIO_BALLOON_S_NAMES_prefix "swap-out", \
+ VIRTIO_BALLOON_S_NAMES_prefix "major-faults", \
+ VIRTIO_BALLOON_S_NAMES_prefix "minor-faults", \
+ VIRTIO_BALLOON_S_NAMES_prefix "free-memory", \
+ VIRTIO_BALLOON_S_NAMES_prefix "total-memory", \
+ VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \
+ VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
+ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
+ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \
+}
+
+#define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
+
/*
* Memory statistics structure.
* Driver fills an array of these structures and passes to device.
--
MST
^ permalink raw reply related
* Re: [RFC v2] virtio: support packed ring
From: Tiwei Bie @ 2018-04-13 7:15 UTC (permalink / raw)
To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <ebfbfc3e-9e52-9331-61b6-258e131f822d@redhat.com>
On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> On 2018年04月01日 22:12, Tiwei Bie wrote:
> > Hello everyone,
> >
> > This RFC implements packed ring support for virtio driver.
> >
> > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > Minor changes are needed for the vhost code, e.g. to kick the guest.
> >
> > TODO:
> > - Refinements and bug fixes;
> > - Split into small patches;
> > - Test indirect descriptor support;
> > - Test/fix event suppression support;
> > - Test devices other than net;
> >
> > RFC v1 -> RFC v2:
> > - Add indirect descriptor support - compile test only;
> > - Add event suppression supprt - compile test only;
> > - Move vring_packed_init() out of uapi (Jason, MST);
> > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > - Avoid using '%' operator (Jason);
> > - Rename free_head -> next_avail_idx (Jason);
> > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > - Some other refinements and bug fixes;
> >
> > Thanks!
> >
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > ---
> > drivers/virtio/virtio_ring.c | 1094 +++++++++++++++++++++++++++++-------
> > include/linux/virtio_ring.h | 8 +-
> > include/uapi/linux/virtio_config.h | 12 +-
> > include/uapi/linux/virtio_ring.h | 61 ++
> > 4 files changed, 980 insertions(+), 195 deletions(-)
[...]
> > +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> > + unsigned int total_sg,
> > + gfp_t gfp)
> > +{
> > + struct vring_packed_desc *desc;
> > +
> > + /*
> > + * We require lowmem mappings for the descriptors because
> > + * otherwise virt_to_phys will give us bogus addresses in the
> > + * virtqueue.
> > + */
> > + gfp &= ~__GFP_HIGHMEM;
> > +
> > + desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
>
> Can we simply check vq->packed here to avoid duplicating helpers?
Then it would be something like this:
static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
void *data;
/*
* We require lowmem mappings for the descriptors because
* otherwise virt_to_phys will give us bogus addresses in the
* virtqueue.
*/
gfp &= ~__GFP_HIGHMEM;
if (vq->packed) {
data = kmalloc(total_sg * sizeof(struct vring_packed_desc),
gfp);
if (!data)
return NULL;
} else {
struct vring_desc *desc;
unsigned int i;
desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc)
return NULL;
for (i = 0; i < total_sg; i++)
desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
data = desc;
}
return data;
}
I would prefer to have two simpler helpers (and to the callers,
it's already very clear about which one they should call), i.e.
the current implementation:
static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
unsigned int total_sg,
gfp_t gfp)
{
struct vring_packed_desc *desc;
/*
* We require lowmem mappings for the descriptors because
* otherwise virt_to_phys will give us bogus addresses in the
* virtqueue.
*/
gfp &= ~__GFP_HIGHMEM;
desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
return desc;
}
static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
unsigned int total_sg,
gfp_t gfp)
{
struct vring_desc *desc;
unsigned int i;
/*
* We require lowmem mappings for the descriptors because
* otherwise virt_to_phys will give us bogus addresses in the
* virtqueue.
*/
gfp &= ~__GFP_HIGHMEM;
desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc)
return NULL;
for (i = 0; i < total_sg; i++)
desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
return desc;
}
>
> > +
> > + return desc;
> > +}
[...]
> > +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > + struct scatterlist *sgs[],
> > + unsigned int total_sg,
> > + unsigned int out_sgs,
> > + unsigned int in_sgs,
> > + void *data,
> > + void *ctx,
> > + gfp_t gfp)
> > +{
> > + struct vring_virtqueue *vq = to_vvq(_vq);
> > + struct vring_packed_desc *desc;
> > + struct scatterlist *sg;
> > + unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> > + __virtio16 uninitialized_var(head_flags), flags;
> > + int head, wrap_counter;
> > + bool indirect;
> > +
> > + START_USE(vq);
> > +
> > + BUG_ON(data == NULL);
> > + BUG_ON(ctx && vq->indirect);
> > +
> > + if (unlikely(vq->broken)) {
> > + END_USE(vq);
> > + return -EIO;
> > + }
> > +
> > +#ifdef DEBUG
> > + {
> > + ktime_t now = ktime_get();
> > +
> > + /* No kick or get, with .1 second between? Warn. */
> > + if (vq->last_add_time_valid)
> > + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> > + > 100);
> > + vq->last_add_time = now;
> > + vq->last_add_time_valid = true;
> > + }
> > +#endif
> > +
> > + BUG_ON(total_sg == 0);
> > +
> > + head = vq->next_avail_idx;
> > + wrap_counter = vq->wrap_counter;
> > +
> > + /* If the host supports indirect descriptor tables, and we have multiple
> > + * buffers, then go indirect. FIXME: tune this threshold */
> > + if (vq->indirect && total_sg > 1 && vq->vq.num_free)
>
> Let's introduce a helper like virtqueue_need_indirect() to avoid duplicating
> codes and FIXME.
Okay.
>
> > + desc = alloc_indirect_packed(_vq, total_sg, gfp);
> > + else {
> > + desc = NULL;
> > + WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> > + }
> > +
> > + if (desc) {
> > + /* Use a single buffer which doesn't continue */
> > + indirect = true;
> > + /* Set up rest to use this indirect table. */
> > + i = 0;
> > + descs_used = 1;
> > + } else {
> > + indirect = false;
> > + desc = vq->vring_packed.desc;
> > + i = head;
> > + descs_used = total_sg;
> > + }
> > +
> > + if (vq->vq.num_free < descs_used) {
> > + pr_debug("Can't add buf len %i - avail = %i\n",
> > + descs_used, vq->vq.num_free);
> > + /* FIXME: for historical reasons, we force a notify here if
> > + * there are outgoing parts to the buffer. Presumably the
> > + * host should service the ring ASAP. */
> > + if (out_sgs)
> > + vq->notify(&vq->vq);
> > + if (indirect)
> > + kfree(desc);
> > + END_USE(vq);
> > + return -ENOSPC;
> > + }
> > +
> > + for (n = 0; n < out_sgs + in_sgs; n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> > + DMA_TO_DEVICE : DMA_FROM_DEVICE);
> > + if (vring_mapping_error(vq, addr))
> > + goto unmap_release;
> > +
> > + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > + (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> > + VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > + VRING_DESC_F_USED(!vq->wrap_counter));
> > + if (!indirect && i == head)
> > + head_flags = flags;
> > + else
> > + desc[i].flags = flags;
> > +
> > + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>
> Similar to V1, we only need this for the last descriptor.
Okay, will just set it for the last desc.
>
> > + prev = i;
>
> It looks to me there's no need to track prev inside the loop here.
>
> > + i++;
> > + if (!indirect && i >= vq->vring_packed.num) {
> > + i = 0;
> > + vq->wrap_counter ^= 1;
> > + }
> > + }
> > + }
> > + /* Last one doesn't continue. */
> > + if (total_sg == 1)
> > + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > + else
> > + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>
> The only case when prev != i - 1 is i == 0, we can add a if here.
It's just a mirror of the existing implementation in split ring.
It seems that split ring implementation needs this just because
it's much harder for it to find the prev, which is not true for
packed ring. So I'll take your suggestion. Thanks!
>
[...]
> > +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > +{
> > + struct vring_virtqueue *vq = to_vvq(_vq);
> > + u16 new, old, off_wrap;
> > + bool needs_kick;
> > +
> > + START_USE(vq);
> > + /* We need to expose the new flags value before checking notification
> > + * suppressions. */
> > + virtio_mb(vq->weak_barriers);
> > +
> > + old = vq->next_avail_idx - vq->num_added;
> > + new = vq->next_avail_idx;
> > + vq->num_added = 0;
> > +
> > +#ifdef DEBUG
> > + if (vq->last_add_time_valid) {
> > + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> > + vq->last_add_time)) > 100);
> > + }
> > + vq->last_add_time_valid = false;
> > +#endif
> > +
> > + off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> > +
> > + if (vq->event) {
>
> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
> instead of vq->event here. Spec does not forces to use evenf_off and
> event_wrap if event index is enabled.
>
> > + // FIXME: fix this!
> > + needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> > + vring_need_event(off_wrap & ~(1<<15), new, old);
>
> Why need a & here?
Because wrap_counter (the most significant bit in off_wrap)
isn't part of the index.
>
> > + } else {
>
> Need a smp_rmb() to make sure desc_event_flags was checked before flags.
I don't get your point, if my understanding is correct,
desc_event_flags is vq->vring_packed.device->flags. So
what's the "flags"?
>
> > + needs_kick = (vq->vring_packed.device->flags !=
> > + cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> > + }
> > + END_USE(vq);
> > + return needs_kick;
> > +}
[...]
> > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > + void **ctx)
> > +{
> > + struct vring_packed_desc *desc;
> > + unsigned int i, j;
> > +
> > + /* Clear data ptr. */
> > + vq->desc_state[head].data = NULL;
> > +
> > + i = head;
> > +
> > + for (j = 0; j < vq->desc_state[head].num; j++) {
> > + desc = &vq->vring_packed.desc[i];
> > + vring_unmap_one_packed(vq, desc);
> > + desc->flags = 0x0;
>
> Looks like this is unnecessary.
It's safer to zero it. If we don't zero it, after we
call virtqueue_detach_unused_buf_packed() which calls
this function, the desc is still available to the
device.
>
> > + i++;
> > + if (i >= vq->vring_packed.num)
> > + i = 0;
> > + }
[...]
> > +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > +{
> > + struct vring_virtqueue *vq = to_vvq(_vq);
> > + u16 last_used_idx, wrap_counter, off_wrap;
> > +
> > + START_USE(vq);
> > +
> > + last_used_idx = vq->last_used_idx;
> > + wrap_counter = vq->wrap_counter;
> > +
> > + if (last_used_idx > vq->next_avail_idx)
> > + wrap_counter ^= 1;
> > +
> > + off_wrap = last_used_idx | (wrap_counter << 15);
> > +
> > + /* We optimistically turn back on interrupts, then check if there was
> > + * more to do. */
> > + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > + * either clear the flags bit or point the event index at the next
> > + * entry. Always do both to keep code simple. */
> > + if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> > + VRING_EVENT_F_ENABLE;
> > + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > + vq->event_flags_shadow);
> > + }
>
> A smp_wmb() is missed here?
>
> > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
>
> And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE is
> sufficient here.
I didn't think much when implementing the event suppression
for packed ring previously. After I saw your comments, I found
something new. Indeed, unlike the split ring, for the packed
ring, spec doesn't say we must use VRING_EVENT_F_DESC when
EVENT_IDX is negotiated. So do you think below thought is
right or makes sense?
- For virtqueue_enable_cb_prepare(), we just need to enable
the ring by setting flags to VRING_EVENT_F_ENABLE in any
case.
- We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
negotiated) only when we want to delay the interrupts
virtqueue_enable_cb_delayed().
>
> > + END_USE(vq);
> > + return last_used_idx;
> > +}
> > +
[...]
> > @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
> > for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
> > switch (i) {
> > - case VIRTIO_RING_F_INDIRECT_DESC:
> > +#if 0
> > + case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
> > break;
> > - case VIRTIO_RING_F_EVENT_IDX:
> > + case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
> > break;
> > +#endif
>
> It would be better if you can split EVENT_IDX and INDIRECT_DESC into
> separate patches too.
Sure. Will do it in the next version.
Thanks for the review!
>
> Thanks
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Jason Wang @ 2018-04-13 7:01 UTC (permalink / raw)
To: Jonathan Helman, Michael S. Tsirkin
Cc: Mihai Carabas, virtio-dev, linux-kernel, virtualization
In-Reply-To: <1d2bcb00-b210-b992-318e-c65607d31dff@oracle.com>
On 2018年04月12日 08:24, Jonathan Helman wrote:
>
>
> On 04/10/2018 08:12 PM, Jason Wang wrote:
>>
>>
>> On 2018年04月10日 05:11, Jonathan Helman wrote:
>>>
>>>
>>> On 03/22/2018 07:38 PM, Jason Wang wrote:
>>>>
>>>>
>>>> On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
>>>>> On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
>>>>>> On 2018年03月20日 12:26, Jonathan Helman wrote:
>>>>>>>> On Mar 19, 2018, at 7:31 PM, Jason Wang<jasowang@redhat.com>
>>>>>>>> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2018年03月20日 06:14, Jonathan Helman wrote:
>>>>>>>>> Export the number of successful and failed hugetlb page
>>>>>>>>> allocations via the virtio balloon driver. These 2 counts
>>>>>>>>> come directly from the vm_events HTLB_BUDDY_PGALLOC and
>>>>>>>>> HTLB_BUDDY_PGALLOC_FAIL.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
>>>>>>>> Reviewed-by: Jason Wang<jasowang@redhat.com>
>>>>>>> Thanks.
>>>>>>>
>>>>>>>>> ---
>>>>>>>>> drivers/virtio/virtio_balloon.c | 6 ++++++
>>>>>>>>> include/uapi/linux/virtio_balloon.h | 4 +++-
>>>>>>>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/virtio/virtio_balloon.c
>>>>>>>>> b/drivers/virtio/virtio_balloon.c
>>>>>>>>> index dfe5684..6b237e3 100644
>>>>>>>>> --- a/drivers/virtio/virtio_balloon.c
>>>>>>>>> +++ b/drivers/virtio/virtio_balloon.c
>>>>>>>>> @@ -272,6 +272,12 @@ static unsigned int
>>>>>>>>> update_balloon_stats(struct virtio_balloon *vb)
>>>>>>>>> pages_to_bytes(events[PSWPOUT]));
>>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT,
>>>>>>>>> events[PGMAJFAULT]);
>>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT,
>>>>>>>>> events[PGFAULT]);
>>>>>>>>> +#ifdef CONFIG_HUGETLB_PAGE
>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
>>>>>>>>> + events[HTLB_BUDDY_PGALLOC]);
>>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
>>>>>>>>> + events[HTLB_BUDDY_PGALLOC_FAIL]);
>>>>>>>>> +#endif
>>>>>>>>> #endif
>>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
>>>>>>>>> pages_to_bytes(i.freeram));
>>>>>>>>> diff --git a/include/uapi/linux/virtio_balloon.h
>>>>>>>>> b/include/uapi/linux/virtio_balloon.h
>>>>>>>>> index 4e8b830..40297a3 100644
>>>>>>>>> --- a/include/uapi/linux/virtio_balloon.h
>>>>>>>>> +++ b/include/uapi/linux/virtio_balloon.h
>>>>>>>>> @@ -53,7 +53,9 @@ struct virtio_balloon_config {
>>>>>>>>> #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of
>>>>>>>>> memory */
>>>>>>>>> #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory
>>>>>>>>> as in /proc */
>>>>>>>>> #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
>>>>>>>>> -#define VIRTIO_BALLOON_S_NR 8
>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page
>>>>>>>>> allocations */
>>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page
>>>>>>>>> allocation failures */
>>>>>>>>> +#define VIRTIO_BALLOON_S_NR 10
>>>>>>>>> /*
>>>>>>>>> * Memory statistics structure.
>>>>>>>> Not for this patch, but it looks to me that exporting such nr
>>>>>>>> through uapi is fragile.
>>>>>>> Sorry, can you explain what you mean here?
>>>>>>>
>>>>>>> Jon
>>>>>> Spec said "Within an output buffer submitted to the statsq, the
>>>>>> device MUST
>>>>>> ignore entries with tag values that it does not recognize". So
>>>>>> exporting
>>>>>> VIRTIO_BALLOON_S_NR seems useless and device implementation can
>>>>>> not depend
>>>>>> on such number in uapi.
>>>>>>
>>>>>> Thanks
>>>>> Suggestions? I don't like to break build for people ...
>>>>>
>>>>
>>>> Didn't have a good idea. But maybe we should keep
>>>> VIRTIO_BALLOON_S_NR unchanged, and add a comment here.
>>>>
>>>> Thanks
>>>
>>> I think Jason's comment is for a future patch. Didn't see this patch
>>> get applied, so wondering if it could be.
>>>
>>> Thanks,
>>> Jon
>>
>> Hi Jon:
>>
>> Have you tested new driver with old qemu?
>
> Yes, this testing scenario looks good. Thanks.
>
> Jon
Hi Jon:
I mean e.g compiling qemu with new linux headers. E.g current qemu has:
static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
[VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
[VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
[VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
[VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
[VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
[VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
[VIRTIO_BALLOON_S_NR] = NULL
};
I'm afraid it will be broken if VIRTIO_BALLOON_S_NR is 10.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net] virtio-net: add missing virtqueue kick when flushing packets
From: Jason Wang @ 2018-04-13 6:58 UTC (permalink / raw)
To: mst, jasowang, virtualization, netdev, linux-kernel
Cc: ktaka, Daniel Borkmann
We tends to batch submitting packets during XDP_TX. This requires to
kick virtqueue after a batch, we tried to do it through
xdp_do_flush_map() which only makes sense for devmap not XDP_TX. So
explicitly kick the virtqueue in this case.
Reported-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
Tested-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/virtio_net.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2337460..d8e1aea 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1269,7 +1269,9 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct receive_queue *rq =
container_of(napi, struct receive_queue, napi);
- unsigned int received;
+ struct virtnet_info *vi = rq->vq->vdev->priv;
+ struct send_queue *sq;
+ unsigned int received, qp;
bool xdp_xmit = false;
virtnet_poll_cleantx(rq);
@@ -1280,8 +1282,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
if (received < budget)
virtqueue_napi_complete(napi, rq->vq, received);
- if (xdp_xmit)
+ if (xdp_xmit) {
+ qp = vi->curr_queue_pairs - vi->xdp_queue_pairs +
+ smp_processor_id();
+ sq = &vi->sq[qp];
+ virtqueue_kick(sq->vq);
xdp_do_flush_map();
+ }
return received;
}
--
2.7.4
^ permalink raw reply related
* Re: [RFC v2] virtio: support packed ring
From: Jason Wang @ 2018-04-13 4:30 UTC (permalink / raw)
To: Tiwei Bie, mst, wexu, virtualization, linux-kernel, netdev
In-Reply-To: <20180401141216.8969-1-tiwei.bie@intel.com>
On 2018年04月01日 22:12, Tiwei Bie wrote:
> Hello everyone,
>
> This RFC implements packed ring support for virtio driver.
>
> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> Minor changes are needed for the vhost code, e.g. to kick the guest.
>
> TODO:
> - Refinements and bug fixes;
> - Split into small patches;
> - Test indirect descriptor support;
> - Test/fix event suppression support;
> - Test devices other than net;
>
> RFC v1 -> RFC v2:
> - Add indirect descriptor support - compile test only;
> - Add event suppression supprt - compile test only;
> - Move vring_packed_init() out of uapi (Jason, MST);
> - Merge two loops into one in virtqueue_add_packed() (Jason);
> - Split vring_unmap_one() for packed ring and split ring (Jason);
> - Avoid using '%' operator (Jason);
> - Rename free_head -> next_avail_idx (Jason);
> - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> - Some other refinements and bug fixes;
>
> Thanks!
>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
> drivers/virtio/virtio_ring.c | 1094 +++++++++++++++++++++++++++++-------
> include/linux/virtio_ring.h | 8 +-
> include/uapi/linux/virtio_config.h | 12 +-
> include/uapi/linux/virtio_ring.h | 61 ++
> 4 files changed, 980 insertions(+), 195 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 71458f493cf8..0515dca34d77 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,14 +58,15 @@
>
> struct vring_desc_state {
> void *data; /* Data for callback. */
> - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> + void *indir_desc; /* Indirect descriptor, if any. */
> + int num; /* Descriptor list length. */
> };
>
> struct vring_virtqueue {
> struct virtqueue vq;
>
> - /* Actual memory layout for this queue */
> - struct vring vring;
> + /* Is this a packed ring? */
> + bool packed;
>
> /* Can we use weak barriers? */
> bool weak_barriers;
> @@ -79,19 +80,45 @@ struct vring_virtqueue {
> /* Host publishes avail event idx */
> bool event;
>
> - /* Head of free buffer list. */
> - unsigned int free_head;
> /* Number we've added since last sync. */
> unsigned int num_added;
>
> /* Last used index we've seen. */
> u16 last_used_idx;
>
> - /* Last written value to avail->flags */
> - u16 avail_flags_shadow;
> + union {
> + /* Available for split ring */
> + struct {
> + /* Actual memory layout for this queue. */
> + struct vring vring;
>
> - /* Last written value to avail->idx in guest byte order */
> - u16 avail_idx_shadow;
> + /* Head of free buffer list. */
> + unsigned int free_head;
> +
> + /* Last written value to avail->flags */
> + u16 avail_flags_shadow;
> +
> + /* Last written value to avail->idx in
> + * guest byte order. */
> + u16 avail_idx_shadow;
> + };
> +
> + /* Available for packed ring */
> + struct {
> + /* Actual memory layout for this queue. */
> + struct vring_packed vring_packed;
> +
> + /* Driver ring wrap counter. */
> + u8 wrap_counter;
> +
> + /* Index of the next avail descriptor. */
> + unsigned int next_avail_idx;
> +
> + /* Last written value to driver->flags in
> + * guest byte order. */
> + u16 event_flags_shadow;
> + };
> + };
>
> /* How to notify other side. FIXME: commonalize hcalls! */
> bool (*notify)(struct virtqueue *vq);
> @@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> cpu_addr, size, direction);
> }
>
> -static void vring_unmap_one(const struct vring_virtqueue *vq,
> - struct vring_desc *desc)
> +static void vring_unmap_one_split(const struct vring_virtqueue *vq,
> + struct vring_desc *desc)
> +{
> + u16 flags;
> +
> + if (!vring_use_dma_api(vq->vq.vdev))
> + return;
> +
> + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> + if (flags & VRING_DESC_F_INDIRECT) {
> + dma_unmap_single(vring_dma_dev(vq),
> + virtio64_to_cpu(vq->vq.vdev, desc->addr),
> + virtio32_to_cpu(vq->vq.vdev, desc->len),
> + (flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + } else {
> + dma_unmap_page(vring_dma_dev(vq),
> + virtio64_to_cpu(vq->vq.vdev, desc->addr),
> + virtio32_to_cpu(vq->vq.vdev, desc->len),
> + (flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + }
> +}
> +
> +static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
> + struct vring_packed_desc *desc)
> {
> u16 flags;
>
> @@ -235,8 +287,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
> return dma_mapping_error(vring_dma_dev(vq), addr);
> }
>
> -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> - unsigned int total_sg, gfp_t gfp)
> +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> + unsigned int total_sg,
> + gfp_t gfp)
> {
> struct vring_desc *desc;
> unsigned int i;
> @@ -257,14 +310,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> return desc;
> }
>
> -static inline int virtqueue_add(struct virtqueue *_vq,
> - struct scatterlist *sgs[],
> - unsigned int total_sg,
> - unsigned int out_sgs,
> - unsigned int in_sgs,
> - void *data,
> - void *ctx,
> - gfp_t gfp)
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> + unsigned int total_sg,
> + gfp_t gfp)
> +{
> + struct vring_packed_desc *desc;
> +
> + /*
> + * We require lowmem mappings for the descriptors because
> + * otherwise virt_to_phys will give us bogus addresses in the
> + * virtqueue.
> + */
> + gfp &= ~__GFP_HIGHMEM;
> +
> + desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
Can we simply check vq->packed here to avoid duplicating helpers?
> +
> + return desc;
> +}
> +
> +static inline int virtqueue_add_split(struct virtqueue *_vq,
> + struct scatterlist *sgs[],
> + unsigned int total_sg,
> + unsigned int out_sgs,
> + unsigned int in_sgs,
> + void *data,
> + void *ctx,
> + gfp_t gfp)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> struct scatterlist *sg;
> @@ -303,7 +374,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> /* If the host supports indirect descriptor tables, and we have multiple
> * buffers, then go indirect. FIXME: tune this threshold */
> if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> - desc = alloc_indirect(_vq, total_sg, gfp);
> + desc = alloc_indirect_split(_vq, total_sg, gfp);
> else {
> desc = NULL;
> WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> @@ -424,7 +495,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> for (n = 0; n < total_sg; n++) {
> if (i == err_idx)
> break;
> - vring_unmap_one(vq, &desc[i]);
> + vring_unmap_one_split(vq, &desc[i]);
> i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
> }
>
> @@ -435,6 +506,210 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> return -EIO;
> }
>
> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> + struct scatterlist *sgs[],
> + unsigned int total_sg,
> + unsigned int out_sgs,
> + unsigned int in_sgs,
> + void *data,
> + void *ctx,
> + gfp_t gfp)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + struct vring_packed_desc *desc;
> + struct scatterlist *sg;
> + unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> + __virtio16 uninitialized_var(head_flags), flags;
> + int head, wrap_counter;
> + bool indirect;
> +
> + START_USE(vq);
> +
> + BUG_ON(data == NULL);
> + BUG_ON(ctx && vq->indirect);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return -EIO;
> + }
> +
> +#ifdef DEBUG
> + {
> + ktime_t now = ktime_get();
> +
> + /* No kick or get, with .1 second between? Warn. */
> + if (vq->last_add_time_valid)
> + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> + > 100);
> + vq->last_add_time = now;
> + vq->last_add_time_valid = true;
> + }
> +#endif
> +
> + BUG_ON(total_sg == 0);
> +
> + head = vq->next_avail_idx;
> + wrap_counter = vq->wrap_counter;
> +
> + /* If the host supports indirect descriptor tables, and we have multiple
> + * buffers, then go indirect. FIXME: tune this threshold */
> + if (vq->indirect && total_sg > 1 && vq->vq.num_free)
Let's introduce a helper like virtqueue_need_indirect() to avoid
duplicating codes and FIXME.
> + desc = alloc_indirect_packed(_vq, total_sg, gfp);
> + else {
> + desc = NULL;
> + WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> + }
> +
> + if (desc) {
> + /* Use a single buffer which doesn't continue */
> + indirect = true;
> + /* Set up rest to use this indirect table. */
> + i = 0;
> + descs_used = 1;
> + } else {
> + indirect = false;
> + desc = vq->vring_packed.desc;
> + i = head;
> + descs_used = total_sg;
> + }
> +
> + if (vq->vq.num_free < descs_used) {
> + pr_debug("Can't add buf len %i - avail = %i\n",
> + descs_used, vq->vq.num_free);
> + /* FIXME: for historical reasons, we force a notify here if
> + * there are outgoing parts to the buffer. Presumably the
> + * host should service the ring ASAP. */
> + if (out_sgs)
> + vq->notify(&vq->vq);
> + if (indirect)
> + kfree(desc);
> + END_USE(vq);
> + return -ENOSPC;
> + }
> +
> + for (n = 0; n < out_sgs + in_sgs; n++) {
> + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> + dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> + DMA_TO_DEVICE : DMA_FROM_DEVICE);
> + if (vring_mapping_error(vq, addr))
> + goto unmap_release;
> +
> + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> + (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> + VRING_DESC_F_AVAIL(vq->wrap_counter) |
> + VRING_DESC_F_USED(!vq->wrap_counter));
> + if (!indirect && i == head)
> + head_flags = flags;
> + else
> + desc[i].flags = flags;
> +
> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
Similar to V1, we only need this for the last descriptor.
> + prev = i;
It looks to me there's no need to track prev inside the loop here.
> + i++;
> + if (!indirect && i >= vq->vring_packed.num) {
> + i = 0;
> + vq->wrap_counter ^= 1;
> + }
> + }
> + }
> + /* Last one doesn't continue. */
> + if (total_sg == 1)
> + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> + else
> + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
The only case when prev != i - 1 is i == 0, we can add a if here.
> +
> + if (indirect) {
> + /* Now that the indirect table is filled in, map it. */
> + dma_addr_t addr = vring_map_single(
> + vq, desc, total_sg * sizeof(struct vring_packed_desc),
> + DMA_TO_DEVICE);
> + if (vring_mapping_error(vq, addr))
> + goto unmap_release;
> +
> + head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> + VRING_DESC_F_AVAIL(wrap_counter) |
> + VRING_DESC_F_USED(!wrap_counter));
> + vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> + vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> + total_sg * sizeof(struct vring_packed_desc));
> + vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> + }
> +
> + /* We're using some buffers from the free list. */
> + vq->vq.num_free -= descs_used;
> +
> + /* Update free pointer */
> + if (indirect) {
> + n = head + 1;
> + if (n >= vq->vring_packed.num) {
> + n = 0;
> + vq->wrap_counter ^= 1;
> + }
> + vq->next_avail_idx = n;
> + } else
> + vq->next_avail_idx = i;
> +
> + /* Store token and indirect buffer state. */
> + vq->desc_state[head].num = descs_used;
> + vq->desc_state[head].data = data;
> + if (indirect)
> + vq->desc_state[head].indir_desc = desc;
> + else
> + vq->desc_state[head].indir_desc = ctx;
> +
> + /* A driver MUST NOT make the first descriptor in the list
> + * available before all subsequent descriptors comprising
> + * the list are made available. */
> + virtio_wmb(vq->weak_barriers);
> + vq->vring_packed.desc[head].flags = head_flags;
> + vq->num_added++;
> +
> + pr_debug("Added buffer head %i to %p\n", head, vq);
> + END_USE(vq);
> +
> + return 0;
> +
> +unmap_release:
> + err_idx = i;
> + i = head;
> +
> + for (n = 0; n < total_sg; n++) {
> + if (i == err_idx)
> + break;
> + vring_unmap_one_packed(vq, &desc[i]);
> + i++;
> + if (!indirect && i >= vq->vring_packed.num)
> + i = 0;
> + }
> +
> + vq->wrap_counter = wrap_counter;
> +
> + if (indirect)
> + kfree(desc);
> +
> + END_USE(vq);
> + return -EIO;
> +}
> +
> +static inline int virtqueue_add(struct virtqueue *_vq,
> + struct scatterlist *sgs[],
> + unsigned int total_sg,
> + unsigned int out_sgs,
> + unsigned int in_sgs,
> + void *data,
> + void *ctx,
> + gfp_t gfp)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> + in_sgs, data, ctx, gfp) :
> + virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> + in_sgs, data, ctx, gfp);
> +}
> +
> /**
> * virtqueue_add_sgs - expose buffers to other end
> * @vq: the struct virtqueue we're talking about.
> @@ -537,18 +812,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
> }
> EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
>
> -/**
> - * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> - * @vq: the struct virtqueue
> - *
> - * Instead of virtqueue_kick(), you can do:
> - * if (virtqueue_kick_prepare(vq))
> - * virtqueue_notify(vq);
> - *
> - * This is sometimes useful because the virtqueue_kick_prepare() needs
> - * to be serialized, but the actual virtqueue_notify() call does not.
> - */
> -bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> u16 new, old;
> @@ -580,6 +844,62 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
> END_USE(vq);
> return needs_kick;
> }
> +
> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 new, old, off_wrap;
> + bool needs_kick;
> +
> + START_USE(vq);
> + /* We need to expose the new flags value before checking notification
> + * suppressions. */
> + virtio_mb(vq->weak_barriers);
> +
> + old = vq->next_avail_idx - vq->num_added;
> + new = vq->next_avail_idx;
> + vq->num_added = 0;
> +
> +#ifdef DEBUG
> + if (vq->last_add_time_valid) {
> + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> + vq->last_add_time)) > 100);
> + }
> + vq->last_add_time_valid = false;
> +#endif
> +
> + off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> +
> + if (vq->event) {
It looks to me we should examine RING_EVENT_FLAGS_DESC in
desc_event_flags instead of vq->event here. Spec does not forces to use
evenf_off and event_wrap if event index is enabled.
> + // FIXME: fix this!
> + needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> + vring_need_event(off_wrap & ~(1<<15), new, old);
Why need a & here?
> + } else {
Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> + needs_kick = (vq->vring_packed.device->flags !=
> + cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> + }
> + END_USE(vq);
> + return needs_kick;
> +}
> +
> +/**
> + * virtqueue_kick_prepare - first half of split virtqueue_kick call.
> + * @vq: the struct virtqueue
> + *
> + * Instead of virtqueue_kick(), you can do:
> + * if (virtqueue_kick_prepare(vq))
> + * virtqueue_notify(vq);
> + *
> + * This is sometimes useful because the virtqueue_kick_prepare() needs
> + * to be serialized, but the actual virtqueue_notify() call does not.
> + */
> +bool virtqueue_kick_prepare(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
> + virtqueue_kick_prepare_split(_vq);
> +}
> EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
>
> /**
> @@ -626,8 +946,8 @@ bool virtqueue_kick(struct virtqueue *vq)
> }
> EXPORT_SYMBOL_GPL(virtqueue_kick);
>
> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> - void **ctx)
> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> + void **ctx)
> {
> unsigned int i, j;
> __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> @@ -639,12 +959,12 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> i = head;
>
> while (vq->vring.desc[i].flags & nextflag) {
> - vring_unmap_one(vq, &vq->vring.desc[i]);
> + vring_unmap_one_split(vq, &vq->vring.desc[i]);
> i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
> vq->vq.num_free++;
> }
>
> - vring_unmap_one(vq, &vq->vring.desc[i]);
> + vring_unmap_one_split(vq, &vq->vring.desc[i]);
> vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
> vq->free_head = head;
>
> @@ -666,7 +986,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>
> for (j = 0; j < len / sizeof(struct vring_desc); j++)
> - vring_unmap_one(vq, &indir_desc[j]);
> + vring_unmap_one_split(vq, &indir_desc[j]);
>
> kfree(indir_desc);
> vq->desc_state[head].indir_desc = NULL;
> @@ -675,11 +995,207 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> }
> }
>
> -static inline bool more_used(const struct vring_virtqueue *vq)
> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> + void **ctx)
> +{
> + struct vring_packed_desc *desc;
> + unsigned int i, j;
> +
> + /* Clear data ptr. */
> + vq->desc_state[head].data = NULL;
> +
> + i = head;
> +
> + for (j = 0; j < vq->desc_state[head].num; j++) {
> + desc = &vq->vring_packed.desc[i];
> + vring_unmap_one_packed(vq, desc);
> + desc->flags = 0x0;
Looks like this is unnecessary.
> + i++;
> + if (i >= vq->vring_packed.num)
> + i = 0;
> + }
> +
> + vq->vq.num_free += vq->desc_state[head].num;
> +
> + if (vq->indirect) {
> + u32 len;
> +
> + desc = vq->desc_state[head].indir_desc;
> + /* Free the indirect table, if any, now that it's unmapped. */
> + if (!desc)
> + goto out;
> +
> + len = virtio32_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[head].len);
> +
> + BUG_ON(!(vq->vring_packed.desc[head].flags &
> + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> + BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> +
> + for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> + vring_unmap_one_packed(vq, &desc[j]);
> +
> + kfree(desc);
> + vq->desc_state[head].indir_desc = NULL;
> + } else if (ctx) {
> + *ctx = vq->desc_state[head].indir_desc;
> + }
> +
> +out:
> + return vq->desc_state[head].num;
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
> {
> return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> }
>
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> + u16 last_used, flags;
> + bool avail, used;
> +
> + if (vq->vq.num_free == vq->vring_packed.num)
> + return false;
> +
> + last_used = vq->last_used_idx;
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[last_used].flags);
> + avail = flags & VRING_DESC_F_AVAIL(1);
> + used = flags & VRING_DESC_F_USED(1);
> +
> + return avail == used;
> +}
> +
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> + return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
> +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> + void **ctx)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + void *ret;
> + unsigned int i;
> + u16 last_used;
> +
> + START_USE(vq);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return NULL;
> + }
> +
> + if (!more_used(vq)) {
> + pr_debug("No more buffers in queue\n");
> + END_USE(vq);
> + return NULL;
> + }
> +
> + /* Only get used array entries after they have been exposed by host. */
> + virtio_rmb(vq->weak_barriers);
> +
> + last_used = (vq->last_used_idx & (vq->vring.num - 1));
> + i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> + *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> +
> + if (unlikely(i >= vq->vring.num)) {
> + BAD_RING(vq, "id %u out of range\n", i);
> + return NULL;
> + }
> + if (unlikely(!vq->desc_state[i].data)) {
> + BAD_RING(vq, "id %u is not a head!\n", i);
> + return NULL;
> + }
> +
> + /* detach_buf_split clears data, so grab it now. */
> + ret = vq->desc_state[i].data;
> + detach_buf_split(vq, i, ctx);
> + vq->last_used_idx++;
> + /* If we expect an interrupt for the next entry, tell host
> + * by writing event index and flush out the write before
> + * the read in the next get_buf call. */
> + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> + virtio_store_mb(vq->weak_barriers,
> + &vring_used_event(&vq->vring),
> + cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> +
> +#ifdef DEBUG
> + vq->last_add_time_valid = false;
> +#endif
> +
> + END_USE(vq);
> + return ret;
> +}
> +
> +void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
> + void **ctx)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + uint16_t wrap_counter;
> + void *ret;
> + unsigned int i;
> + u16 last_used;
> +
> + START_USE(vq);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return NULL;
> + }
> +
> + if (!more_used(vq)) {
> + pr_debug("No more buffers in queue\n");
> + END_USE(vq);
> + return NULL;
> + }
> +
> + /* Only get used elements after they have been exposed by host. */
> + virtio_rmb(vq->weak_barriers);
> +
> + last_used = vq->last_used_idx;
> + i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> + *len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> + if (unlikely(i >= vq->vring_packed.num)) {
> + BAD_RING(vq, "id %u out of range\n", i);
> + return NULL;
> + }
> + if (unlikely(!vq->desc_state[i].data)) {
> + BAD_RING(vq, "id %u is not a head!\n", i);
> + return NULL;
> + }
> +
> + /* detach_buf_packed clears data, so grab it now. */
> + ret = vq->desc_state[i].data;
> + detach_buf_packed(vq, i, ctx);
> +
> + vq->last_used_idx += vq->desc_state[i].num;
> + if (vq->last_used_idx >= vq->vring_packed.num)
> + vq->last_used_idx -= vq->vring_packed.num;
> +
> + wrap_counter = vq->wrap_counter;
> + if (vq->last_used_idx > vq->next_avail_idx)
> + wrap_counter ^= 1;
> +
> + /* If we expect an interrupt for the next entry, tell host
> + * by writing event index and flush out the write before
> + * the read in the next get_buf call. */
> + if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> + virtio_store_mb(vq->weak_barriers,
> + &vq->vring_packed.driver->off_wrap,
> + cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> + wrap_counter << 15));
> +
> +#ifdef DEBUG
> + vq->last_add_time_valid = false;
> +#endif
> +
> + END_USE(vq);
> + return ret;
> +}
> +
> /**
> * virtqueue_get_buf - get the next used buffer
> * @vq: the struct virtqueue we're talking about.
> @@ -700,57 +1216,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> void **ctx)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - void *ret;
> - unsigned int i;
> - u16 last_used;
>
> - START_USE(vq);
> -
> - if (unlikely(vq->broken)) {
> - END_USE(vq);
> - return NULL;
> - }
> -
> - if (!more_used(vq)) {
> - pr_debug("No more buffers in queue\n");
> - END_USE(vq);
> - return NULL;
> - }
> -
> - /* Only get used array entries after they have been exposed by host. */
> - virtio_rmb(vq->weak_barriers);
> -
> - last_used = (vq->last_used_idx & (vq->vring.num - 1));
> - i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> - *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> -
> - if (unlikely(i >= vq->vring.num)) {
> - BAD_RING(vq, "id %u out of range\n", i);
> - return NULL;
> - }
> - if (unlikely(!vq->desc_state[i].data)) {
> - BAD_RING(vq, "id %u is not a head!\n", i);
> - return NULL;
> - }
> -
> - /* detach_buf clears data, so grab it now. */
> - ret = vq->desc_state[i].data;
> - detach_buf(vq, i, ctx);
> - vq->last_used_idx++;
> - /* If we expect an interrupt for the next entry, tell host
> - * by writing event index and flush out the write before
> - * the read in the next get_buf call. */
> - if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> - virtio_store_mb(vq->weak_barriers,
> - &vring_used_event(&vq->vring),
> - cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> -
> -#ifdef DEBUG
> - vq->last_add_time_valid = false;
> -#endif
> -
> - END_USE(vq);
> - return ret;
> + return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> + virtqueue_get_buf_ctx_split(_vq, len, ctx);
> }
> EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
>
> @@ -759,6 +1227,29 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> return virtqueue_get_buf_ctx(_vq, len, NULL);
> }
> EXPORT_SYMBOL_GPL(virtqueue_get_buf);
> +
> +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> + if (!vq->event)
> + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> + }
> +}
> +
> +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> + vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
> +}
> +
> /**
> * virtqueue_disable_cb - disable callbacks
> * @vq: the struct virtqueue we're talking about.
> @@ -772,15 +1263,66 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
>
> - if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> - vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> - if (!vq->event)
> - vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> - }
> -
> + if (vq->packed)
> + virtqueue_disable_cb_packed(_vq);
> + else
> + virtqueue_disable_cb_split(_vq);
> }
> EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>
> +static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 last_used_idx;
> +
> + START_USE(vq);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always do both to keep code simple. */
> + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> + if (!vq->event)
> + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> + }
> + vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> + END_USE(vq);
> + return last_used_idx;
> +}
> +
> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 last_used_idx, wrap_counter, off_wrap;
> +
> + START_USE(vq);
> +
> + last_used_idx = vq->last_used_idx;
> + wrap_counter = vq->wrap_counter;
> +
> + if (last_used_idx > vq->next_avail_idx)
> + wrap_counter ^= 1;
> +
> + off_wrap = last_used_idx | (wrap_counter << 15);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always do both to keep code simple. */
> + if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> + VRING_EVENT_F_ENABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
A smp_wmb() is missed here?
> + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE
is sufficient here.
> + END_USE(vq);
> + return last_used_idx;
> +}
> +
> /**
> * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
> * @vq: the struct virtqueue we're talking about.
> @@ -796,26 +1338,34 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
> unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - u16 last_used_idx;
>
> - START_USE(vq);
> -
> - /* We optimistically turn back on interrupts, then check if there was
> - * more to do. */
> - /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> - * either clear the flags bit or point the event index at the next
> - * entry. Always do both to keep code simple. */
> - if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> - vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> - if (!vq->event)
> - vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> - }
> - vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> - END_USE(vq);
> - return last_used_idx;
> + return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
> + virtqueue_enable_cb_prepare_split(_vq);
> }
> EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
>
> +static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + virtio_mb(vq->weak_barriers);
> + return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +}
> +
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + bool avail, used;
> + u16 flags;
> +
> + virtio_mb(vq->weak_barriers);
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[last_used_idx].flags);
> + avail = flags & VRING_DESC_F_AVAIL(1);
> + used = flags & VRING_DESC_F_USED(1);
> + return avail == used;
> +}
> +
> /**
> * virtqueue_poll - query pending used buffers
> * @vq: the struct virtqueue we're talking about.
> @@ -829,8 +1379,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
>
> - virtio_mb(vq->weak_barriers);
> - return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> + return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
> + virtqueue_poll_split(_vq, last_used_idx);
> }
> EXPORT_SYMBOL_GPL(virtqueue_poll);
>
> @@ -852,6 +1402,83 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
> }
> EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>
> +static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 bufs;
> +
> + START_USE(vq);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always update the event index to keep code simple. */
> + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> + if (!vq->event)
> + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> + }
> + /* TODO: tune this threshold */
> + bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> +
> + virtio_store_mb(vq->weak_barriers,
> + &vring_used_event(&vq->vring),
> + cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> +
> + if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> + END_USE(vq);
> + return false;
> + }
> +
> + END_USE(vq);
> + return true;
> +}
> +
> +static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 bufs, off_wrap, used_idx, wrap_counter;
> +
> + START_USE(vq);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always update the event index to keep code simple. */
> + if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> + VRING_EVENT_F_ENABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
> +
> + /* TODO: tune this threshold */
> + bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> +
> + used_idx = vq->last_used_idx + bufs;
> + if (used_idx >= vq->vring_packed.num)
> + used_idx -= vq->vring_packed.num;
> +
> + wrap_counter = vq->wrap_counter;
> + if (used_idx > vq->next_avail_idx)
> + wrap_counter ^= 1;
> +
> + off_wrap = used_idx | (wrap_counter << 15);
> +
> + virtio_store_mb(vq->weak_barriers, &vq->vring_packed.driver->off_wrap,
> + cpu_to_virtio16(_vq->vdev, off_wrap));
> +
> + if (more_used_packed(vq)) {
> + END_USE(vq);
> + return false;
> + }
> +
> + END_USE(vq);
> + return true;
> +}
> +
> /**
> * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
> * @vq: the struct virtqueue we're talking about.
> @@ -868,37 +1495,69 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
> bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - u16 bufs;
>
> - START_USE(vq);
> -
> - /* We optimistically turn back on interrupts, then check if there was
> - * more to do. */
> - /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> - * either clear the flags bit or point the event index at the next
> - * entry. Always update the event index to keep code simple. */
> - if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> - vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> - if (!vq->event)
> - vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> - }
> - /* TODO: tune this threshold */
> - bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> -
> - virtio_store_mb(vq->weak_barriers,
> - &vring_used_event(&vq->vring),
> - cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> -
> - if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> - END_USE(vq);
> - return false;
> - }
> -
> - END_USE(vq);
> - return true;
> + return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
> + virtqueue_enable_cb_delayed_split(_vq);
> }
> EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>
> +static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + unsigned int i;
> + void *buf;
> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring.num; i++) {
> + if (!vq->desc_state[i].data)
> + continue;
> + /* detach_buf clears data, so grab it now. */
> + buf = vq->desc_state[i].data;
> + detach_buf_split(vq, i, NULL);
> + vq->avail_idx_shadow--;
> + vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> + END_USE(vq);
> + return buf;
> + }
> + /* That should have freed everything. */
> + BUG_ON(vq->vq.num_free != vq->vring.num);
> +
> + END_USE(vq);
> + return NULL;
> +}
> +
> +static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + unsigned int i, num;
> + void *buf;
> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring_packed.num; i++) {
> + if (!vq->desc_state[i].data)
> + continue;
> + /* detach_buf clears data, so grab it now. */
> + buf = vq->desc_state[i].data;
> + num = detach_buf_packed(vq, i, NULL);
> + if (vq->next_avail_idx < num) {
> + vq->next_avail_idx = vq->vring_packed.num -
> + (num - vq->next_avail_idx);
> + vq->wrap_counter ^= 1;
> + } else {
> + vq->next_avail_idx -= num;
> + }
> + END_USE(vq);
> + return buf;
> + }
> + /* That should have freed everything. */
> + BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> + END_USE(vq);
> + return NULL;
> +}
> +
> /**
> * virtqueue_detach_unused_buf - detach first unused buffer
> * @vq: the struct virtqueue we're talking about.
> @@ -910,27 +1569,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
> void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - unsigned int i;
> - void *buf;
>
> - START_USE(vq);
> -
> - for (i = 0; i < vq->vring.num; i++) {
> - if (!vq->desc_state[i].data)
> - continue;
> - /* detach_buf clears data, so grab it now. */
> - buf = vq->desc_state[i].data;
> - detach_buf(vq, i, NULL);
> - vq->avail_idx_shadow--;
> - vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> - END_USE(vq);
> - return buf;
> - }
> - /* That should have freed everything. */
> - BUG_ON(vq->vq.num_free != vq->vring.num);
> -
> - END_USE(vq);
> - return NULL;
> + return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
> + virtqueue_detach_unused_buf_split(_vq);
> }
> EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
>
> @@ -955,7 +1596,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> EXPORT_SYMBOL_GPL(vring_interrupt);
>
> struct virtqueue *__vring_new_virtqueue(unsigned int index,
> - struct vring vring,
> + union vring_union vring,
> + bool packed,
> struct virtio_device *vdev,
> bool weak_barriers,
> bool context,
> @@ -963,19 +1605,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> void (*callback)(struct virtqueue *),
> const char *name)
> {
> - unsigned int i;
> + unsigned int num, i;
> struct vring_virtqueue *vq;
>
> - vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> + num = packed ? vring.vring_packed.num : vring.vring_split.num;
> +
> + vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> GFP_KERNEL);
> if (!vq)
> return NULL;
>
> - vq->vring = vring;
> vq->vq.callback = callback;
> vq->vq.vdev = vdev;
> vq->vq.name = name;
> - vq->vq.num_free = vring.num;
> + vq->vq.num_free = num;
> vq->vq.index = index;
> vq->we_own_ring = false;
> vq->queue_dma_addr = 0;
> @@ -984,9 +1627,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> vq->weak_barriers = weak_barriers;
> vq->broken = false;
> vq->last_used_idx = 0;
> - vq->avail_flags_shadow = 0;
> - vq->avail_idx_shadow = 0;
> vq->num_added = 0;
> + vq->packed = packed;
> list_add_tail(&vq->vq.list, &vdev->vqs);
> #ifdef DEBUG
> vq->in_use = false;
> @@ -997,18 +1639,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> !context;
> vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>
> + if (vq->packed) {
> + vq->vring_packed = vring.vring_packed;
> + vq->next_avail_idx = 0;
> + vq->wrap_counter = 1;
> + vq->event_flags_shadow = 0;
> + } else {
> + vq->vring = vring.vring_split;
> + vq->avail_flags_shadow = 0;
> + vq->avail_idx_shadow = 0;
> +
> + /* Put everything in free lists. */
> + vq->free_head = 0;
> + for (i = 0; i < num-1; i++)
> + vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> + }
> +
> /* No callback? Tell other side not to bother us. */
> if (!callback) {
> - vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> - if (!vq->event)
> - vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> + if (packed) {
> + vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
> + vq->event_flags_shadow);
> + } else {
> + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> + if (!vq->event)
> + vq->vring.avail->flags = cpu_to_virtio16(vdev,
> + vq->avail_flags_shadow);
> + }
> }
>
> - /* Put everything in free lists. */
> - vq->free_head = 0;
> - for (i = 0; i < vring.num-1; i++)
> - vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> - memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>
> return &vq->vq;
> }
> @@ -1056,6 +1717,22 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
> }
> }
>
> +static inline int
> +__vring_size(unsigned int num, unsigned long align, bool packed)
> +{
> + return packed ? vring_packed_size(num, align) : vring_size(num, align);
> +}
> +
> +static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
> + void *p, unsigned long align)
> +{
> + vr->num = num;
> + vr->desc = p;
> + vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
> + * num + align - 1) & ~(align - 1));
> + vr->device = vr->driver + 1;
> +}
> +
> struct virtqueue *vring_create_virtqueue(
> unsigned int index,
> unsigned int num,
> @@ -1072,7 +1749,8 @@ struct virtqueue *vring_create_virtqueue(
> void *queue = NULL;
> dma_addr_t dma_addr;
> size_t queue_size_in_bytes;
> - struct vring vring;
> + union vring_union vring;
> + bool packed;
>
> /* We assume num is a power of 2. */
> if (num & (num - 1)) {
> @@ -1080,9 +1758,13 @@ struct virtqueue *vring_create_virtqueue(
> return NULL;
> }
>
> + packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +
> /* TODO: allocate each queue chunk individually */
> - for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> + for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> + num /= 2) {
> + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> + packed),
> &dma_addr,
> GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
> if (queue)
> @@ -1094,17 +1776,21 @@ struct virtqueue *vring_create_virtqueue(
>
> if (!queue) {
> /* Try to get a single page. You are my only hope! */
> - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> + packed),
> &dma_addr, GFP_KERNEL|__GFP_ZERO);
> }
> if (!queue)
> return NULL;
>
> - queue_size_in_bytes = vring_size(num, vring_align);
> - vring_init(&vring, num, queue, vring_align);
> + queue_size_in_bytes = __vring_size(num, vring_align, packed);
> + if (packed)
> + vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> + else
> + vring_init(&vring.vring_split, num, queue, vring_align);
>
> - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> - notify, callback, name);
> + vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> + context, notify, callback, name);
> if (!vq) {
> vring_free_queue(vdev, queue_size_in_bytes, queue,
> dma_addr);
> @@ -1130,10 +1816,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> void (*callback)(struct virtqueue *vq),
> const char *name)
> {
> - struct vring vring;
> - vring_init(&vring, num, pages, vring_align);
> - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> - notify, callback, name);
> + union vring_union vring;
> + bool packed;
> +
> + packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> + if (packed)
> + vring_packed_init(&vring.vring_packed, num, pages, vring_align);
> + else
> + vring_init(&vring.vring_split, num, pages, vring_align);
> +
> + return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> + context, notify, callback, name);
> }
> EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>
> @@ -1143,7 +1836,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>
> if (vq->we_own_ring) {
> vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> - vq->vring.desc, vq->queue_dma_addr);
> + vq->packed ? (void *)vq->vring_packed.desc :
> + (void *)vq->vring.desc,
> + vq->queue_dma_addr);
> }
> list_del(&_vq->list);
> kfree(vq);
> @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
>
> for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
> switch (i) {
> - case VIRTIO_RING_F_INDIRECT_DESC:
> +#if 0
> + case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
> break;
> - case VIRTIO_RING_F_EVENT_IDX:
> + case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
> break;
> +#endif
It would be better if you can split EVENT_IDX and INDIRECT_DESC into
separate patches too.
Thanks
> case VIRTIO_F_VERSION_1:
> break;
> case VIRTIO_F_IOMMU_PLATFORM:
> break;
> + case VIRTIO_F_RING_PACKED:
> + break;
> default:
> /* We don't understand this bit. */
> __virtio_clear_bit(vdev, i);
> @@ -1185,7 +1884,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>
> struct vring_virtqueue *vq = to_vvq(_vq);
>
> - return vq->vring.num;
> + return vq->packed ? vq->vring_packed.num : vq->vring.num;
> }
> EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>
> @@ -1228,6 +1927,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>
> BUG_ON(!vq->we_own_ring);
>
> + if (vq->packed)
> + return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> + (char *)vq->vring_packed.desc);
> +
> return vq->queue_dma_addr +
> ((char *)vq->vring.avail - (char *)vq->vring.desc);
> }
> @@ -1239,11 +1942,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>
> BUG_ON(!vq->we_own_ring);
>
> + if (vq->packed)
> + return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> + (char *)vq->vring_packed.desc);
> +
> return vq->queue_dma_addr +
> ((char *)vq->vring.used - (char *)vq->vring.desc);
> }
> EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>
> +/* Only available for split ring */
> const struct vring *virtqueue_get_vring(struct virtqueue *vq)
> {
> return &to_vvq(vq)->vring;
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index bbf32524ab27..a0075894ad16 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
> struct virtio_device;
> struct virtqueue;
>
> +union vring_union {
> + struct vring vring_split;
> + struct vring_packed vring_packed;
> +};
> +
> /*
> * Creates a virtqueue and allocates the descriptor ring. If
> * may_reduce_num is set, then this may allocate a smaller ring than
> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>
> /* Creates a virtqueue with a custom layout. */
> struct virtqueue *__vring_new_virtqueue(unsigned int index,
> - struct vring vring,
> + union vring_union vring,
> + bool packed,
> struct virtio_device *vdev,
> bool weak_barriers,
> bool ctx,
> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> index 308e2096291f..a6e392325e3a 100644
> --- a/include/uapi/linux/virtio_config.h
> +++ b/include/uapi/linux/virtio_config.h
> @@ -49,7 +49,7 @@
> * transport being used (eg. virtio_ring), the rest are per-device feature
> * bits. */
> #define VIRTIO_TRANSPORT_F_START 28
> -#define VIRTIO_TRANSPORT_F_END 34
> +#define VIRTIO_TRANSPORT_F_END 36
>
> #ifndef VIRTIO_CONFIG_NO_LEGACY
> /* Do we get callbacks when the ring is completely used, even if we've
> @@ -71,4 +71,14 @@
> * this is for compatibility with legacy systems.
> */
> #define VIRTIO_F_IOMMU_PLATFORM 33
> +
> +/* This feature indicates support for the packed virtqueue layout. */
> +#define VIRTIO_F_RING_PACKED 34
> +
> +/*
> + * This feature indicates that all buffers are used by the device
> + * in the same order in which they have been made available.
> + */
> +#define VIRTIO_F_IN_ORDER 35
> +
> #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> index 6d5d5faa989b..735d4207c988 100644
> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h
> @@ -44,6 +44,9 @@
> /* This means the buffer contains a list of buffer descriptors. */
> #define VRING_DESC_F_INDIRECT 4
>
> +#define VRING_DESC_F_AVAIL(b) ((b) << 7)
> +#define VRING_DESC_F_USED(b) ((b) << 15)
> +
> /* The Host uses this in used->flags to advise the Guest: don't kick me when
> * you add a buffer. It's unreliable, so it's simply an optimization. Guest
> * will still kick if it's out of buffers. */
> @@ -53,6 +56,10 @@
> * optimization. */
> #define VRING_AVAIL_F_NO_INTERRUPT 1
>
> +#define VRING_EVENT_F_ENABLE 0x0
> +#define VRING_EVENT_F_DISABLE 0x1
> +#define VRING_EVENT_F_DESC 0x2
> +
> /* We support indirect buffer descriptors */
> #define VIRTIO_RING_F_INDIRECT_DESC 28
>
> @@ -171,4 +178,58 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
> return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
> }
>
> +struct vring_packed_desc_event {
> + /* __virtio16 off : 15; // Descriptor Event Offset
> + * __virtio16 wrap : 1; // Descriptor Event Wrap Counter */
> + __virtio16 off_wrap;
> + /* __virtio16 flags : 2; // Descriptor Event Flags */
> + __virtio16 flags;
> +};
> +
> +struct vring_packed_desc {
> + /* Buffer Address. */
> + __virtio64 addr;
> + /* Buffer Length. */
> + __virtio32 len;
> + /* Buffer ID. */
> + __virtio16 id;
> + /* The flags depending on descriptor type. */
> + __virtio16 flags;
> +};
> +
> +struct vring_packed {
> + unsigned int num;
> +
> + struct vring_packed_desc *desc;
> +
> + struct vring_packed_desc_event *driver;
> +
> + struct vring_packed_desc_event *device;
> +};
> +
> +/* The standard layout for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed
> + * {
> + * // The actual descriptors (16 bytes each)
> + * struct vring_packed_desc desc[num];
> + *
> + * // Padding to the next align boundary.
> + * char pad[];
> + *
> + * // Driver Event Suppression
> + * struct vring_packed_desc_event driver;
> + *
> + * // Device Event Suppression
> + * struct vring_packed_desc_event device;
> + * };
> + */
> +
> +static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
> +{
> + return ((sizeof(struct vring_packed_desc) * num + align - 1)
> + & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +
> #endif /* _UAPI_LINUX_VIRTIO_RING_H */
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* CfP for VHPC ‘18 - Papers due May 15 (extended) for the 13th Virtualization in High-Performance Cloud Computing Workshop
From: VHPC 18 @ 2018-04-12 13:39 UTC (permalink / raw)
To: virtualization
[-- Attachment #1.1: Type: text/plain, Size: 10061 bytes --]
*Please accept our apologies if you receive multiple copies of this Call
for
Papers====================================================================CALL
FOR PAPERS 13th Workshop on Virtualization in High-Performance Cloud
Computing (VHPC '18)held in conjunction with the International
Supercomputing Conference - High Performance,June 24-28, 2018, Frankfurt,
Germany.(Springer LNCS Proceedings)
====================================================================Date:
June 28, 2018Workshop URL: http://vhpc.org <http://vhpc.org>Paper
Submission Deadline: May 15, 2018 (extended)Springer LNCS, rolling abstract
submissionAbstract/Paper Submission Link:
https://edas.info/newPaper.php?c=24355
<https://edas.info/newPaper.php?c=24355>Special Track: GPU - Accelerator
Virtualization Call for PapersVirtualization technologies constitute a key
enabling factor for flexible resource managementin modern data centers, and
particularly in cloud environments. Cloud providers need tomanage complex
infrastructures in a seamless fashion to support the highly dynamic
andheterogeneous workloads and hosted applications customers deploy.
Similarly, HPCenvironments have been increasingly adopting techniques that
enable flexible managementof vast computing and networking resources, close
to marginal provisioning cost, which isunprecedented in the history of
scientific and commercial computing.Various virtualization technologies
contribute to the overall picture in different ways: machinevirtualization,
with its capability to enable consolidation of multiple underutilized
servers withheterogeneous software and operating systems (OSes), and its
capability to live-migrate afully operating virtual machine (VM) with a
very short downtime, enables novel and dynamicways to manage physical
servers; OS-level virtualization (i.e., containerization), with
itscapability to isolate multiple user-space environments and to allow for
their coexistencewithin the same OS kernel, promises to provide many of
the advantages of machine virtualization with high levels of responsiveness
and performance; I/O Virtualization allows physical network interfaces to
take traffic from multiple VMs or containers; network virtualization, with
its capability to create logical network overlays that are independent of
theunderlying physical topology is furthermore enabling virtualization of
HPC infrastructures. PublicationAccepted papers will be published in a
Springer LNCS proceedings volume.Topics of InterestThe VHPC program
committee solicits original, high-quality submissions related
tovirtualization across the entire software stack with a special focus on
the intersection of HPCand the cloud.Major Topics- Virtualization in
supercomputing environments, HPC clusters, HPC in the cloud and grids-
OS-level virtualization and containers (LXC, Docker, rkt, Singularity,
Shifter, i.a.)- Lightweight/specialized operating systems in conjunction
with virtual machines- Novel unikernels and use cases for virtualized HPC
environments- Performance improvements for or driven by unikernels- Tool
support for unikernels: configuration/build environments, debuggers,
profilers- Hypervisor extensions to mitigate side-channel attacks
([micro-]architectural timing attacks, privilege escalation)- VM &
Container trust and security- Containers inside VMs with hypervisor
isolation- GPU virtualization operationalization- Approaches to GPGPU
virtualization including API remoting and hypervisor abstraction-
Optimizations of virtual machine monitor platforms and hypervisors-
Hypervisor support for heterogeneous resources (GPUs, co-processors, FPGAs,
etc.)- Virtualization support for emerging memory technologies-
Virtualization in enterprise HPC and microvisors- Software defined networks
and network virtualization- Management, deployment of virtualized
environments and orchestration (Kubernetes i.a.)- Workflow-pipeline
container-based composability - Checkpointing facilitation utilizing
containers and VMs - Emerging topics including multi-kernel approaches and
NUMA in hypervisors- Operating MPI in containers/VMs and Unikernels -
Virtualization in data intensive computing (big data) - HPC convergence-
Adaptation of HPC technologies in the cloud (high performance networks,
RDMA, etc.)- Performance measurement, modelling and monitoring of
virtualized/cloud workloads- Latency-and jitter sensitive workloads in
virtualized/containerized environments- I/O virtualization (including
applications, SR-IOV, i.a.) - Hybrid local facility + cloud compute and
based storage systems, cloudbursting- FPGA and many-core accelerator
virtualization- Job scheduling/control/policy and container placement in
virtualized environments- Cloud reliability, fault-tolerance and
high-availability- QoS and SLA in virtualized environments- IaaS platforms,
cloud frameworks and APIs- Energy-efficient and power-aware virtualization-
Configuration management tools for containers (including in OpenStack,
Ansible, i.a.)- ARM-based hypervisors, ARM virtualization extensionsSpecial
Track: GPU - Accelerator VirtualizationGPU virtualization technologies,
performance and benchmarking, integration withworkflow scheduling systems,
integration to cluster managers.GPUs are taking on many HPC workload areas,
especially in deep learning withinmachine learning. In addition, a lot of
workload is being pushed to elastic environments utilizing various
virtualization technologies on different levels like hypervisors (e.g.
VMWare, Xen, KVM), kernel (Docker, Kubernetes) or on the resource
managerlevel (YARN, Mesos). In this track we invite submissions addressing
these problems. Suggested Themes and Topics:Technology - What technologies
and best practices exist for GPU - hardware accelerator virtualization and
usage of hardware accelerators in virtual environments on the hypervisor,
kernel or resource manager levelDevelopers - Real-life experience when
addressing HPC/ML/DL problems with GPUs or hardware accelerators in virtual
environmentsPerformance - Performance comparisons between different
technologies / solutionsThe Workshop on Virtualization in High-Performance
Cloud Computing (VHPC) aims tobring together researchers and industrial
practitioners facing the challenges posed by virtualization in order to
foster discussion, collaboration, mutual exchangeof knowledge and
experience, enabling research to ultimately provide novelsolutions for
virtualized computing systems of tomorrow.The workshop will be one day in
length, composed of 20 min paper presentations, eachfollowed by 10 min
discussion sections, plus lightning talks that are limited to 5
minutes.Presentations may be accompanied by interactive
demonstrations.KeynotesGregory M. Kurtzer, CEO Syslabs Inc. Singularity:
Application containers using operating system environment
virtualization.Josh Simons, HPC Chief Technologist, VMWare, Inc. Practical
Aspects of On-Premises Virtualized High Performance Computing.Important
DatesMay 15, 2018 (extended) - Paper submission deadline (Springer LNCS)May
30, 2018 - Acceptance notification June 28, 2018 - Workshop DayJuly 12,
2018 - Camera-ready version dueChairMichael Alexander (chair), Institute of
Science and Technology, AustriaAnastassios Nanos (co-chair), OnApp,
UKRomeo Kienzler (co-chair), IBM, SwitzerlandProgram committeeStergios
Anastasiadis, University of Ioannina, Greece Jakob Blomer, CERN, Europe
Eduardo César, Universidad Autonoma de Barcelona, SpainStephen Crago, USC
ISI, USATommaso Cucinotta, St. Anna School of Advanced Studies,
ItalyChristoffer Dall, Columbia University, USAFrançois Diakhaté, CEA,
FrancePatrick Dreher, MIT, USA Kyle Hale, Northwestern University, USA
Brian Kocoloski, University of Pittsburgh, USAUday Kurkure, VMware, USAJohn
Lange, University of Pittsburgh, USAGiuseppe Lettieri, University of Pisa,
ItalyQing Liu, Oak Ridge National Laboratory, USANikos Parlavantzas, IRISA,
FranceKevin Pedretti, Sandia National Laboratories, USAAmer Qouneh, Western
New England University, USA Carlos Reaño, Technical University of Valencia,
SpainBorja Sotomayor, University of Chicago, USA Anata Tiwari, San Diego
Supercomputer Center, USAKurt Tutschku, Blekinge Institute of Technology,
Sweden Yasuhiro Watashiba, Osaka University, JapanChao-Tung Yang, Tunghai
University, Taiwan Andrew Younge, Sandia National Laboratory, USANa Zhang,
VMware, USAPaper Submission-PublicationPapers submitted to the workshop
will be reviewed by at least twomembers of the program committee and
external reviewers. Submissionsshould include abstract, keywords, the
e-mail address of thecorresponding author, and must not exceed 10 pages,
including tablesand figures at a main font size no smaller than 11 point.
Submissionof a paper should be regarded as a commitment that, should the
paperbe accepted, at least one of the authors will register and attend
theconference to present the work. Accepted papers will be published in
aSpringer LNCS volume.The format must be according to the Springer LNCS
Style. Initialsubmissions are in PDF; authors of accepted papers will be
requestedto provide source files.Format
Guidelines:ftp://ftp.springer.de/pub/tex/latex/llncs/latex2e/llncs2e.zip
<ftp://ftp.springer.de/pub/tex/latex/llncs/latex2e/llncs2e.zip>Abstract,
Paper Submission Link:https://edas.info/newPaper.php?c=24355
<https://edas.info/newPaper.php?c=24355>Lightning Talks Lightning Talks are
non-paper track, synoptical in nature and are strictly limited to 5
minutes.They can be used to gain early feedback on ongoing research, for
demonstrations, to present research results, early research ideas,
perspectives and positions of interest to the community. Submit abstract
via the main submission link. General InformationThe workshop is one day in
length and will be held in conjunction with the InternationalSupercomputing
Conference - High Performance (ISC) 2018, June 24-28, Frankfurt, Germany.*
[-- Attachment #1.2: Type: text/html, Size: 63765 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
From: Jonathan Helman @ 2018-04-12 0:24 UTC (permalink / raw)
To: Jason Wang, Michael S. Tsirkin
Cc: Mihai Carabas, virtio-dev, linux-kernel, virtualization
In-Reply-To: <e773c5bb-52a5-4b7d-4ce6-0fc3b189475f@redhat.com>
On 04/10/2018 08:12 PM, Jason Wang wrote:
>
>
> On 2018年04月10日 05:11, Jonathan Helman wrote:
>>
>>
>> On 03/22/2018 07:38 PM, Jason Wang wrote:
>>>
>>>
>>> On 2018年03月22日 11:10, Michael S. Tsirkin wrote:
>>>> On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:
>>>>> On 2018年03月20日 12:26, Jonathan Helman wrote:
>>>>>>> On Mar 19, 2018, at 7:31 PM, Jason Wang<jasowang@redhat.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 2018年03月20日 06:14, Jonathan Helman wrote:
>>>>>>>> Export the number of successful and failed hugetlb page
>>>>>>>> allocations via the virtio balloon driver. These 2 counts
>>>>>>>> come directly from the vm_events HTLB_BUDDY_PGALLOC and
>>>>>>>> HTLB_BUDDY_PGALLOC_FAIL.
>>>>>>>>
>>>>>>>> Signed-off-by: Jonathan Helman<jonathan.helman@oracle.com>
>>>>>>> Reviewed-by: Jason Wang<jasowang@redhat.com>
>>>>>> Thanks.
>>>>>>
>>>>>>>> ---
>>>>>>>> drivers/virtio/virtio_balloon.c | 6 ++++++
>>>>>>>> include/uapi/linux/virtio_balloon.h | 4 +++-
>>>>>>>> 2 files changed, 9 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/virtio/virtio_balloon.c
>>>>>>>> b/drivers/virtio/virtio_balloon.c
>>>>>>>> index dfe5684..6b237e3 100644
>>>>>>>> --- a/drivers/virtio/virtio_balloon.c
>>>>>>>> +++ b/drivers/virtio/virtio_balloon.c
>>>>>>>> @@ -272,6 +272,12 @@ static unsigned int
>>>>>>>> update_balloon_stats(struct virtio_balloon *vb)
>>>>>>>> pages_to_bytes(events[PSWPOUT]));
>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT,
>>>>>>>> events[PGMAJFAULT]);
>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT,
>>>>>>>> events[PGFAULT]);
>>>>>>>> +#ifdef CONFIG_HUGETLB_PAGE
>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
>>>>>>>> + events[HTLB_BUDDY_PGALLOC]);
>>>>>>>> + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
>>>>>>>> + events[HTLB_BUDDY_PGALLOC_FAIL]);
>>>>>>>> +#endif
>>>>>>>> #endif
>>>>>>>> update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
>>>>>>>> pages_to_bytes(i.freeram));
>>>>>>>> diff --git a/include/uapi/linux/virtio_balloon.h
>>>>>>>> b/include/uapi/linux/virtio_balloon.h
>>>>>>>> index 4e8b830..40297a3 100644
>>>>>>>> --- a/include/uapi/linux/virtio_balloon.h
>>>>>>>> +++ b/include/uapi/linux/virtio_balloon.h
>>>>>>>> @@ -53,7 +53,9 @@ struct virtio_balloon_config {
>>>>>>>> #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of
>>>>>>>> memory */
>>>>>>>> #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as
>>>>>>>> in /proc */
>>>>>>>> #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
>>>>>>>> -#define VIRTIO_BALLOON_S_NR 8
>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page
>>>>>>>> allocations */
>>>>>>>> +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page
>>>>>>>> allocation failures */
>>>>>>>> +#define VIRTIO_BALLOON_S_NR 10
>>>>>>>> /*
>>>>>>>> * Memory statistics structure.
>>>>>>> Not for this patch, but it looks to me that exporting such nr
>>>>>>> through uapi is fragile.
>>>>>> Sorry, can you explain what you mean here?
>>>>>>
>>>>>> Jon
>>>>> Spec said "Within an output buffer submitted to the statsq, the
>>>>> device MUST
>>>>> ignore entries with tag values that it does not recognize". So
>>>>> exporting
>>>>> VIRTIO_BALLOON_S_NR seems useless and device implementation can not
>>>>> depend
>>>>> on such number in uapi.
>>>>>
>>>>> Thanks
>>>> Suggestions? I don't like to break build for people ...
>>>>
>>>
>>> Didn't have a good idea. But maybe we should keep VIRTIO_BALLOON_S_NR
>>> unchanged, and add a comment here.
>>>
>>> Thanks
>>
>> I think Jason's comment is for a future patch. Didn't see this patch
>> get applied, so wondering if it could be.
>>
>> Thanks,
>> Jon
>
> Hi Jon:
>
> Have you tested new driver with old qemu?
Yes, this testing scenario looks good. Thanks.
Jon
>
> Thanks
>
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next 1/5] virtio: Add support for SCTP checksum offloading
From: Michael S. Tsirkin @ 2018-04-11 22:49 UTC (permalink / raw)
To: Vladislav Yasevich; +Cc: nhorman, netdev, virtualization, linux-sctp
In-Reply-To: <20180402134006.10111-2-vyasevic@redhat.com>
On Mon, Apr 02, 2018 at 09:40:02AM -0400, Vladislav Yasevich wrote:
> To support SCTP checksum offloading, we need to add a new feature
> to virtio_net, so we can negotiate support between the hypervisor
> and the guest.
>
> The signalling to the guest that an alternate checksum needs to
> be used is done via a new flag in the virtio_net_hdr. If the
> flag is set, the host will know to perform an alternate checksum
> calculation, which right now is only CRC32c.
>
> Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
> ---
> drivers/net/virtio_net.c | 11 ++++++++---
> include/linux/virtio_net.h | 6 ++++++
> include/uapi/linux/virtio_net.h | 2 ++
> 3 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7b187ec..b601294 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2724,9 +2724,14 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Do we support "hardware" checksums? */
> if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
> /* This opens up the world of extra features. */
> - dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
> + netdev_features_t sctp = 0;
> +
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_SCTP_CSUM))
> + sctp |= NETIF_F_SCTP_CRC;
> +
> + dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
> if (csum)
> - dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
> + dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
>
> if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
> dev->hw_features |= NETIF_F_TSO
> @@ -2952,7 +2957,7 @@ static struct virtio_device_id id_table[] = {
> };
>
> #define VIRTNET_FEATURES \
> - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
> + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_SCTP_CSUM, \
> VIRTIO_NET_F_MAC, \
> VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
> VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> index f144216..2e7a64a 100644
> --- a/include/linux/virtio_net.h
> +++ b/include/linux/virtio_net.h
> @@ -39,6 +39,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
>
> if (!skb_partial_csum_set(skb, start, off))
> return -EINVAL;
> +
> + if (hdr->flags & VIRTIO_NET_HDR_F_CSUM_NOT_INET)
> + skb->csum_not_inet = 1;
> }
>
> if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> @@ -96,6 +99,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
> hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
> } /* else everything is zero */
>
> + if (skb->csum_not_inet)
> + hdr->flags &= VIRTIO_NET_HDR_F_CSUM_NOT_INET;
> +
> return 0;
> }
>
> diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
> index 5de6ed3..3f279c8 100644
> --- a/include/uapi/linux/virtio_net.h
> +++ b/include/uapi/linux/virtio_net.h
> @@ -36,6 +36,7 @@
> #define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */
> #define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS 2 /* Dynamic offload configuration. */
> #define VIRTIO_NET_F_MTU 3 /* Initial MTU advice */
> +#define VIRTIO_NET_F_SCTP_CSUM 4 /* SCTP checksum offload support */
> #define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */
> #define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */
> #define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */
Is this a guest or a host checksum? We should differenciate between the
two.
> @@ -101,6 +102,7 @@ struct virtio_net_config {
> struct virtio_net_hdr_v1 {
> #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */
> #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
> +#define VIRTIO_NET_HDR_F_CSUM_NOT_INET 4 /* Checksum is not inet */
> __u8 flags;
> #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
> #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
> --
> 2.9.5
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Samudrala, Sridhar @ 2018-04-11 19:13 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <20180411155127.GQ2028@nanopsycho>
On 4/11/2018 8:51 AM, Jiri Pirko wrote:
> Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>> This provides a generic interface for paravirtual drivers to listen
>> for netdev register/unregister/link change events from pci ethernet
>> devices with the same MAC and takeover their datapath. The notifier and
>> event handling code is based on the existing netvsc implementation.
>>
>> It exposes 2 sets of interfaces to the paravirtual drivers.
>> 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> master netdev is created. The paravirtual driver registers each bypass
>> instance along with a set of ops to manage the slave events.
>> bypass_master_register()
>> bypass_master_unregister()
>> 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> the bypass module provides interfaces to create/destroy additional master
>> netdev and all the slave events are managed internally.
>> bypass_master_create()
>> bypass_master_destroy()
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>> include/linux/netdevice.h | 14 +
>> include/net/bypass.h | 96 ++++++
>> net/Kconfig | 18 +
>> net/core/Makefile | 1 +
>> net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 973 insertions(+)
>> create mode 100644 include/net/bypass.h
>> create mode 100644 net/core/bypass.c
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index cf44503ea81a..587293728f70 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>> IFF_PHONY_HEADROOM = 1<<24,
>> IFF_MACSEC = 1<<25,
>> IFF_NO_RX_HANDLER = 1<<26,
>> + IFF_BYPASS = 1 << 27,
>> + IFF_BYPASS_SLAVE = 1 << 28,
> I wonder, why you don't follow the existing coding style... Also, please
> add these to into the comment above.
To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
to the existing coding style to be consistent.
>
>
>> };
>>
>> #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
>> @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>> #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
>> #define IFF_MACSEC IFF_MACSEC
>> #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
>> +#define IFF_BYPASS IFF_BYPASS
>> +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
>>
>> /**
>> * struct net_device - The DEVICE structure.
>> @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>> return dev->priv_flags & IFF_RXFH_CONFIGURED;
>> }
>>
>> +static inline bool netif_is_bypass_master(const struct net_device *dev)
>> +{
>> + return dev->priv_flags & IFF_BYPASS;
>> +}
>> +
>> +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>> +{
>> + return dev->priv_flags & IFF_BYPASS_SLAVE;
>> +}
>> +
>> /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>> static inline void netif_keep_dst(struct net_device *dev)
>> {
>> diff --git a/include/net/bypass.h b/include/net/bypass.h
>> new file mode 100644
>> index 000000000000..86b02cb894cf
>> --- /dev/null
>> +++ b/include/net/bypass.h
>> @@ -0,0 +1,96 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +#ifndef _NET_BYPASS_H
>> +#define _NET_BYPASS_H
>> +
>> +#include <linux/netdevice.h>
>> +
>> +struct bypass_ops {
>> + int (*slave_pre_register)(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev);
>> + int (*slave_join)(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev);
>> + int (*slave_pre_unregister)(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev);
>> + int (*slave_release)(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev);
>> + int (*slave_link_change)(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev);
>> + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> +};
>> +
>> +struct bypass_master {
>> + struct list_head list;
>> + struct net_device __rcu *bypass_netdev;
>> + struct bypass_ops __rcu *ops;
>> +};
>> +
>> +/* bypass state */
>> +struct bypass_info {
>> + /* passthru netdev with same MAC */
>> + struct net_device __rcu *active_netdev;
> You still use "active"/"backup" names which is highly misleading as
> it has completely different meaning that in bond for example.
> I noted that in my previous review already. Please change it.
I guess the issue is with only the 'active' name. 'backup' should be fine as it also
matches with the BACKUP feature bit we are adding to virtio_net.
With regards to alternate names for 'active', you suggested 'stolen', but i
am not too happy with it.
netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
>
>
>> +
>> + /* virtio_net netdev */
>> + struct net_device __rcu *backup_netdev;
>> +
>> + /* active netdev stats */
>> + struct rtnl_link_stats64 active_stats;
>> +
>> + /* backup netdev stats */
>> + struct rtnl_link_stats64 backup_stats;
>> +
>> + /* aggregated stats */
>> + struct rtnl_link_stats64 bypass_stats;
>> +
>> + /* spinlock while updating stats */
>> + spinlock_t stats_lock;
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_NET_BYPASS)
>> +
>> +int bypass_master_create(struct net_device *backup_netdev,
>> + struct bypass_master **pbypass_master);
>> +void bypass_master_destroy(struct bypass_master *bypass_master);
>> +
>> +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> + struct bypass_master **pbypass_master);
>> +void bypass_master_unregister(struct bypass_master *bypass_master);
>> +
>> +int bypass_slave_unregister(struct net_device *slave_netdev);
>> +
>> +#else
>> +
>> +static inline
>> +int bypass_master_create(struct net_device *backup_netdev,
>> + struct bypass_master **pbypass_master);
>> +{
>> + return 0;
>> +}
>> +
>> +static inline
>> +void bypass_master_destroy(struct bypass_master *bypass_master)
>> +{
>> +}
>> +
>> +static inline
>> +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> + struct pbypass_master **pbypass_master);
>> +{
>> + return 0;
>> +}
>> +
>> +static inline
>> +void bypass_master_unregister(struct bypass_master *bypass_master)
>> +{
>> +}
>> +
>> +static inline
>> +int bypass_slave_unregister(struct net_device *slave_netdev)
>> +{
>> + return 0;
>> +}
>> +
>> +#endif
>> +
>> +#endif /* _NET_BYPASS_H */
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 0428f12c25c2..994445f4a96a 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>> on MAY_USE_DEVLINK to ensure they do not cause link errors when
>> devlink is a loadable module and the driver using it is built-in.
>>
>> +config NET_BYPASS
>> + tristate "Bypass interface"
>> + ---help---
>> + This provides a generic interface for paravirtual drivers to listen
>> + for netdev register/unregister/link change events from pci ethernet
>> + devices with the same MAC and takeover their datapath. This also
>> + enables live migration of a VM with direct attached VF by failing
>> + over to the paravirtual datapath when the VF is unplugged.
>> +
>> +config MAY_USE_BYPASS
>> + tristate
>> + default m if NET_BYPASS=m
>> + default y if NET_BYPASS=y || NET_BYPASS=n
>> + help
>> + Drivers using the bypass infrastructure should have a dependency
>> + on MAY_USE_BYPASS to ensure they do not cause link errors when
>> + bypass is a loadable module and the driver using it is built-in.
>> +
>> endif # if NET
>>
>> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> diff --git a/net/core/Makefile b/net/core/Makefile
>> index 6dbbba8c57ae..a9727ed1c8fc 100644
>> --- a/net/core/Makefile
>> +++ b/net/core/Makefile
>> @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>> obj-$(CONFIG_HWBM) += hwbm.o
>> obj-$(CONFIG_NET_DEVLINK) += devlink.o
>> obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> +obj-$(CONFIG_NET_BYPASS) += bypass.o
>> diff --git a/net/core/bypass.c b/net/core/bypass.c
>> new file mode 100644
>> index 000000000000..b5b9cb554c3f
>> --- /dev/null
>> +++ b/net/core/bypass.c
>> @@ -0,0 +1,844 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +/* A common module to handle registrations and notifications for paravirtual
>> + * drivers to enable accelerated datapath and support VF live migration.
>> + *
>> + * The notifier and event handling code is based on netvsc driver.
>> + */
>> +
>> +#include <linux/netdevice.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/ethtool.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>> +#include <linux/rtnetlink.h>
>> +#include <linux/if_vlan.h>
>> +#include <linux/pci.h>
>> +#include <net/sch_generic.h>
>> +#include <uapi/linux/if_arp.h>
>> +#include <net/bypass.h>
>> +
>> +static LIST_HEAD(bypass_master_list);
>> +static DEFINE_SPINLOCK(bypass_lock);
>> +
>> +static int bypass_slave_pre_register(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev,
>> + struct bypass_ops *bypass_ops)
>> +{
>> + struct bypass_info *bi;
>> + bool backup;
>> +
>> + if (bypass_ops) {
>> + if (!bypass_ops->slave_pre_register)
>> + return -EINVAL;
>> +
>> + return bypass_ops->slave_pre_register(slave_netdev,
>> + bypass_netdev);
>> + }
>> +
>> + bi = netdev_priv(bypass_netdev);
>> + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> + if (backup ? rtnl_dereference(bi->backup_netdev) :
>> + rtnl_dereference(bi->active_netdev)) {
>> + netdev_err(bypass_netdev, "%s attempting to register as slave dev when %s already present\n",
>> + slave_netdev->name, backup ? "backup" : "active");
>> + return -EEXIST;
>> + }
>> +
>> + /* Avoid non pci devices as active netdev */
>> + if (!backup && (!slave_netdev->dev.parent ||
>> + !dev_is_pci(slave_netdev->dev.parent)))
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> +static int bypass_slave_join(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev,
>> + struct bypass_ops *bypass_ops)
>> +{
>> + struct bypass_info *bi;
>> + bool backup;
>> +
>> + if (bypass_ops) {
>> + if (!bypass_ops->slave_join)
>> + return -EINVAL;
>> +
>> + return bypass_ops->slave_join(slave_netdev, bypass_netdev);
>> + }
>> +
>> + bi = netdev_priv(bypass_netdev);
>> + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> +
>> + dev_hold(slave_netdev);
>> +
>> + if (backup) {
>> + rcu_assign_pointer(bi->backup_netdev, slave_netdev);
>> + dev_get_stats(bi->backup_netdev, &bi->backup_stats);
>> + } else {
>> + rcu_assign_pointer(bi->active_netdev, slave_netdev);
>> + dev_get_stats(bi->active_netdev, &bi->active_stats);
>> + bypass_netdev->min_mtu = slave_netdev->min_mtu;
>> + bypass_netdev->max_mtu = slave_netdev->max_mtu;
>> + }
>> +
>> + netdev_info(bypass_netdev, "bypass slave:%s joined\n",
>> + slave_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +/* Called when slave dev is injecting data into network stack.
>> + * Change the associated network device from lower dev to virtio.
>> + * note: already called with rcu_read_lock
>> + */
>> +static rx_handler_result_t bypass_handle_frame(struct sk_buff **pskb)
>> +{
>> + struct sk_buff *skb = *pskb;
>> + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> +
>> + skb->dev = ndev;
>> +
>> + return RX_HANDLER_ANOTHER;
>> +}
>> +
>> +static struct net_device *bypass_master_get_bymac(u8 *mac,
>> + struct bypass_ops **ops)
>> +{
>> + struct bypass_master *bypass_master;
>> + struct net_device *bypass_netdev;
>> +
>> + spin_lock(&bypass_lock);
>> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
> As I wrote the last time, you don't need this list, spinlock.
> You can do just something like:
> for_each_net(net) {
> for_each_netdev(net, dev) {
> if (netif_is_bypass_master(dev)) {
This function returns the upper netdev as well as the ops associated
with that netdev.
bypass_master_list is a list of 'struct bypass_master' that associates
'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
We need 'ops' only to support the 2 netdev model of netvsc. ops will be
NULL for 3-netdev model.
>
>
>
>
>> + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> + *ops = rcu_dereference(bypass_master->ops);
> I don't see how rcu_dereference is ok here.
> 1) I don't see rcu_read_lock taken
> 2) Looks like bypass_master->ops has the same value across the whole
> existence.
We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
Yes. ops doesn't change.
>
>
>> + spin_unlock(&bypass_lock);
>> + return bypass_netdev;
>> + }
>> + }
>> + spin_unlock(&bypass_lock);
>> + return NULL;
>> +}
>> +
>> +static int bypass_slave_register(struct net_device *slave_netdev)
>> +{
>> + struct net_device *bypass_netdev;
>> + struct bypass_ops *bypass_ops;
>> + int ret, orig_mtu;
>> +
>> + ASSERT_RTNL();
>> +
>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> + &bypass_ops);
> For master, could you use word "master" in the variables so it is clear?
> Also, "dev" is fine instead of "netdev".
> Something like "bpmaster_dev"
bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
I can change all _netdev suffixes to _dev to make the names shorter.
>
>
>> + if (!bypass_netdev)
>> + goto done;
>> +
>> + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>> + bypass_ops);
>> + if (ret != 0)
> Just "if (ret)" will do. You have this on more places.
OK.
>
>
>> + goto done;
>> +
>> + ret = netdev_rx_handler_register(slave_netdev,
>> + bypass_ops ? bypass_ops->handle_frame :
>> + bypass_handle_frame, bypass_netdev);
>> + if (ret != 0) {
>> + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>> + ret);
>> + goto done;
>> + }
>> +
>> + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>> + if (ret != 0) {
>> + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>> + bypass_netdev->name, ret);
>> + goto upper_link_failed;
>> + }
>> +
>> + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>> +
>> + if (netif_running(bypass_netdev)) {
>> + ret = dev_open(slave_netdev);
>> + if (ret && (ret != -EBUSY)) {
>> + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>> + slave_netdev->name, ret);
>> + goto err_interface_up;
>> + }
>> + }
>> +
>> + /* Align MTU of slave with master */
>> + orig_mtu = slave_netdev->mtu;
>> + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>> + if (ret != 0) {
>> + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>> + slave_netdev->name, bypass_netdev->mtu);
>> + goto err_set_mtu;
>> + }
>> +
>> + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>> + if (ret != 0)
>> + goto err_join;
>> +
>> + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>> +
>> + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>> + slave_netdev->name);
>> +
>> + goto done;
>> +
>> +err_join:
>> + dev_set_mtu(slave_netdev, orig_mtu);
>> +err_set_mtu:
>> + dev_close(slave_netdev);
>> +err_interface_up:
>> + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> +upper_link_failed:
>> + netdev_rx_handler_unregister(slave_netdev);
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +
>> +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev,
>> + struct bypass_ops *bypass_ops)
>> +{
>> + struct net_device *backup_netdev, *active_netdev;
>> + struct bypass_info *bi;
>> +
>> + if (bypass_ops) {
>> + if (!bypass_ops->slave_pre_unregister)
>> + return -EINVAL;
>> +
>> + return bypass_ops->slave_pre_unregister(slave_netdev,
>> + bypass_netdev);
>> + }
>> +
>> + bi = netdev_priv(bypass_netdev);
>> + active_netdev = rtnl_dereference(bi->active_netdev);
>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> +
>> + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> +static int bypass_slave_release(struct net_device *slave_netdev,
>> + struct net_device *bypass_netdev,
>> + struct bypass_ops *bypass_ops)
>> +{
>> + struct net_device *backup_netdev, *active_netdev;
>> + struct bypass_info *bi;
>> +
>> + if (bypass_ops) {
>> + if (!bypass_ops->slave_release)
>> + return -EINVAL;
> I think it would be good to make the API to the driver more strict and
> have a separate set of ops for "active" and "backup" netdevices.
> That should stop people thinking about extending this to more slaves in
> the future.
We have checks in slave_pre_register() that allows only 1 'backup' and 1
'active' slave.
>
>
>
>> +
>> + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>> + }
>> +
>> + bi = netdev_priv(bypass_netdev);
>> + active_netdev = rtnl_dereference(bi->active_netdev);
>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> +
>> + if (slave_netdev == backup_netdev) {
>> + RCU_INIT_POINTER(bi->backup_netdev, NULL);
>> + } else {
>> + RCU_INIT_POINTER(bi->active_netdev, NULL);
>> + if (backup_netdev) {
>> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> + }
>> + }
>> +
>> + dev_put(slave_netdev);
>> +
>> + netdev_info(bypass_netdev, "bypass slave:%s released\n",
>> + slave_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +int bypass_slave_unregister(struct net_device *slave_netdev)
>> +{
>> + struct net_device *bypass_netdev;
>> + struct bypass_ops *bypass_ops;
>> + int ret;
>> +
>> + if (!netif_is_bypass_slave(slave_netdev))
>> + goto done;
>> +
>> + ASSERT_RTNL();
>> +
>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> + &bypass_ops);
>> + if (!bypass_netdev)
>> + goto done;
>> +
>> + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>> + bypass_ops);
>> + if (ret != 0)
>> + goto done;
>> +
>> + netdev_rx_handler_unregister(slave_netdev);
>> + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> +
>> + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>> +
>> + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>> + slave_netdev->name);
>> +
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>> +
>> +static bool bypass_xmit_ready(struct net_device *dev)
>> +{
>> + return netif_running(dev) && netif_carrier_ok(dev);
>> +}
>> +
>> +static int bypass_slave_link_change(struct net_device *slave_netdev)
>> +{
>> + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>> + struct bypass_ops *bypass_ops;
>> + struct bypass_info *bi;
>> +
>> + if (!netif_is_bypass_slave(slave_netdev))
>> + goto done;
>> +
>> + ASSERT_RTNL();
>> +
>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> + &bypass_ops);
>> + if (!bypass_netdev)
>> + goto done;
>> +
>> + if (bypass_ops) {
>> + if (!bypass_ops->slave_link_change)
>> + goto done;
>> +
>> + return bypass_ops->slave_link_change(slave_netdev,
>> + bypass_netdev);
>> + }
>> +
>> + if (!netif_running(bypass_netdev))
>> + return 0;
>> +
>> + bi = netdev_priv(bypass_netdev);
>> +
>> + active_netdev = rtnl_dereference(bi->active_netdev);
>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> +
>> + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> + goto done;
> You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
> above is enough.
I think we need this check to not allow events from a slave that is not
attached to this master but has the same MAC.
>
>
>> +
>> + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>> + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>> + netif_carrier_on(bypass_netdev);
>> + netif_tx_wake_all_queues(bypass_netdev);
>> + } else {
>> + netif_carrier_off(bypass_netdev);
>> + netif_tx_stop_all_queues(bypass_netdev);
>> + }
>> +
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +
>> +static bool bypass_validate_event_dev(struct net_device *dev)
>> +{
>> + /* Skip parent events */
>> + if (netif_is_bypass_master(dev))
>> + return false;
>> +
>> + /* Avoid non-Ethernet type devices */
>> + if (dev->type != ARPHRD_ETHER)
>> + return false;
>> +
>> + /* Avoid Vlan dev with same MAC registering as VF */
>> + if (is_vlan_dev(dev))
>> + return false;
>> +
>> + /* Avoid Bonding master dev with same MAC registering as slave dev */
>> + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
> Yeah, this is certainly incorrect. One thing is, you should be using the
> helpers netif_is_bond_master().
> But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>
> You need to do it not by blacklisting, but with whitelisting. You need
> to whitelist VF devices. My port flavours patchset might help with this.
May be i can use netdev_has_lower_dev() helper to make sure that the slave
device is not an upper dev.
Can you point to your port flavours patchset? Is it upstream?
>
>
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> +static int
>> +bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>> +{
>> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> +
>> + if (!bypass_validate_event_dev(event_dev))
>> + return NOTIFY_DONE;
>> +
>> + switch (event) {
>> + case NETDEV_REGISTER:
>> + return bypass_slave_register(event_dev);
>> + case NETDEV_UNREGISTER:
>> + return bypass_slave_unregister(event_dev);
>> + case NETDEV_UP:
>> + case NETDEV_DOWN:
>> + case NETDEV_CHANGE:
>> + return bypass_slave_link_change(event_dev);
>> + default:
>> + return NOTIFY_DONE;
>> + }
>> +}
>> +
>> +static struct notifier_block bypass_notifier = {
>> + .notifier_call = bypass_event,
>> +};
>> +
>> +int bypass_open(struct net_device *dev)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
>> + struct net_device *active_netdev, *backup_netdev;
>> + int err;
>> +
>> + netif_carrier_off(dev);
>> + netif_tx_wake_all_queues(dev);
>> +
>> + active_netdev = rtnl_dereference(bi->active_netdev);
>> + if (active_netdev) {
>> + err = dev_open(active_netdev);
>> + if (err)
>> + goto err_active_open;
>> + }
>> +
>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> + if (backup_netdev) {
>> + err = dev_open(backup_netdev);
>> + if (err)
>> + goto err_backup_open;
>> + }
>> +
>> + return 0;
>> +
>> +err_backup_open:
>> + dev_close(active_netdev);
>> +err_active_open:
>> + netif_tx_disable(dev);
>> + return err;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_open);
>> +
>> +int bypass_close(struct net_device *dev)
>> +{
>> + struct bypass_info *vi = netdev_priv(dev);
> This should be probably "bi"
Yes.
>
>
>> + struct net_device *slave_netdev;
>> +
>> + netif_tx_disable(dev);
>> +
>> + slave_netdev = rtnl_dereference(vi->active_netdev);
>> + if (slave_netdev)
>> + dev_close(slave_netdev);
>> +
>> + slave_netdev = rtnl_dereference(vi->backup_netdev);
>> + if (slave_netdev)
>> + dev_close(slave_netdev);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_close);
>> +
>> +static netdev_tx_t bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> + atomic_long_inc(&dev->tx_dropped);
>> + dev_kfree_skb_any(skb);
>> + return NETDEV_TX_OK;
>> +}
>> +
>> +netdev_tx_t bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
> If you rename the other variable to "bpmaster_dev", it would be nice to
> rename this to bpinfo or something more descriptive. "bi" is too short
> to know what that is right away.
Will rename bypass_netdev to bypass_dev. bypass indicates that it is
an upper master dev.
>
>
>> + struct net_device *xmit_dev;
> Don't mix "dev" and "netdev" in one .c file. Just use "dev" for all.
OK.
>
>
>
>> +
>> + /* Try xmit via active netdev followed by backup netdev */
>> + xmit_dev = rcu_dereference_bh(bi->active_netdev);
>> + if (!xmit_dev || !bypass_xmit_ready(xmit_dev)) {
>> + xmit_dev = rcu_dereference_bh(bi->backup_netdev);
>> + if (!xmit_dev || !bypass_xmit_ready(xmit_dev))
>> + return bypass_drop_xmit(skb, dev);
>> + }
>> +
>> + skb->dev = xmit_dev;
>> + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
>> +
>> + return dev_queue_xmit(skb);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_start_xmit);
>> +
>> +u16 bypass_select_queue(struct net_device *dev, struct sk_buff *skb,
>> + void *accel_priv, select_queue_fallback_t fallback)
>> +{
>> + /* This helper function exists to help dev_pick_tx get the correct
>> + * destination queue. Using a helper function skips a call to
>> + * skb_tx_hash and will put the skbs in the queue we expect on their
>> + * way down to the bonding driver.
>> + */
>> + u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
>> +
>> + /* Save the original txq to restore before passing to the driver */
>> + qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>> +
>> + if (unlikely(txq >= dev->real_num_tx_queues)) {
>> + do {
>> + txq -= dev->real_num_tx_queues;
>> + } while (txq >= dev->real_num_tx_queues);
>> + }
>> +
>> + return txq;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_select_queue);
>> +
>> +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
>> + * that some drivers can provide 32bit values only.
>> + */
>> +static void bypass_fold_stats(struct rtnl_link_stats64 *_res,
>> + const struct rtnl_link_stats64 *_new,
>> + const struct rtnl_link_stats64 *_old)
>> +{
>> + const u64 *new = (const u64 *)_new;
>> + const u64 *old = (const u64 *)_old;
>> + u64 *res = (u64 *)_res;
>> + int i;
>> +
>> + for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
>> + u64 nv = new[i];
>> + u64 ov = old[i];
>> + s64 delta = nv - ov;
>> +
>> + /* detects if this particular field is 32bit only */
>> + if (((nv | ov) >> 32) == 0)
>> + delta = (s64)(s32)((u32)nv - (u32)ov);
>> +
>> + /* filter anomalies, some drivers reset their stats
>> + * at down/up events.
>> + */
>> + if (delta > 0)
>> + res[i] += delta;
>> + }
>> +}
>> +
>> +void bypass_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
> You can WARN_ON and return in case the dev is not bypass master, just
> to catch buggy drivers. Same with other helpers.
I can make this static and not export this helper as well as all
bypass_netdev ops.
>
>
>> + const struct rtnl_link_stats64 *new;
>> + struct rtnl_link_stats64 temp;
>> + struct net_device *slave_netdev;
>> +
>> + spin_lock(&bi->stats_lock);
>> + memcpy(stats, &bi->bypass_stats, sizeof(*stats));
>> +
>> + rcu_read_lock();
>> +
>> + slave_netdev = rcu_dereference(bi->active_netdev);
>> + if (slave_netdev) {
>> + new = dev_get_stats(slave_netdev, &temp);
>> + bypass_fold_stats(stats, new, &bi->active_stats);
>> + memcpy(&bi->active_stats, new, sizeof(*new));
>> + }
>> +
>> + slave_netdev = rcu_dereference(bi->backup_netdev);
>> + if (slave_netdev) {
>> + new = dev_get_stats(slave_netdev, &temp);
>> + bypass_fold_stats(stats, new, &bi->backup_stats);
>> + memcpy(&bi->backup_stats, new, sizeof(*new));
>> + }
>> +
>> + rcu_read_unlock();
>> +
>> + memcpy(&bi->bypass_stats, stats, sizeof(*stats));
>> + spin_unlock(&bi->stats_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_get_stats);
>> +
>> +int bypass_change_mtu(struct net_device *dev, int new_mtu)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
>> + struct net_device *active_netdev, *backup_netdev;
>> + int ret = 0;
> Pointless initialization.
>
>
>> +
>> + active_netdev = rcu_dereference(bi->active_netdev);
>> + if (active_netdev) {
>> + ret = dev_set_mtu(active_netdev, new_mtu);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + backup_netdev = rcu_dereference(bi->backup_netdev);
>> + if (backup_netdev) {
>> + ret = dev_set_mtu(backup_netdev, new_mtu);
>> + if (ret) {
>> + dev_set_mtu(active_netdev, dev->mtu);
>> + return ret;
>> + }
>> + }
>> +
>> + dev->mtu = new_mtu;
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_change_mtu);
>> +
>> +void bypass_set_rx_mode(struct net_device *dev)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
>> + struct net_device *slave_netdev;
>> +
>> + rcu_read_lock();
>> +
>> + slave_netdev = rcu_dereference(bi->active_netdev);
>> + if (slave_netdev) {
>> + dev_uc_sync_multiple(slave_netdev, dev);
>> + dev_mc_sync_multiple(slave_netdev, dev);
>> + }
>> +
>> + slave_netdev = rcu_dereference(bi->backup_netdev);
>> + if (slave_netdev) {
>> + dev_uc_sync_multiple(slave_netdev, dev);
>> + dev_mc_sync_multiple(slave_netdev, dev);
>> + }
>> +
>> + rcu_read_unlock();
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_set_rx_mode);
>> +
>> +static const struct net_device_ops bypass_netdev_ops = {
>> + .ndo_open = bypass_open,
>> + .ndo_stop = bypass_close,
>> + .ndo_start_xmit = bypass_start_xmit,
>> + .ndo_select_queue = bypass_select_queue,
>> + .ndo_get_stats64 = bypass_get_stats,
>> + .ndo_change_mtu = bypass_change_mtu,
>> + .ndo_set_rx_mode = bypass_set_rx_mode,
>> + .ndo_validate_addr = eth_validate_addr,
>> + .ndo_features_check = passthru_features_check,
>> +};
>> +
>> +#define BYPASS_DRV_NAME "bypass"
>> +#define BYPASS_DRV_VERSION "0.1"
>> +
>> +static void bypass_ethtool_get_drvinfo(struct net_device *dev,
>> + struct ethtool_drvinfo *drvinfo)
>> +{
>> + strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>> + strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>> +}
>> +
>> +int bypass_ethtool_get_link_ksettings(struct net_device *dev,
>> + struct ethtool_link_ksettings *cmd)
>> +{
>> + struct bypass_info *bi = netdev_priv(dev);
>> + struct net_device *slave_netdev;
>> +
>> + slave_netdev = rtnl_dereference(bi->active_netdev);
>> + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> + slave_netdev = rtnl_dereference(bi->backup_netdev);
>> + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> + cmd->base.duplex = DUPLEX_UNKNOWN;
>> + cmd->base.port = PORT_OTHER;
>> + cmd->base.speed = SPEED_UNKNOWN;
>> +
>> + return 0;
>> + }
>> + }
>> +
>> + return __ethtool_get_link_ksettings(slave_netdev, cmd);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_ethtool_get_link_ksettings);
>> +
>> +static const struct ethtool_ops bypass_ethtool_ops = {
>> + .get_drvinfo = bypass_ethtool_get_drvinfo,
>> + .get_link = ethtool_op_get_link,
>> + .get_link_ksettings = bypass_ethtool_get_link_ksettings,
>> +};
>> +
>> +static void bypass_register_existing_slave(struct net_device *bypass_netdev)
>> +{
>> + struct net *net = dev_net(bypass_netdev);
>> + struct net_device *dev;
>> +
>> + rtnl_lock();
>> + for_each_netdev(net, dev) {
>> + if (dev == bypass_netdev)
>> + continue;
>> + if (!bypass_validate_event_dev(dev))
>> + continue;
>> + if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>> + bypass_slave_register(dev);
>> + }
>> + rtnl_unlock();
>> +}
>> +
>> +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> + struct bypass_master **pbypass_master)
>> +{
>> + struct bypass_master *bypass_master;
>> +
>> + bypass_master = kzalloc(sizeof(*bypass_master), GFP_KERNEL);
>> + if (!bypass_master)
>> + return -ENOMEM;
>> +
>> + rcu_assign_pointer(bypass_master->ops, ops);
>> + dev_hold(dev);
>> + dev->priv_flags |= IFF_BYPASS;
>> + rcu_assign_pointer(bypass_master->bypass_netdev, dev);
>> +
>> + spin_lock(&bypass_lock);
>> + list_add_tail(&bypass_master->list, &bypass_master_list);
>> + spin_unlock(&bypass_lock);
>> +
>> + bypass_register_existing_slave(dev);
>> +
>> + *pbypass_master = bypass_master;
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_master_register);
>> +
>> +void bypass_master_unregister(struct bypass_master *bypass_master)
>> +{
>> + struct net_device *bypass_netdev;
>> +
>> + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> +
>> + bypass_netdev->priv_flags &= ~IFF_BYPASS;
>> + dev_put(bypass_netdev);
>> +
>> + spin_lock(&bypass_lock);
>> + list_del(&bypass_master->list);
>> + spin_unlock(&bypass_lock);
>> +
>> + kfree(bypass_master);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_master_unregister);
>> +
>> +int bypass_master_create(struct net_device *backup_netdev,
>> + struct bypass_master **pbypass_master)
>> +{
>> + struct device *dev = backup_netdev->dev.parent;
>> + struct net_device *bypass_netdev;
>> + int err;
>> +
>> + /* Alloc at least 2 queues, for now we are going with 16 assuming
>> + * that most devices being bonded won't have too many queues.
>> + */
>> + bypass_netdev = alloc_etherdev_mq(sizeof(struct bypass_info), 16);
>> + if (!bypass_netdev) {
>> + dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> + return -ENOMEM;
>> + }
>> +
>> + dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> + SET_NETDEV_DEV(bypass_netdev, dev);
>> +
>> + bypass_netdev->netdev_ops = &bypass_netdev_ops;
>> + bypass_netdev->ethtool_ops = &bypass_ethtool_ops;
>> +
>> + /* Initialize the device options */
>> + bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> + bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> + IFF_TX_SKB_SHARING);
>> +
>> + /* don't acquire bypass netdev's netif_tx_lock when transmitting */
>> + bypass_netdev->features |= NETIF_F_LLTX;
>> +
>> + /* Don't allow bypass devices to change network namespaces. */
>> + bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>> +
>> + bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> + NETIF_F_HIGHDMA | NETIF_F_LRO;
>> +
>> + bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> + bypass_netdev->features |= bypass_netdev->hw_features;
>> +
>> + memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>> + bypass_netdev->addr_len);
>> +
>> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> +
>> + err = register_netdev(bypass_netdev);
>> + if (err < 0) {
>> + dev_err(dev, "Unable to register bypass_netdev!\n");
>> + goto err_register_netdev;
>> + }
>> +
>> + netif_carrier_off(bypass_netdev);
>> +
>> + err = bypass_master_register(bypass_netdev, NULL, pbypass_master);
>> + if (err < 0)
> just "if (err)" would do.
OK
>
>
>> + goto err_bypass;
>> +
>> + return 0;
>> +
>> +err_bypass:
>> + unregister_netdev(bypass_netdev);
>> +err_register_netdev:
>> + free_netdev(bypass_netdev);
>> +
>> + return err;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_master_create);
>> +
>> +void bypass_master_destroy(struct bypass_master *bypass_master)
>> +{
>> + struct net_device *bypass_netdev;
>> + struct net_device *slave_netdev;
>> + struct bypass_info *bi;
>> +
>> + if (!bypass_master)
>> + return;
>> +
>> + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> + bi = netdev_priv(bypass_netdev);
>> +
>> + netif_device_detach(bypass_netdev);
>> +
>> + rtnl_lock();
>> +
>> + slave_netdev = rtnl_dereference(bi->active_netdev);
>> + if (slave_netdev)
>> + bypass_slave_unregister(slave_netdev);
>> +
>> + slave_netdev = rtnl_dereference(bi->backup_netdev);
>> + if (slave_netdev)
>> + bypass_slave_unregister(slave_netdev);
>> +
>> + bypass_master_unregister(bypass_master);
>> +
>> + unregister_netdevice(bypass_netdev);
>> +
>> + rtnl_unlock();
>> +
>> + free_netdev(bypass_netdev);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_master_destroy);
>> +
>> +static __init int
>> +bypass_init(void)
>> +{
>> + register_netdevice_notifier(&bypass_notifier);
>> +
>> + return 0;
>> +}
>> +module_init(bypass_init);
>> +
>> +static __exit
>> +void bypass_exit(void)
>> +{
>> + unregister_netdevice_notifier(&bypass_notifier);
>> +}
>> +module_exit(bypass_exit);
>> +
>> +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>> +MODULE_LICENSE("GPL v2");
>> --
>> 2.14.3
>>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PULL] virtio: feature pull
From: Michael S. Tsirkin @ 2018-04-11 19:10 UTC (permalink / raw)
To: Linus Torvalds; +Cc: kvm, netdev, mst, linux-kernel, virtualization
The following changes since commit dc32bb678e103afbcfa4d814489af0566307f528:
vhost: add vsock compat ioctl (2018-03-20 03:17:42 +0200)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to 6c64fe7f2adcee21d7c4247f1ec021fd18428fc4:
virtio_balloon: export hugetlb page allocation counts (2018-04-10 21:23:55 +0300)
----------------------------------------------------------------
virtio: feature
This adds reporting hugepage stats to virtio-balloon.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
----------------------------------------------------------------
Jonathan Helman (1):
virtio_balloon: export hugetlb page allocation counts
drivers/virtio/virtio_balloon.c | 6 ++++++
include/uapi/linux/virtio_balloon.h | 4 +++-
2 files changed, 9 insertions(+), 1 deletion(-)
^ permalink raw reply
* [PULL] virtio: feature pull
From: Michael S. Tsirkin @ 2018-04-11 19:10 UTC (permalink / raw)
To: Linus Torvalds; +Cc: kvm, netdev, mst, linux-kernel, virtualization
The following changes since commit dc32bb678e103afbcfa4d814489af0566307f528:
vhost: add vsock compat ioctl (2018-03-20 03:17:42 +0200)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to 6c64fe7f2adcee21d7c4247f1ec021fd18428fc4:
virtio_balloon: export hugetlb page allocation counts (2018-04-10 21:23:55 +0300)
----------------------------------------------------------------
virtio: feature
This adds reporting hugepage stats to virtio-balloon.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
----------------------------------------------------------------
Jonathan Helman (1):
virtio_balloon: export hugetlb page allocation counts
drivers/virtio/virtio_balloon.c | 6 ++++++
include/uapi/linux/virtio_balloon.h | 4 +++-
2 files changed, 9 insertions(+), 1 deletion(-)
^ permalink raw reply
* Re: [PATCH 1/4] iommu: Add virtio-iommu driver
From: Jean-Philippe Brucker @ 2018-04-11 18:35 UTC (permalink / raw)
To: Tian, Kevin, Robin Murphy, iommu@lists.linux-foundation.org,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
virtio-dev@lists.oasis-open.org, kvmarm@lists.cs.columbia.edu
Cc: jayachandran.nair@cavium.com, tnowicki@caviumnetworks.com,
mst@redhat.com, Marc Zyngier, Will Deacon,
jintack@cs.columbia.edu, eric.auger.pro@gmail.com
In-Reply-To: <AADFC41AFE54684AB9EE6CBC0274A5D191090157@SHSMSX101.ccr.corp.intel.com>
On 23/03/18 08:27, Tian, Kevin wrote:
>>> The host kernel needs to have *some* MSI region in place before the
>>> guest can start configuring interrupts, otherwise it won't know what
>>> address to give to the underlying hardware. However, as soon as the host
>>> kernel has picked a region, host userspace needs to know that it can no
>>> longer use addresses in that region for DMA-able guest memory. It's a
>>> lot easier when the address is fixed in hardware and the host userspace
>>> will never be stupid enough to try and VFIO_IOMMU_DMA_MAP it, but in
>>> the
>>> more general case where MSI writes undergo IOMMU address translation
>>> so
>>> it's an arbitrary IOVA, this has the potential to conflict with stuff
>>> like guest memory hotplug.
>>>
>>> What we currently have is just the simplest option, with the host kernel
>>> just picking something up-front and pretending to host userspace that
>>> it's a fixed hardware address. There's certainly scope for it to be a
>>> bit more dynamic in the sense of adding an interface to let userspace
>>> move it around (before attaching any devices, at least), but I don't
>>> think it's feasible for the host kernel to second-guess userspace enough
>>> to make it entirely transparent like it is in the DMA API domain case.
>>>
>>> Of course, that's all assuming the host itself is using a virtio-iommu
>>> (e.g. in a nested virt or emulation scenario). When it's purely within a
>>> guest then an MSI reservation shouldn't matter so much, since the guest
>>> won't be anywhere near the real hardware configuration anyway.
>>>
>>> Robin.
>>
>> Curious since anyway we are defining a new iommu architecture
>> is it possible to avoid those ARM-specific burden completely?
>>
>
> OK, after some study around those tricks below is my learning:
>
> - MSI_IOVA window is used only on request (iommu_dma_get
> _msi_page), not meant to take effect on all architectures once
> initialized. e.g. ARM GIC does it but not x86. So it is reasonable
> for virtio-iommu driver to implement such capability;
>
> - I thought whether hardware MSI doorbell can be always reported
> on virtio-iommu since it's newly defined. Looks there is a problem
> if underlying IOMMU is sw-managed MSI style - valid mapping is
> expected in all level of translations, meaning guest has to manage
> stage-1 mapping in nested configuration since stage-1 is owned
> by guest.
>
> Then virtio-iommu is naturally expected to report the same MSI
> model as supported by underlying hardware. Below are some
> further thoughts along this route (use 'IOMMU' to represent the
> physical one and 'virtio-iommu' for virtual one):
>
> ----
>
> In the scope of current virtio-iommu spec v.6, there is no nested
> consideration yet. Guest driver is expected to use MAP/UNMAP
> interface on assigned endpoints. In this case the MAP requests
> (IOVA->GPA) is caught and maintained within Qemu which then
> further talks to VFIO to map IOVA->HPA in IOMMU.
>
> Qemu can learn the MSI model of IOMMU from sysfs.
>
> For hardware MSI doorbell (x86 and some ARM):
> * Host kernel reports to Qemu as IOMMU_RESV_MSI
> * Qemu report to guest as VIRTIO_IOMMU_RESV_MEM_T_MSI
> * Guest takes the range as IOMMU_RESV_MSI. reserved
> * Qemu MAP database has no mapping for the doorbell
> * Physical IOMMU page table has no mapping for the doorbell
> * MSI from passthrough device bypass IOMMU
> * MSI from emulated device bypass virtio-iommu
>
> For software MSI doorbell (most ARM):
> * Host kernel reports to Qemu as IOMMU_RESV_SW_MSI
> * Qemu report to guest as VIRTIO_IOMMU_RESV_MEM_T_RESERVED
> * Guest takes the range as IOMMU_RESV_RESERVED
> * vGIC requests to map 'GPA of the virtual doorbell'
> * a map request (IOVA->GPA) sent on endpoint
> * Qemu maintains the mapping in MAP database
> * but no VFIO_MAP request since it's purely virtual
> * GIC requests to map 'HPA of the physical doorbell'
> * e.g. triggered by VFIO enable msi
> * IOMMU now includes a valid mapping (IOVA->HPA)
> * MSI from emulated device go through Qemu MAP
> database (IOVA->'GPA of virtual doorbell') and then hit vGIC
> * MSI from passthrough device go through IOMMU
> (IOVA->'HPA of physical doorbell') and then hit GIC
>
> In this case, host doorbell is treated as reserved resource in
> guest side. Guest has its own sw-management for virtual
> doorbell which is only used for emulated device. two paths
> are completely separated.
>
> If above captures the right flow, current v0.6 spec is complete
> regarding to required function definition.
Yes I think this summarizes well the current state or SW/HW MSI
> Then comes nested case, with two level page tables (stage-1
> and stage-2) in IOMMU. stage-1 is for IOVA->GPA and stage-2
> is for GPA->HPA. VFIO map/unmap happens on stage-2,
> while stage-1 is directly managed by guest (and bound to
> IOMMU which enables nested translation from IOVA->GPA
> ->HPA).
>
> For hardware MSI, there is nothing special compared to
> previous requirement. Both host/guest treat the doorbell
> as reserved and guarantee no mapping in either stage-1 or
> stage-2.
>
> For software MSI, more consideration is required:
>
> * for emulated device it is just fine as long as guest keeps
> IOVA->'GPA of virtual doorbell' in stage-1. Qemu is expected
> to walk stage-1 page table upon MSI request from emulated
> device to hit vGIC;
>
> * for passthrough device however there is a problem. We
> need valid mapping in both stage-1 and stage-2, while host
> kernel is only responsible for stage-2:
>
> 1) if we expect to keep same isolation policy (i.e.
> host MSI fully managed by host kernel), then an identity
> mapping for host-reported MSI range is expected in stage-1.
> In such case we need a new type VIRTIO_IOMMU_RESV_
> MEM_T_DIRECT to teach guest setup identity mapping.
> it should be the right thing to add since anyway there might
> be true IOMMU_RESV_DIRECT range reported from host
> which also should be handled.
>
> 2) Alternatively we could instead allow Qemu to
> request dynamic change of physical doorbell mapping in
> stage2, e.g. from GPA of virtual doorbell to HPA of physical
> doorbell. But it doesn't like a good design - VFIO doesn't
> assign interrupt controller to user space then why should
> VFIO allow user mapping to doorbell...
>
> if 1) is agreed, looks the missing part in spec is just VIRTIO_
> IOMMU_RESV_MEM_T_DIRECT, though the whole story
> is lengthy and fully enabling nested require many other
> works. :-)
This is a great write-up, thanks. As said on the v0.6 thread [1], I also
prefer 1), because it doesn't require any additional interface in the
host kernel, and it doesn't force host userspace to guess which doorbell
address the guest is writing into the MSI-X table.
Thanks,
Jean
[1] https://www.mail-archive.com/virtualization@lists.linux-foundation.org/msg30104.html
^ permalink raw reply
* Re: [PATCH 2/4] iommu/virtio: Add probe request
From: Jean-Philippe Brucker @ 2018-04-11 18:33 UTC (permalink / raw)
To: Robin Murphy, iommu@lists.linux-foundation.org,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
virtio-dev@lists.oasis-open.org, kvmarm@lists.cs.columbia.edu
Cc: jayachandran.nair@cavium.com, Lorenzo Pieralisi,
tnowicki@caviumnetworks.com, mst@redhat.com, Marc Zyngier,
Will Deacon, jintack@cs.columbia.edu, eric.auger@redhat.com,
joro@8bytes.org, eric.auger.pro@gmail.com
In-Reply-To: <8156517f-74b1-2923-2838-402f09a5c488@arm.com>
On 23/03/18 15:00, Robin Murphy wrote:
[...]
>> + /*
>> + * Treat unknown subtype as RESERVED, but urge users to update their
>> + * driver.
>> + */
>> + if (mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
>> + mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_MSI)
>> + pr_warn("unknown resv mem subtype 0x%x\n", mem->subtype);
>
> Might as well avoid the extra comparisons by incorporating this into the
> switch statement, i.e.:
>
> default:
> dev_warn(vdev->viommu_dev->dev, ...);
> /* Fallthrough */
> case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
> ...
>
> (dev_warn is generally preferable to pr_warn when feasible)
Alright, that's nicer
[...]
>> /*
>> * Last step creates a default domain and attaches to it. Everything
>> * must be ready.
>> @@ -735,7 +849,19 @@ static int viommu_add_device(struct device *dev)
>>
>> static void viommu_remove_device(struct device *dev)
>> {
>> - kfree(dev->iommu_fwspec->iommu_priv);
>> + struct viommu_endpoint *vdev;
>> + struct iommu_resv_region *entry, *next;
>> + struct iommu_fwspec *fwspec = dev->iommu_fwspec;
>> +
>> + if (!fwspec || fwspec->ops != &viommu_ops)
>> + return;
>
> Oh good :) I guess that was just a patch-splitting issue. The group
> thing still applies, though.
Ok
Thanks,
Jean
^ permalink raw reply
* Re: [PATCH 1/4] iommu: Add virtio-iommu driver
From: Jean-Philippe Brucker @ 2018-04-11 18:33 UTC (permalink / raw)
To: Robin Murphy, iommu@lists.linux-foundation.org,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
virtio-dev@lists.oasis-open.org, kvmarm@lists.cs.columbia.edu
Cc: jayachandran.nair@cavium.com, Lorenzo Pieralisi, Tomasz.Nowicki,
tnowicki@caviumnetworks.com, mst@redhat.com, Marc Zyngier,
Will Deacon, jintack@cs.columbia.edu, eric.auger@redhat.com,
joro@8bytes.org, eric.auger.pro@gmail.com
In-Reply-To: <d0cfe602-f6e8-2d6e-0942-23012f3facef@arm.com>
On 23/03/18 14:48, Robin Murphy wrote:
[..]
>> + * Virtio driver for the paravirtualized IOMMU
>> + *
>> + * Copyright (C) 2018 ARM Limited
>> + * Author: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>> + *
>> + * SPDX-License-Identifier: GPL-2.0
>
> This wants to be a // comment at the very top of the file (thankfully
> the policy is now properly documented in-tree since
> Documentation/process/license-rules.rst got merged)
Ok
[...]
>> +
>> +/*
>> + * viommu_del_mappings - remove mappings from the internal tree
>> + *
>> + * @vdomain: the domain
>> + * @iova: start of the range
>> + * @size: size of the range. A size of 0 corresponds to the entire address
>> + * space.
>> + * @out_mapping: if not NULL, the first removed mapping is returned in there.
>> + * This allows the caller to reuse the buffer for the unmap request. When
>> + * the returned size is greater than zero, if a mapping is returned, the
>> + * caller must free it.
>
> This "free multiple mappings except maybe hand one of them off to the
> caller" interface is really unintuitive. AFAICS it's only used by
> viommu_unmap() to grab mapping->req, but that doesn't seem to care about
> mapping itself, so I wonder whether it wouldn't make more sense to just
> have a global kmem_cache of struct virtio_iommu_req_unmap for that and
> avoid a lot of complexity...
Well it's a small complication for what I hoped would be a meanignful
performance difference, but more below.
>> +
>> +/* IOMMU API */
>> +
>> +static bool viommu_capable(enum iommu_cap cap)
>> +{
>> + return false;
>> +}
>
> The .capable callback is optional, so it's only worth implementing once
> you want it to do something beyond the default behaviour.
>
Ah, right
[...]
>> +static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
>> + size_t size)
>> +{
>> + int ret = 0;
>> + size_t unmapped;
>> + struct viommu_mapping *mapping = NULL;
>> + struct viommu_domain *vdomain = to_viommu_domain(domain);
>> +
>> + unmapped = viommu_del_mappings(vdomain, iova, size, &mapping);
>> + if (unmapped < size) {
>> + ret = -EINVAL;
>> + goto out_free;
>> + }
>> +
>> + /* Device already removed all mappings after detach. */
>> + if (!vdomain->endpoints)
>> + goto out_free;
>> +
>> + if (WARN_ON(!mapping))
>> + return 0;
>> +
>> + mapping->req.unmap = (struct virtio_iommu_req_unmap) {
>> + .head.type = VIRTIO_IOMMU_T_UNMAP,
>> + .domain = cpu_to_le32(vdomain->id),
>> + .virt_start = cpu_to_le64(iova),
>> + .virt_end = cpu_to_le64(iova + unmapped - 1),
>> + };
>
> ...In fact, the kmem_cache idea might be moot since it looks like with a
> bit of restructuring you could get away with just a single per-viommu
> virtio_iommu_req_unmap structure; this lot could be passed around on the
> stack until request_lock is taken, at which point it would be copied
> into the 'real' DMA-able structure. The equivalent might apply to
> viommu_map() too - now that I'm looking at it, it seems like it would
> take pretty minimal effort to encapsulate the whole business cleanly in
> viommu_send_req_sync(), which could do something like this instead of
> going through viommu_send_reqs_sync():
>
> ...
> spin_lock_irqsave(&viommu->request_lock, flags);
> viommu_copy_req(viommu->dma_req, req);
> ret = _viommu_send_reqs_sync(viommu, viommu->dma_req, 1, &sent);
> spin_unlock_irqrestore(&viommu->request_lock, flags);
> ...
I'll have to come back to that sorry, still conducting some experiments
with map/unmap.
I'd rather avoid introducing a single dma_req per viommu, because I'd
like to move to the iotlb_range_add/iotlb_sync interface as soon as
possible, and the request logic changes a lot when multiple threads are
susceptible to interleave map/unmap requests.
I ran some tests, and adding a kmem_cache (or simply using kmemdup, it
doesn't make a noticeable difference at our scale) reduces netperf
stream/maerts throughput by 1.1%/1.4% (+/- 0.5% over 30 tests). That's
for a virtio-net device (1 tx/rx vq), and with a vfio device the
difference isn't measurable. At this point I'm not fussy about such
small difference, so don't mind simplifying viommu_del_mapping.
[...]
>> + /*
>> + * Last step creates a default domain and attaches to it. Everything
>> + * must be ready.
>> + */
>> + group = iommu_group_get_for_dev(dev);
>> + if (!IS_ERR(group))
>> + iommu_group_put(group);
>
> Since you create the sysfs IOMMU device, maybe also create the links for
> the masters?
Ok
>> +
>> + return PTR_ERR_OR_ZERO(group);
>> +}
>> +
>> +static void viommu_remove_device(struct device *dev)
>> +{
>
> You need to remove dev from its group, too (basically, .remove_device
> should always undo everything .add_device did)
>
> It would also be good practice to verify that dev->iommu_fwspec exists
> and is one of yours before touching anything, although having checked
> the core code I see we do currently just about get away with it thanks
> to the horrible per-bus ops.
Ok
>
>> + kfree(dev->iommu_fwspec->iommu_priv);
>> +}
>> +
>> +static struct iommu_group *viommu_device_group(struct device *dev)
>> +{
>> + if (dev_is_pci(dev))
>> + return pci_device_group(dev);
>> + else
>> + return generic_device_group(dev);
>> +}
>> +
>> +static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args)
>> +{
>> + return iommu_fwspec_add_ids(dev, args->args, 1);
>
> I'm sure a DT binding somewhere needs to document the appropriate value
> and meaning for #iommu-cells - I guess that probably falls under the
> virtio-mmio binding?
Yes I guess mmio.txt would be the best place for this.
[...]
>> +/*
>> + * Virtio-iommu definition v0.6
>> + *
>> + * Copyright (C) 2018 ARM Ltd.
>> + *
>> + * SPDX-License-Identifier: BSD-3-Clause
>
> Again, at the top, although in /* */ here since it's a header.
Right
Thanks for the review,
Jean
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox