Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
From: Michael S. Tsirkin @ 2017-09-27 22:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <17e9c3a9-7759-a674-bc00-414eabfed118@redhat.com>

On Wed, Sep 27, 2017 at 08:35:47AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
> > > This patch introduces vhost_prefetch_desc_indices() which could batch
> > > descriptor indices fetching and used ring updating. This intends to
> > > reduce the cache misses of indices fetching and updating and reduce
> > > cache line bounce when virtqueue is almost full. copy_to_user() was
> > > used in order to benefit from modern cpus that support fast string
> > > copy. Batched virtqueue processing will be the first user.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  3 +++
> > >   2 files changed, 58 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index f87ec75..8424166d 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update)
> > why do you need to combine used update with prefetch?
> 
> For better performance


Why is sticking a branch in there better than requesting the update
conditionally from the caller?



> and I believe we don't care about the overhead when
> we meet errors in tx.

That's a separate question, I do not really understand how
you can fetch a descriptor and update the used ring at the same
time. This allows the guest to overwrite the buffer.
I might be misunderstanding what is going on here though.


> > 
> > > +{
> > > +	int ret, ret2;
> > > +	u16 last_avail_idx, last_used_idx, total, copied;
> > > +	__virtio16 avail_idx;
> > > +	struct vring_used_elem __user *used;
> > > +	int i;
> > > +
> > > +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> > > +		vq_err(vq, "Failed to access avail idx at %p\n",
> > > +		       &vq->avail->idx);
> > > +		return -EFAULT;
> > > +	}
> > > +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> > > +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > > +	total = vq->avail_idx - vq->last_avail_idx;
> > > +	ret = total = min(total, num);
> > > +
> > > +	for (i = 0; i < ret; i++) {
> > > +		ret2 = vhost_get_avail(vq, heads[i].id,
> > > +				      &vq->avail->ring[last_avail_idx]);
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to get descriptors\n");
> > > +			return -EFAULT;
> > > +		}
> > > +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> > > +	}
> > > +
> > > +	if (!used_update)
> > > +		return ret;
> > > +
> > > +	last_used_idx = vq->last_used_idx & (vq->num - 1);
> > > +	while (total) {
> > > +		copied = min((u16)(vq->num - last_used_idx), total);
> > > +		ret2 = vhost_copy_to_user(vq,
> > > +					  &vq->used->ring[last_used_idx],
> > > +					  &heads[ret - total],
> > > +					  copied * sizeof(*used));
> > > +
> > > +		if (unlikely(ret2)) {
> > > +			vq_err(vq, "Failed to update used ring!\n");
> > > +			return -EFAULT;
> > > +		}
> > > +
> > > +		last_used_idx = 0;
> > > +		total -= copied;
> > > +	}
> > > +
> > > +	/* Only get avail ring entries after they have been exposed by guest. */
> > > +	smp_rmb();
> > Barrier before return is a very confusing API. I guess it's designed to
> > be used in a specific way to make it necessary - but what is it?
> 
> Looks like a and we need do this after reading avail_idx.
> 
> Thanks
> 
> > 
> > 
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
> > >   static int __init vhost_init(void)
> > >   {
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 39ff897..16c2cb6 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
> > >   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
> > >   			     struct iov_iter *from);
> > >   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
> > > +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> > > +				struct vring_used_elem *heads,
> > > +				u16 num, bool used_update);
> > >   #define vq_err(vq, fmt, ...) do {                                  \
> > >   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> > > -- 
> > > 2.7.4
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
From: Michael S. Tsirkin @ 2017-09-27 22:58 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <c42b7244-6d43-2ce5-46a3-ea70ec9d957f@redhat.com>

On Wed, Sep 27, 2017 at 08:38:24AM +0800, Jason Wang wrote:
> 
> 
> On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> > On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
> > > This patch introduces a helper which just increase the used idx. This
> > > will be used in pair with vhost_prefetch_desc_indices() by batching
> > > code.
> > > 
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  1 +
> > >   2 files changed, 34 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 8424166d..6532cda 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
> > >   }
> > >   EXPORT_SYMBOL_GPL(vhost_add_used);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
> > > +{
> > > +	u16 old, new;
> > > +
> > > +	old = vq->last_used_idx;
> > > +	new = (vq->last_used_idx += n);
> > > +	/* If the driver never bothers to signal in a very long while,
> > > +	 * used index might wrap around. If that happens, invalidate
> > > +	 * signalled_used index we stored. TODO: make sure driver
> > > +	 * signals at least once in 2^16 and remove this.
> > > +	 */
> > > +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> > > +		vq->signalled_used_valid = false;
> > > +
> > > +	/* Make sure buffer is written before we update index. */
> > > +	smp_wmb();
> > > +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
> > > +			   &vq->used->idx)) {
> > > +		vq_err(vq, "Failed to increment used idx");
> > > +		return -EFAULT;
> > > +	}
> > > +	if (unlikely(vq->log_used)) {
> > > +		/* Log used index update. */
> > > +		log_write(vq->log_base,
> > > +			  vq->log_addr + offsetof(struct vring_used, idx),
> > > +			  sizeof(vq->used->idx));
> > > +		if (vq->log_ctx)
> > > +			eventfd_signal(vq->log_ctx, 1);
> > > +	}
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
> > > +
> > >   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> > >   			    struct vring_used_elem *heads,
> > >   			    unsigned count)
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 16c2cb6..5dd6c05 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > >   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> > >   int vhost_vq_init_access(struct vhost_virtqueue *);
> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
> > >   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> > >   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> > >   		     unsigned count);
> > Please change the API to hide the fact that there's an index that needs
> > to be updated.
> 
> In fact, an interesting optimization on top is just call
> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
> reason I leave n in the API.
> 
> Thanks

Right but you could increment some internal counter in the vq
structure then update the used index using some api
with a generic name, e.g.  add_used_complete or something like this.

> > 
> > > -- 
> > > 2.7.4

^ permalink raw reply

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Jesus Sanchez-Palencia @ 2017-09-27 22:57 UTC (permalink / raw)
  To: Vinicius Costa Gomes, Cong Wang
  Cc: Linux Kernel Network Developers, intel-wired-lan,
	Jamal Hadi Salim, Jiri Pirko, andre.guedes, ivan.briano,
	boon.leong.ong, richardcochran, henrik
In-Reply-To: <87lgkzg7xv.fsf@intel.com>

Hi,


On 09/27/2017 02:14 PM, Vinicius Costa Gomes wrote:
> Hi,
> 
> Cong Wang <xiyou.wangcong@gmail.com> writes:
> 
>> On Tue, Sep 26, 2017 at 4:39 PM, Vinicius Costa Gomes
>> <vinicius.gomes@intel.com> wrote:
>>> +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
>>> +{
>>> +       struct cbs_sched_data *q = qdisc_priv(sch);
>>> +       struct net_device *dev = qdisc_dev(sch);
>>> +
>>> +       if (!opt)
>>> +               return -EINVAL;
>>> +
>>> +       /* FIXME: this means that we can only install this qdisc
>>> +        * "under" mqprio. Do we need a more generic way to retrieve
>>> +        * the queue, or do we pass the netdev_queue to the driver?
>>> +        */
>>> +       q->queue = TC_H_MIN(sch->parent) - 1 - netdev_get_num_tc(dev);
>>> +
>>> +       return cbs_change(sch, opt);
>>> +}
>>
>> Yeah it is ugly to assume its parent is mqprio, at least you should
>> error out if it is not the case.
> 
> Will add an error for this, for now.
> 
>>
>> I am not sure how we can solve this elegantly, perhaps you should
>> extend mqprio rather than add a new one?
> 
> Is the alternative hinted in the FIXME worse? Instead of passing the
> index of the hardware queue to the driver we pass the pointer to a
> netdev_queue to the driver and it "discovers" the HW queue from that.

What if we keep passing the index, but calculate it from the netdev_queue
pointer instead?

i.e.:  q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);

At least it wouldn't rely on the root qdisc being of any specific type.

Regards,
Jesus

^ permalink raw reply

* Re: [PATCH v4 net-next 00/12] gtp: Additional feature support - Part I
From: Tom Herbert @ 2017-09-27 23:09 UTC (permalink / raw)
  To: Harald Welte
  Cc: David S . Miller, Pablo Neira Ayuso, Andreas Schultz,
	Linux Kernel Network Developers, Rohit LastName
In-Reply-To: <20170927122431.h26ey3h2pzrgay7a@nataraja>

On Wed, Sep 27, 2017 at 5:24 AM, Harald Welte <laforge@gnumonks.org> wrote:
> Hi Tom,
>
> thanks for your updated series!
>
> I'll revie as soon as I a, but that may likely not be before next
> week.  As indicated before, I'm on a motorbike roadtrip on vacation,
> with very limited connectivity and even more limited time for any
> technical work.  Thanks for your understanding.
>
Harald,

No problem. Enjoy your vacation!

Tom

> --
> - Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
> ============================================================================
> "Privacy in residential applications is a desirable marketing option."
>                                                   (ETSI EN 300 175-7 Ch. A6)

^ permalink raw reply

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Guedes, Andre @ 2017-09-27 23:11 UTC (permalink / raw)
  To: Sanchez-Palencia, Jesus, Gomes, Vinicius,
	xiyou.wangcong@gmail.com
  Cc: jiri@resnulli.us, jhs@mojatatu.com, Ong, Boon Leong,
	richardcochran@gmail.com, netdev@vger.kernel.org,
	henrik@austad.us, Briano, Ivan, intel-wired-lan@lists.osuosl.org
In-Reply-To: <4b0e1610-c9df-191d-496e-4be04c785d2f@intel.com>

[-- Attachment #1: Type: text/plain, Size: 2074 bytes --]

On Wed, 2017-09-27 at 15:57 -0700, Jesus Sanchez-Palencia wrote:
> Hi,
> 
> 
> On 09/27/2017 02:14 PM, Vinicius Costa Gomes wrote:
> > Hi,
> > 
> > Cong Wang <xiyou.wangcong@gmail.com> writes:
> > 
> > > On Tue, Sep 26, 2017 at 4:39 PM, Vinicius Costa Gomes
> > > <vinicius.gomes@intel.com> wrote:
> > > > +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
> > > > +{
> > > > +       struct cbs_sched_data *q = qdisc_priv(sch);
> > > > +       struct net_device *dev = qdisc_dev(sch);
> > > > +
> > > > +       if (!opt)
> > > > +               return -EINVAL;
> > > > +
> > > > +       /* FIXME: this means that we can only install this qdisc
> > > > +        * "under" mqprio. Do we need a more generic way to retrieve
> > > > +        * the queue, or do we pass the netdev_queue to the driver?
> > > > +        */
> > > > +       q->queue = TC_H_MIN(sch->parent) - 1 - netdev_get_num_tc(dev);
> > > > +
> > > > +       return cbs_change(sch, opt);
> > > > +}
> > > 
> > > Yeah it is ugly to assume its parent is mqprio, at least you should
> > > error out if it is not the case.
> > 
> > Will add an error for this, for now.
> > 
> > > 
> > > I am not sure how we can solve this elegantly, perhaps you should
> > > extend mqprio rather than add a new one?
> > 
> > Is the alternative hinted in the FIXME worse? Instead of passing the
> > index of the hardware queue to the driver we pass the pointer to a
> > netdev_queue to the driver and it "discovers" the HW queue from that.

I don't see why we should move the queue index "discovery" into the driver. The
driver layer should be dead simple and getting the queue index from the upper
layer (qdisc) looks right to me.

> What if we keep passing the index, but calculate it from the netdev_queue
> pointer instead?
> 
> i.e.:  q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
> 
> At least it wouldn't rely on the root qdisc being of any specific type.

+1

- Andre

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 3262 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 0/3] support changing steering policies in tuntap
From: Willem de Bruijn @ 2017-09-27 23:25 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, Network Development, LKML
In-Reply-To: <20170927230042-mutt-send-email-mst@kernel.org>

>> In the future, both simple and sophisticated policy like RSS or other guest
>> driven steering policies could be done on top.
>
> IMHO there should be a more practical example before adding all this
> indirection. And it would be nice to understand why this queue selection
> needs to be tun specific.

I was thinking the same and this reminds me of the various strategies
implemented in packet fanout. tun_cpu_select_queue is analogous to
fanout_demux_cpu though it is tun-specific in that it requires tun->numqueues.

Fanout accrued various strategies until it gained an eBPF variant. Just
supporting BPF is probably sufficient here, too.

^ permalink raw reply

* Re: [PATCH V3] r8152: add Linksys USB3GIGV1 id
From: Doug Anderson @ 2017-09-27 23:47 UTC (permalink / raw)
  To: Grant Grundler
  Cc: Hayes Wang, Oliver Neukum, linux-usb, David S . Miller, LKML,
	netdev
In-Reply-To: <20170927172802.80654-1-grundler@chromium.org>

Hi,

On Wed, Sep 27, 2017 at 10:28 AM, Grant Grundler <grundler@chromium.org> wrote:
> This linksys dongle by default comes up in cdc_ether mode.
> This patch allows r8152 to claim the device:
>    Bus 002 Device 002: ID 13b1:0041 Linksys
>
> Signed-off-by: Grant Grundler <grundler@chromium.org>
> ---
>  drivers/net/usb/cdc_ether.c | 10 ++++++++++
>  drivers/net/usb/r8152.c     |  2 ++
>  2 files changed, 12 insertions(+)
>
> V3: for backwards compat, add #ifdef CONFIG_USB_RTL8152 around
>     the cdc_ether blacklist entry so the cdc_ether driver can
>     still claim the device if r8152 driver isn't configured.
>
> V2: add LINKSYS_VENDOR_ID to cdc_ether blacklist
>
> diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
> index 8ab281b478f2..446dcc0f1f70 100644
> --- a/drivers/net/usb/cdc_ether.c
> +++ b/drivers/net/usb/cdc_ether.c
> @@ -546,6 +546,7 @@ static const struct driver_info wwan_info = {
>  #define DELL_VENDOR_ID         0x413C
>  #define REALTEK_VENDOR_ID      0x0bda
>  #define SAMSUNG_VENDOR_ID      0x04e8
> +#define LINKSYS_VENDOR_ID      0x13b1
>  #define LENOVO_VENDOR_ID       0x17ef

Slight nit that "LI" sorts after "LE".  You got it right in the other case...


>  #define NVIDIA_VENDOR_ID       0x0955
>  #define HP_VENDOR_ID           0x03f0
> @@ -737,6 +738,15 @@ static const struct usb_device_id  products[] = {
>         .driver_info = 0,
>  },
>
> +#ifdef CONFIG_USB_RTL8152
> +/* Linksys USB3GIGV1 Ethernet Adapter */
> +{
> +       USB_DEVICE_AND_INTERFACE_INFO(LINKSYS_VENDOR_ID, 0x0041, USB_CLASS_COMM,
> +                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
> +       .driver_info = 0,
> +},
> +#endif

I believe you want to use IS_ENABLED(), don't you?

There's still a weird esoteric side case where kernel modules don't
all need to be included in the filesystem just because they were built
at the same time.  ...but IMHO that seems like enough of a nit that we
can probably ignore it unless someone has a better idea.


-Doug

^ permalink raw reply

* Re: [PATCH V3] r8152: add Linksys USB3GIGV1 id
From: Grant Grundler @ 2017-09-28  0:07 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Grant Grundler, Hayes Wang, Oliver Neukum, linux-usb,
	David S . Miller, LKML, netdev
In-Reply-To: <CAD=FV=U-zMEQ8=_96SMENmcBywG0hSrDvebXUxGGbAur_2T-4g@mail.gmail.com>

Hi Doug!

On Wed, Sep 27, 2017 at 4:47 PM, Doug Anderson <dianders@chromium.org> wrote:
> Hi,
>
> On Wed, Sep 27, 2017 at 10:28 AM, Grant Grundler <grundler@chromium.org> wrote:
>> This linksys dongle by default comes up in cdc_ether mode.
>> This patch allows r8152 to claim the device:
>>    Bus 002 Device 002: ID 13b1:0041 Linksys
>>
>> Signed-off-by: Grant Grundler <grundler@chromium.org>
>> ---
>>  drivers/net/usb/cdc_ether.c | 10 ++++++++++
>>  drivers/net/usb/r8152.c     |  2 ++
>>  2 files changed, 12 insertions(+)
>>
>> V3: for backwards compat, add #ifdef CONFIG_USB_RTL8152 around
>>     the cdc_ether blacklist entry so the cdc_ether driver can
>>     still claim the device if r8152 driver isn't configured.
>>
>> V2: add LINKSYS_VENDOR_ID to cdc_ether blacklist
>>
>> diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
>> index 8ab281b478f2..446dcc0f1f70 100644
>> --- a/drivers/net/usb/cdc_ether.c
>> +++ b/drivers/net/usb/cdc_ether.c
>> @@ -546,6 +546,7 @@ static const struct driver_info wwan_info = {
>>  #define DELL_VENDOR_ID         0x413C
>>  #define REALTEK_VENDOR_ID      0x0bda
>>  #define SAMSUNG_VENDOR_ID      0x04e8
>> +#define LINKSYS_VENDOR_ID      0x13b1
>>  #define LENOVO_VENDOR_ID       0x17ef
>
> Slight nit that "LI" sorts after "LE".  You got it right in the other case...

The list isn't sorted by any rational thing I can see.  I managed to
check my OCD reaction to sort the list numerically. :)

>>  #define NVIDIA_VENDOR_ID       0x0955
>>  #define HP_VENDOR_ID           0x03f0
>> @@ -737,6 +738,15 @@ static const struct usb_device_id  products[] = {
>>         .driver_info = 0,
>>  },
>>
>> +#ifdef CONFIG_USB_RTL8152
>> +/* Linksys USB3GIGV1 Ethernet Adapter */
>> +{
>> +       USB_DEVICE_AND_INTERFACE_INFO(LINKSYS_VENDOR_ID, 0x0041, USB_CLASS_COMM,
>> +                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
>> +       .driver_info = 0,
>> +},
>> +#endif
>
> I believe you want to use IS_ENABLED(), don't you?

Ah yes - I wasn't aware IS_ENABLED existed.  Will respin V4 with this
if there isn't any other feedback.


> There's still a weird esoteric side case where kernel modules don't
> all need to be included in the filesystem just because they were built
> at the same time.  ...but IMHO that seems like enough of a nit that we
> can probably ignore it unless someone has a better idea.

I think that would require a run time check. I'm perfectly willing to
ignore that case. :)

thanks!
grant

>
>
> -Doug

^ permalink raw reply

* [PATCH net-next] vhost_net: do not stall on zerocopy depletion
From: Willem de Bruijn @ 2017-09-28  0:25 UTC (permalink / raw)
  To: netdev; +Cc: davem, mst, jasowang, den, virtualization, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Vhost-net has a hard limit on the number of zerocopy skbs in flight.
When reached, transmission stalls. Stalls cause latency, as well as
head-of-line blocking of other flows that do not use zerocopy.

Instead of stalling, revert to copy-based transmission.

Tested by sending two udp flows from guest to host, one with payload
of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
large flow is redirected to a netem instance with 1MBps rate limit
and deep 1000 entry queue.

  modprobe ifb
  ip link set dev ifb0 up
  tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit

  tc qdisc add dev tap0 ingress
  tc filter add dev tap0 parent ffff: protocol ip \
      u32 match ip dport 8000 0xffff \
      action mirred egress redirect dev ifb0

Before the delay, both flows process around 80K pps. With the delay,
before this patch, both process around 400. After this patch, the
large flow is still rate limited, while the small reverts to its
original rate. See also discussion in the first link, below.

The limit in vhost_exceeds_maxpend must be carefully chosen. When
vq->num >> 1, the flows remain correlated. This value happens to
correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
fractions and ensure correctness also for much smaller values of
vq->num, by testing the min() of both explicitly. See also the
discussion in the second link below.

Link:http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com
Link:http://lkml.kernel.org/r/20170819064129.27272-1-den@klaipeden.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/vhost/net.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 58585ec8699e..50758602ae9d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -436,8 +436,8 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 
-	return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
-		== nvq->done_idx;
+	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
+	       min(VHOST_MAX_PEND, vq->num >> 2);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -480,12 +480,6 @@ static void handle_tx(struct vhost_net *net)
 		if (zcopy)
 			vhost_zerocopy_signal_used(net, vq);
 
-		/* If more outstanding DMAs, queue the work.
-		 * Handle upend_idx wrap around
-		 */
-		if (unlikely(vhost_exceeds_maxpend(net)))
-			break;
-
 		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
 						ARRAY_SIZE(vq->iov),
 						&out, &in);
@@ -509,6 +503,7 @@ static void handle_tx(struct vhost_net *net)
 		len = iov_length(vq->iov, out);
 		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
 		iov_iter_advance(&msg.msg_iter, hdr_size);
+
 		/* Sanity check */
 		if (!msg_data_left(&msg)) {
 			vq_err(vq, "Unexpected header len for TX: "
@@ -519,8 +514,7 @@ static void handle_tx(struct vhost_net *net)
 		len = msg_data_left(&msg);
 
 		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-				      nvq->done_idx
+				   && !vhost_exceeds_maxpend(net)
 				   && vhost_net_tx_select_zcopy(net);
 
 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
-- 
2.14.2.822.g60be5d43e6-goog

^ permalink raw reply related

* Re: [PATCH net-next] vhost_net: do not stall on zerocopy depletion
From: Willem de Bruijn @ 2017-09-28  0:33 UTC (permalink / raw)
  To: Network Development
  Cc: David Miller, Michael S. Tsirkin, Jason Wang, Koichiro Den,
	virtualization, Willem de Bruijn
In-Reply-To: <20170928002556.41240-1-willemdebruijn.kernel@gmail.com>

On Wed, Sep 27, 2017 at 8:25 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Vhost-net has a hard limit on the number of zerocopy skbs in flight.
> When reached, transmission stalls. Stalls cause latency, as well as
> head-of-line blocking of other flows that do not use zerocopy.
>
> Instead of stalling, revert to copy-based transmission.
>
> Tested by sending two udp flows from guest to host, one with payload
> of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
> large flow is redirected to a netem instance with 1MBps rate limit
> and deep 1000 entry queue.
>
>   modprobe ifb
>   ip link set dev ifb0 up
>   tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit
>
>   tc qdisc add dev tap0 ingress
>   tc filter add dev tap0 parent ffff: protocol ip \
>       u32 match ip dport 8000 0xffff \
>       action mirred egress redirect dev ifb0
>
> Before the delay, both flows process around 80K pps. With the delay,
> before this patch, both process around 400. After this patch, the
> large flow is still rate limited, while the small reverts to its
> original rate. See also discussion in the first link, below.
>
> The limit in vhost_exceeds_maxpend must be carefully chosen. When
> vq->num >> 1, the flows remain correlated. This value happens to
> correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
> fractions and ensure correctness also for much smaller values of
> vq->num, by testing the min() of both explicitly. See also the
> discussion in the second link below.
>
> Link:http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com

>From the same discussion thread: it would be good to expose stats
on the number of zerocopy skb sent and number completed without
copy.

To test this patch, I also added ethtool stats to tun and extended them
with two zerocopy counters. Then had tun override the uarg->callback
with its own and update the counters before calling the original callback.

The one useful datapoint I did not get out of that is why skbs would
revert to non-zerocopy: because of size, vhost_exceeds_maxpend
or vhost_net_tx_select_zcopy. The simplistic implementation with an
extra indirect function call and without percpu counters is also not
suitable for submission as is.

^ permalink raw reply

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
From: Willem de Bruijn @ 2017-09-28  0:47 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, Network Development, LKML,
	kvm
In-Reply-To: <1506067355-5771-3-git-send-email-jasowang@redhat.com>

On Fri, Sep 22, 2017 at 4:02 AM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces vhost_prefetch_desc_indices() which could batch
> descriptor indices fetching and used ring updating. This intends to
> reduce the cache misses of indices fetching and updating and reduce
> cache line bounce when virtqueue is almost full. copy_to_user() was
> used in order to benefit from modern cpus that support fast string
> copy. Batched virtqueue processing will be the first user.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  3 +++
>  2 files changed, 58 insertions(+)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f87ec75..8424166d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>  }
>  EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>
> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
> +                               struct vring_used_elem *heads,
> +                               u16 num, bool used_update)
> +{
> +       int ret, ret2;
> +       u16 last_avail_idx, last_used_idx, total, copied;
> +       __virtio16 avail_idx;
> +       struct vring_used_elem __user *used;
> +       int i;
> +
> +       if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
> +               vq_err(vq, "Failed to access avail idx at %p\n",
> +                      &vq->avail->idx);
> +               return -EFAULT;
> +       }
> +       last_avail_idx = vq->last_avail_idx & (vq->num - 1);
> +       vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> +       total = vq->avail_idx - vq->last_avail_idx;
> +       ret = total = min(total, num);
> +
> +       for (i = 0; i < ret; i++) {
> +               ret2 = vhost_get_avail(vq, heads[i].id,
> +                                     &vq->avail->ring[last_avail_idx]);
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to get descriptors\n");
> +                       return -EFAULT;
> +               }
> +               last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
> +       }

This is understandably very similar to the existing logic in vhost_get_vq_desc.
Can that be extracted to a helper to avoid code duplication?

Perhaps one helper to update vq->avail_idx and return num, and
another to call vhost_get_avail one or more times.

> +
> +       if (!used_update)
> +               return ret;
> +
> +       last_used_idx = vq->last_used_idx & (vq->num - 1);
> +       while (total) {
> +               copied = min((u16)(vq->num - last_used_idx), total);
> +               ret2 = vhost_copy_to_user(vq,
> +                                         &vq->used->ring[last_used_idx],
> +                                         &heads[ret - total],
> +                                         copied * sizeof(*used));
> +
> +               if (unlikely(ret2)) {
> +                       vq_err(vq, "Failed to update used ring!\n");
> +                       return -EFAULT;
> +               }
> +
> +               last_used_idx = 0;
> +               total -= copied;
> +       }

This second part seems unrelated and could be a separate function?

Also, no need for ret2 and double assignment "ret = total =" if not
modifying total
in the the second loop:

  for (i = 0; i < total; ) {
    ...
    i += copied;
  }

^ permalink raw reply

* Re: [PATCH net-next RFC 5/5] vhost_net: basic tx virtqueue batched processing
From: Willem de Bruijn @ 2017-09-28  0:55 UTC (permalink / raw)
  To: Jason Wang
  Cc: Network Development, virtualization, LKML, kvm,
	Michael S. Tsirkin
In-Reply-To: <1506067355-5771-6-git-send-email-jasowang@redhat.com>

> @@ -461,6 +460,7 @@ static void handle_tx(struct vhost_net *net)
>         struct socket *sock;
>         struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>         bool zcopy, zcopy_used;
> +       int i, batched = VHOST_NET_BATCH;
>
>         mutex_lock(&vq->mutex);
>         sock = vq->private_data;
> @@ -475,6 +475,12 @@ static void handle_tx(struct vhost_net *net)
>         hdr_size = nvq->vhost_hlen;
>         zcopy = nvq->ubufs;
>
> +       /* Disable zerocopy batched fetching for simplicity */

This special case can perhaps be avoided if we no longer block
on vhost_exceeds_maxpend, but revert to copying.

> +       if (zcopy) {
> +               heads = &used;

Can this special case of batchsize 1 not use vq->heads?

> +               batched = 1;
> +       }
> +
>         for (;;) {
>                 /* Release DMAs done buffers first */
>                 if (zcopy)
> @@ -486,95 +492,114 @@ static void handle_tx(struct vhost_net *net)
>                 if (unlikely(vhost_exceeds_maxpend(net)))
>                         break;

> +                       /* TODO: Check specific error and bomb out
> +                        * unless ENOBUFS?
> +                        */
> +                       err = sock->ops->sendmsg(sock, &msg, len);
> +                       if (unlikely(err < 0)) {
> +                               if (zcopy_used) {
> +                                       vhost_net_ubuf_put(ubufs);
> +                                       nvq->upend_idx =
> +                                  ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV;
> +                               }
> +                               vhost_discard_vq_desc(vq, 1);
> +                               goto out;
> +                       }
> +                       if (err != len)
> +                               pr_debug("Truncated TX packet: "
> +                                       " len %d != %zd\n", err, len);
> +                       if (!zcopy) {
> +                               vhost_add_used_idx(vq, 1);
> +                               vhost_signal(&net->dev, vq);
> +                       } else if (!zcopy_used) {
> +                               vhost_add_used_and_signal(&net->dev,
> +                                                         vq, head, 0);

While batching, perhaps can also move this producer index update
out of the loop and using vhost_add_used_and_signal_n.

> +                       } else
> +                               vhost_zerocopy_signal_used(net, vq);
> +                       vhost_net_tx_packet(net);
> +                       if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> +                               vhost_poll_queue(&vq->poll);
> +                               goto out;
>                         }
> -                       vhost_discard_vq_desc(vq, 1);
> -                       break;
> -               }
> -               if (err != len)
> -                       pr_debug("Truncated TX packet: "
> -                                " len %d != %zd\n", err, len);
> -               if (!zcopy_used)
> -                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
> -               else
> -                       vhost_zerocopy_signal_used(net, vq);
> -               vhost_net_tx_packet(net);
> -               if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> -                       vhost_poll_queue(&vq->poll);
> -                       break;

This patch touches many lines just for indentation. If having to touch
these lines anyway (dirtying git blame), it may be a good time to move
the processing of a single descriptor code into a separate helper function.
And while breaking up, perhaps another helper for setting up ubuf_info.
If you agree, preferably in a separate noop refactor patch that precedes
the functional changes.

^ permalink raw reply

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
From: Willem de Bruijn @ 2017-09-28  0:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, Network Development, LKML, kvm
In-Reply-To: <20170928015749-mutt-send-email-mst@kernel.org>

>> > > @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> > >   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>> > >   int vhost_vq_init_access(struct vhost_virtqueue *);
>> > > +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>> > >   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>> > >   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>> > >                        unsigned count);
>> > Please change the API to hide the fact that there's an index that needs
>> > to be updated.
>>
>> In fact, an interesting optimization on top is just call
>> vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's the
>> reason I leave n in the API.
>>
>> Thanks
>
> Right but you could increment some internal counter in the vq
> structure then update the used index using some api
> with a generic name, e.g.  add_used_complete or something like this.

That adds a layer of information hiding. If the same variable can be
kept close to the computation in a local variable and passed directly
to vhost_add_used_idx_n that is easier to follow.

^ permalink raw reply

* [PATCH next] bonding: speed/duplex update at NETDEV_UP event
From: Mahesh Bandewar @ 2017-09-28  1:03 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, Veaceslav Falico, David Miller
  Cc: Mahesh Bandewar, Netdev, Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

Some NIC drivers don't have correct speed/duplex settings at the
time they send NETDEV_UP notification and that messes up the
bonding state. Especially 802.3ad mode which is very sensitive
to these settings. In the current implementation we invoke
bond_update_speed_duplex() when we receive NETDEV_UP, however,
ignore the return value. If the values we get are invalid
(UNKNOWN), then slave gets removed from the aggregator with
speed and duplex set to UNKNOWN while link is still marked as UP.

This patch fixes this scenario. Also 802.3ad mode is sensitive to
these conditions while other modes are not, so making sure that it
doesn't change the behavior for other modes.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 drivers/net/bonding/bond_main.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b7313c1d9dcd..177be373966b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3076,7 +3076,16 @@ static int bond_slave_netdev_event(unsigned long event,
 		break;
 	case NETDEV_UP:
 	case NETDEV_CHANGE:
-		bond_update_speed_duplex(slave);
+		/* For 802.3ad mode only:
+		 * Getting invalid Speed/Duplex values here will put slave
+		 * in weird state. So mark it as link-down for the time
+		 * being and let link-monitoring (miimon) set it right when
+		 * correct speeds/duplex are available.
+		 */
+		if (bond_update_speed_duplex(slave) &&
+		    BOND_MODE(bond) == BOND_MODE_8023AD)
+			slave->link = BOND_LINK_DOWN;
+
 		if (BOND_MODE(bond) == BOND_MODE_8023AD)
 			bond_3ad_adapter_speed_duplex_changed(slave);
 		/* Fallthrough */
-- 
2.14.2.822.g60be5d43e6-goog

^ permalink raw reply related

* RE: [PATCH net] i40e: Fix limit imprecise of the number of MAC/VLAN that can be added for VFs
From: wangyunjian @ 2017-09-28  1:32 UTC (permalink / raw)
  To: Sergei Shtylyov, davem@davemloft.net, jeffrey.t.kirsher@intel.com
  Cc: netdev@vger.kernel.org, intel-wired-lan@lists.osuosl.org, caihe
In-Reply-To: <fd104583-6743-9f1b-cba8-988fa1b216db@cogentembedded.com>

Thanks, I will send the v2 later.

> -----Original Message-----
> From: Sergei Shtylyov [mailto:sergei.shtylyov@cogentembedded.com]
> Sent: Wednesday, September 27, 2017 7:34 PM
> To: wangyunjian <wangyunjian@huawei.com>; davem@davemloft.net;
> jeffrey.t.kirsher@intel.com
> Cc: netdev@vger.kernel.org; intel-wired-lan@lists.osuosl.org; caihe
> <caihe@huawei.com>
> Subject: Re: [PATCH net] i40e: Fix limit imprecise of the number of
> MAC/VLAN that can be added for VFs
> 
> Hello!
> 
> On 9/27/2017 9:58 AM, w00273186 wrote:
> 
> > From: Yunjian Wang <wangyunjian@huawei.com>
> >
> > Now it don't limit the number of MAC/VLAN strictly. When there is more
> 
>     Doesn't.
> 
> > elements in the virtchnl MAC/VLAN list, it can still add successfully.
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> 
> [...]
> 
> MBR, Sergei

^ permalink raw reply

* [PATCH net v2] i40e: Fix limit imprecise of the number of MAC/VLAN that can be added for VFs
From: w00273186 @ 2017-09-28  2:01 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, sergei.shtylyov
  Cc: netdev, intel-wired-lan, caihe, Yunjian Wang

From: Yunjian Wang <wangyunjian@huawei.com>

Now it doesn't limit the number of MAC/VLAN strictly. When there is more
elements in the virtchnl MAC/VLAN list, it can still add successfully.

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 27 +++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 4d1e670..285b96a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2065,11 +2065,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, u8 *macaddr)
 		dev_err(&pf->pdev->dev,
 			"VF attempting to override administratively set MAC address, reload the VF driver to resume normal operation\n");
 		ret = -EPERM;
-	} else if ((vf->num_mac >= I40E_VC_MAX_MAC_ADDR_PER_VF) &&
-		   !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
-		dev_err(&pf->pdev->dev,
-			"VF is not trusted, switch the VF to trusted to add more functionality\n");
-		ret = -EPERM;
 	}
 	return ret;
 }
@@ -2128,6 +2123,15 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 		} else {
 			vf->num_mac++;
 		}
+
+		if ((vf->num_mac >= I40E_VC_MAX_MAC_ADDR_PER_VF) &&
+		    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
+			dev_err(&pf->pdev->dev,
+				"VF is not trusted, switch the VF to trusted to add more functionality\n");
+			ret = -EPERM;
+			spin_unlock_bh(&vsi->mac_filter_hash_lock);
+			goto error_param;
+		}
 	}
 	spin_unlock_bh(&vsi->mac_filter_hash_lock);
 
@@ -2221,12 +2225,6 @@ static int i40e_vc_add_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 	i40e_status aq_ret = 0;
 	int i;
 
-	if ((vf->num_vlan >= I40E_VC_MAX_VLAN_PER_VF) &&
-	    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
-		dev_err(&pf->pdev->dev,
-			"VF is not trusted, switch the VF to trusted to add more VLAN addresses\n");
-		goto error_param;
-	}
 	if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) ||
 	    !i40e_vc_isvalid_vsi_id(vf, vsi_id)) {
 		aq_ret = I40E_ERR_PARAM;
@@ -2269,6 +2267,13 @@ static int i40e_vc_add_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 			dev_err(&pf->pdev->dev,
 				"Unable to add VLAN filter %d for VF %d, error %d\n",
 				vfl->vlan_id[i], vf->vf_id, ret);
+		if ((vf->num_vlan >= I40E_VC_MAX_VLAN_PER_VF) &&
+		    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
+			dev_err(&pf->pdev->dev,
+				"VF is not trusted, switch the VF to trusted to add more VLAN addresses\n");
+			aq_ret = -EPERM;
+			goto error_param;
+		}
 	}
 
 error_param:
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next] net: ipv4: remove fib_weight
From: David Ahern @ 2017-09-28  2:08 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

fib_weight in fib_info is set but not used. Remove it and the
helpers for setting it.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h     | 3 ---
 net/ipv4/fib_semantics.c | 9 ---------
 2 files changed, 12 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1a7f7e424320..f80524396c06 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -122,9 +122,6 @@ struct fib_info {
 #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
 	int			fib_nhs;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			fib_weight;
-#endif
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].nh_dev
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..be0874620ecc 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi)
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
 }
-
-static inline void fib_add_weight(struct fib_info *fi,
-				  const struct fib_nh *nh)
-{
-	fi->fib_weight += nh->nh_weight;
-}
-
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
 #define fib_rebalance(fi) do { } while (0)
-#define fib_add_weight(fi, nh) do { } while (0)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
@@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 
 	change_nexthops(fi) {
 		fib_info_update_nh_saddr(net, nexthop_nh);
-		fib_add_weight(fi, nexthop_nh);
 	} endfor_nexthops(fi)
 
 	fib_rebalance(fi);
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next] net: ipv4: remove fib_info arg to fib_check_nh
From: David Ahern @ 2017-09-28  3:41 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

fib_check_nh does not use the fib_info arg; remove t.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_semantics.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..79989124607e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -774,8 +774,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
  *					|
  *					|-> {local prefix} (terminal node)
  */
-static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
-			struct fib_nh *nh, struct netlink_ext_ack *extack)
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+			struct netlink_ext_ack *extack)
 {
 	int err = 0;
 	struct net *net;
@@ -1258,7 +1258,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		int linkdown = 0;
 
 		change_nexthops(fi) {
-			err = fib_check_nh(cfg, fi, nexthop_nh, extack);
+			err = fib_check_nh(cfg, nexthop_nh, extack);
 			if (err != 0)
 				goto failure;
 			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH iproute2] tc: fix ipv6 filter selector attribute for some prefix lengths
From: Yulia Kartseva @ 2017-09-28  3:47 UTC (permalink / raw)
  To: Stephen Hemminger, netdev
In-Reply-To: <20170927092634.0870468d@shemminger-XPS-13-9360>

[-- Attachment #1: Type: text/plain, Size: 1999 bytes --]

Hello Stephen,
Sending as an attachment.
Thank you!

On Wed, Sep 27, 2017 at 1:26 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Mon, 25 Sep 2017 11:12:38 -0700
> Yulia Kartseva <yulia.kartseva@gmail.com> wrote:
>
>> Wrong TCA_U32_SEL attribute packing if prefixLen AND 0x1f equals 0x1f.
>> These are  /31, /63, /95 and /127 prefix lengths.
>>
>> Example:
>> # tc filter add dev eth0 protocol ipv6 parent b: prio 2307 u32 match
>> ip6 dst face:b00f::/31
>> # tc filter show dev eth0
>> filter parent b: protocol ipv6 pref 2307 u32
>> filter parent b: protocol ipv6 pref 2307 u32 fh 800: ht divisor 1
>> filter parent b: protocol ipv6 pref 2307 u32 fh 800::800 order 2048
>> key ht 800 bkt 0
>>   match faceb00f/ffffffff at 24
>>
>>
>> The correct match would be "faceb00e/fffffffe": don't count the last
>> bit of the 4th byte as the network prefix. With fix:
>>
>> # tc filter show dev eth0
>> filter parent b: protocol ipv6 pref 2307 u32
>> filter parent b: protocol ipv6 pref 2307 u32 fh 800: ht divisor 1
>> filter parent b: protocol ipv6 pref 2307 u32 fh 800::800 order 2048
>> key ht 800 bkt 0
>>   match faceb00e/fffffffe at 24
>>
>>  tc/f_u32.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/tc/f_u32.c b/tc/f_u32.c
>> index 5815be9..14b9588 100644
>> --- a/tc/f_u32.c
>> +++ b/tc/f_u32.c
>> @@ -385,8 +385,7 @@ static int parse_ip6_addr(int *argc_p, char ***argv_p,
>>
>>   plen = addr.bitlen;
>>   for (i = 0; i < plen; i += 32) {
>> - /* if (((i + 31) & ~0x1F) <= plen) { */
>> - if (i + 31 <= plen) {
>> + if (i + 31 < plen) {
>>   res = pack_key(sel, addr.data[i / 32],
>>         0xFFFFFFFF, off + 4 * (i / 32), offmask);
>>   if (res < 0)
>
> This patch looks correct, but will not apply cleanly because
> the mail system that you submitted it with is removing whitespace.
> If possible use a different client, or send as an attachment.
>



-- 
C уважением, Юлия

[-- Attachment #2: f_u32.c.patch --]
[-- Type: application/octet-stream, Size: 417 bytes --]

diff --git a/tc/f_u32.c b/tc/f_u32.c
index 479b3f1..a8d07b3 100644
--- a/tc/f_u32.c
+++ b/tc/f_u32.c
@@ -382,8 +382,7 @@ static int parse_ip6_addr(int *argc_p, char ***argv_p,
 
 	plen = addr.bitlen;
 	for (i=0; i<plen; i+=32) {
-//		if (((i+31)&~0x1F)<=plen) {
-		if (i + 31 <= plen) {
+		if (i + 31 < plen) {
 			res = pack_key(sel, addr.data[i/32],
 				       0xFFFFFFFF, off+4*(i/32), offmask);
 			if (res < 0)

^ permalink raw reply related

* [net 02/11] net/mlx5: Fix FPGA capability location
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Inbar Karmy, Saeed Mahameed
In-Reply-To: <20170928044132.30940-1-saeedm@mellanox.com>

From: Inbar Karmy <inbark@mellanox.com>

Currently, FPGA capability is located in (mdev)->caps.hca_cur,
change the location to be (mdev)->caps.fpga,
since hca_cur is reserved for HCA device capabilities.

Fixes: e29341fb3a5b ("net/mlx5: FPGA, Add basic support for Innova")
Signed-off-by: Inbar Karmy <inbark@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c  | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 3 +--
 include/linux/mlx5/device.h                         | 5 ++---
 include/linux/mlx5/driver.h                         | 1 +
 5 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
index e37453d838db..c0fd2212e890 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
@@ -71,11 +71,11 @@ int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
 	return 0;
 }
 
-int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps)
+int mlx5_fpga_caps(struct mlx5_core_dev *dev)
 {
 	u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0};
 
-	return mlx5_core_access_reg(dev, in, sizeof(in), caps,
+	return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga,
 				    MLX5_ST_SZ_BYTES(fpga_cap),
 				    MLX5_REG_FPGA_CAP, 0, 0);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
index 94bdfd47c3f0..d05233c9b4f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
@@ -65,7 +65,7 @@ struct mlx5_fpga_qp_counters {
 	u64 rx_total_drop;
 };
 
-int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps);
+int mlx5_fpga_caps(struct mlx5_core_dev *dev);
 int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query);
 int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op);
 int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 9034e9960a76..dc8970346521 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -139,8 +139,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 	if (err)
 		goto out;
 
-	err = mlx5_fpga_caps(fdev->mdev,
-			     fdev->mdev->caps.hca_cur[MLX5_CAP_FPGA]);
+	err = mlx5_fpga_caps(fdev->mdev);
 	if (err)
 		goto out;
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index eaf4ad209c8f..e32dbc4934db 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -980,7 +980,6 @@ enum mlx5_cap_type {
 	MLX5_CAP_RESERVED,
 	MLX5_CAP_VECTOR_CALC,
 	MLX5_CAP_QOS,
-	MLX5_CAP_FPGA,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1110,10 +1109,10 @@ enum mlx5_mcam_feature_groups {
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld)
 
 #define MLX5_CAP_FPGA(mdev, cap) \
-	MLX5_GET(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap)
+	MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap)
 
 #define MLX5_CAP64_FPGA(mdev, cap) \
-	MLX5_GET64(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap)
+	MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap)
 
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 02ff700e4f30..401c8972cc3a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -774,6 +774,7 @@ struct mlx5_core_dev {
 		u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
+		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
 	} caps;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
-- 
2.13.0

^ permalink raw reply related

* [pull request][net 00/11] Mellanox, mlx5 fixes 2017-09-28
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Saeed Mahameed

Hi Dave,

This series provides misc fixes for mlx5 dirver.

Please pull and let me know if there's any problem.

for -stable:
  net/mlx5e: IPoIB, Fix access to invalid memory address (Kernels >= 4.12)

Thanks,
Saeed.

-- 

The following changes since commit c2cc187e53011c1c4931055984657da9085c763b:

  sctp: Fix a big endian bug in sctp_diag_dump() (2017-09-26 21:16:29 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-fixes-2017-09-28

for you to fetch changes up to 353f59f4d41e9c5798a15c5c52958f25b579a3d5:

  net/mlx5: Fix wrong indentation in enable SRIOV code (2017-09-28 07:23:10 +0300)

----------------------------------------------------------------
mlx5-fixes-2017-09-28

Misc. fixes for mlx5 drivers.

----------------------------------------------------------------
Gal Pressman (3):
      net/mlx5e: Print netdev features correctly in error message
      net/mlx5e: Don't add/remove 802.1ad rules when changing 802.1Q VLAN filter
      net/mlx5e: Fix calculated checksum offloads counters

Inbar Karmy (1):
      net/mlx5: Fix FPGA capability location

Matan Barak (1):
      net/mlx5: Fix static checker warning on steering tracepoints code

Or Gerlitz (2):
      net/mlx5e: Disallow TC offloading of unsupported match/action combinations
      net/mlx5: Fix wrong indentation in enable SRIOV code

Paul Blakey (1):
      net/mlx5e: Fix erroneous freeing of encap header buffer

Raed Salem (1):
      net/mlx5: Check device capability for maximum flow counters

Roi Dayan (1):
      net/mlx5e: IPoIB, Fix access to invalid memory address

Vlad Buslov (1):
      net/mlx5e: Check encap entry state when offloading tunneled flows

 .../mellanox/mlx5/core/diag/fs_tracepoint.h        |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 13 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  6 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 91 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/core.c    |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |  8 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  | 11 +++
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c    |  2 +-
 include/linux/mlx5/device.h                        |  5 +-
 include/linux/mlx5/driver.h                        |  1 +
 include/linux/mlx5/mlx5_ifc.h                      |  3 +-
 17 files changed, 133 insertions(+), 31 deletions(-)

^ permalink raw reply

* [net 01/11] net/mlx5e: IPoIB, Fix access to invalid memory address
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Roi Dayan, Saeed Mahameed
In-Reply-To: <20170928044132.30940-1-saeedm@mellanox.com>

From: Roi Dayan <roid@mellanox.com>

When cleaning rdma netdevice we need to save the mdev pointer
because priv is released when we release netdev.

This bug was found using the kernel address sanitizer (KASAN).
use-after-free in mlx5_rdma_netdev_free+0xe3/0x100 [mlx5_core]

Fixes: 48935bbb7ae8 ("net/mlx5e: IPoIB, Add netdevice profile skeleton")
Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 85298051a3e4..145e392ab849 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -572,12 +572,13 @@ void mlx5_rdma_netdev_free(struct net_device *netdev)
 {
 	struct mlx5e_priv          *priv    = mlx5i_epriv(netdev);
 	const struct mlx5e_profile *profile = priv->profile;
+	struct mlx5_core_dev       *mdev    = priv->mdev;
 
 	mlx5e_detach_netdev(priv);
 	profile->cleanup(priv);
 	destroy_workqueue(priv->wq);
 	free_netdev(netdev);
 
-	mlx5e_destroy_mdev_resources(priv->mdev);
+	mlx5e_destroy_mdev_resources(mdev);
 }
 EXPORT_SYMBOL(mlx5_rdma_netdev_free);
-- 
2.13.0

^ permalink raw reply related

* [net 04/11] net/mlx5e: Fix erroneous freeing of encap header buffer
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Paul Blakey, Saeed Mahameed
In-Reply-To: <20170928044132.30940-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

In case the neighbour for the tunnel destination isn't valid,
we send a neighbour update request but we free the encap
header buffer. This is wrong, because we still need it for
allocating a HW encap entry once the neighbour is available.

Fix that by skipping freeing it if we wait for neighbour.

Fixes: 232c001398ae ('net/mlx5e: Add support to neighbour update flow')
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index da503e6411da..4e2fc016bdd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1564,7 +1564,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 		break;
 	default:
 		err = -EOPNOTSUPP;
-		goto out;
+		goto free_encap;
 	}
 	fl4.flowi4_tos = tun_key->tos;
 	fl4.daddr = tun_key->u.ipv4.dst;
@@ -1573,7 +1573,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 	err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev,
 				      &fl4, &n, &ttl);
 	if (err)
-		goto out;
+		goto free_encap;
 
 	/* used by mlx5e_detach_encap to lookup a neigh hash table
 	 * entry in the neigh hash table when a user deletes a rule
@@ -1590,7 +1590,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 	 */
 	err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
 	if (err)
-		goto out;
+		goto free_encap;
 
 	read_lock_bh(&n->lock);
 	nud_state = n->nud_state;
@@ -1630,8 +1630,9 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 
 destroy_neigh_entry:
 	mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
-out:
+free_encap:
 	kfree(encap_header);
+out:
 	if (n)
 		neigh_release(n);
 	return err;
@@ -1668,7 +1669,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 		break;
 	default:
 		err = -EOPNOTSUPP;
-		goto out;
+		goto free_encap;
 	}
 
 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label);
@@ -1678,7 +1679,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 	err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev,
 				      &fl6, &n, &ttl);
 	if (err)
-		goto out;
+		goto free_encap;
 
 	/* used by mlx5e_detach_encap to lookup a neigh hash table
 	 * entry in the neigh hash table when a user deletes a rule
@@ -1695,7 +1696,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 	 */
 	err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
 	if (err)
-		goto out;
+		goto free_encap;
 
 	read_lock_bh(&n->lock);
 	nud_state = n->nud_state;
@@ -1736,8 +1737,9 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 
 destroy_neigh_entry:
 	mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
-out:
+free_encap:
 	kfree(encap_header);
+out:
 	if (n)
 		neigh_release(n);
 	return err;
-- 
2.13.0

^ permalink raw reply related

* [net 05/11] net/mlx5e: Disallow TC offloading of unsupported match/action combinations
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Saeed Mahameed
In-Reply-To: <20170928044132.30940-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

When offloading header re-write, the HW may need to adjust checksums along
the packet. For IP traffic, and a case where we are asked to modify fields in
the IP header, current HW supports that only for TCP and UDP. Enforce it, in
this case fail the offloading attempt for non TCP/UDP packets.

Fixes: d7e75a325cb2 ('net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions')
Fixes: 2f4fe4cab073 ('net/mlx5e: Add offloading of NIC TC pedit (header re-write) actions')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 70 +++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4e2fc016bdd6..d3786005fba7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1317,6 +1317,69 @@ static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 upda
 	return true;
 }
 
+static bool modify_header_match_supported(struct mlx5_flow_spec *spec,
+					  struct tcf_exts *exts)
+{
+	const struct tc_action *a;
+	bool modify_ip_header;
+	LIST_HEAD(actions);
+	u8 htype, ip_proto;
+	void *headers_v;
+	u16 ethertype;
+	int nkeys, i;
+
+	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
+	ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype);
+
+	/* for non-IP we only re-write MACs, so we're okay */
+	if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6)
+		goto out_ok;
+
+	modify_ip_header = false;
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(a, &actions, list) {
+		if (!is_tcf_pedit(a))
+			continue;
+
+		nkeys = tcf_pedit_nkeys(a);
+		for (i = 0; i < nkeys; i++) {
+			htype = tcf_pedit_htype(a, i);
+			if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 ||
+			    htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6) {
+				modify_ip_header = true;
+				break;
+			}
+		}
+	}
+
+	ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
+	if (modify_ip_header && ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) {
+		pr_info("can't offload re-write of ip proto %d\n", ip_proto);
+		return false;
+	}
+
+out_ok:
+	return true;
+}
+
+static bool actions_match_supported(struct mlx5e_priv *priv,
+				    struct tcf_exts *exts,
+				    struct mlx5e_tc_flow_parse_attr *parse_attr,
+				    struct mlx5e_tc_flow *flow)
+{
+	u32 actions;
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		actions = flow->esw_attr->action;
+	else
+		actions = flow->nic_attr->action;
+
+	if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		return modify_header_match_supported(&parse_attr->spec, exts);
+
+	return true;
+}
+
 static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				struct mlx5e_tc_flow_parse_attr *parse_attr,
 				struct mlx5e_tc_flow *flow)
@@ -1378,6 +1441,9 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 		return -EINVAL;
 	}
 
+	if (!actions_match_supported(priv, exts, parse_attr, flow))
+		return -EOPNOTSUPP;
+
 	return 0;
 }
 
@@ -1936,6 +2002,10 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 		return -EINVAL;
 	}
+
+	if (!actions_match_supported(priv, exts, parse_attr, flow))
+		return -EOPNOTSUPP;
+
 	return err;
 }
 
-- 
2.13.0

^ permalink raw reply related

* [net 03/11] net/mlx5: Check device capability for maximum flow counters
From: Saeed Mahameed @ 2017-09-28  4:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Raed Salem, Saeed Mahameed
In-Reply-To: <20170928044132.30940-1-saeedm@mellanox.com>

From: Raed Salem <raeds@mellanox.com>

Added check for the maximal number of flow counters attached
to rule (FTE).

Fixes: bd5251dbf156b ('net/mlx5_core: Introduce flow steering destination of type counter')
Signed-off-by: Raed Salem <raeds@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  |  8 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 11 +++++++++++
 include/linux/mlx5/mlx5_ifc.h                     |  3 ++-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index e0d0efd903bc..36ecc2b2e187 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -293,6 +293,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	}
 
 	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev,
+					log_max_flow_counter,
+					ft->type));
 		int list_size = 0;
 
 		list_for_each_entry(dst, &fte->node.children, node.list) {
@@ -305,12 +308,17 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
 			list_size++;
 		}
+		if (list_size > max_list_size) {
+			err = -EINVAL;
+			goto err_out;
+		}
 
 		MLX5_SET(flow_context, in_flow_context, flow_counter_list_size,
 			 list_size);
 	}
 
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+err_out:
 	kvfree(in);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 5509a752f98e..48dd78975062 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -52,6 +52,7 @@ enum fs_flow_table_type {
 	FS_FT_FDB             = 0X4,
 	FS_FT_SNIFFER_RX	= 0X5,
 	FS_FT_SNIFFER_TX	= 0X6,
+	FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX,
 };
 
 enum fs_flow_table_op_mod {
@@ -260,4 +261,14 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 #define fs_for_each_dst(pos, fte)			\
 	fs_list_for_each_entry(pos, &(fte)->node.children)
 
+#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) (		\
+	(type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) :		\
+	(type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) :		\
+	(BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\
+	)
+
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a528b35a022e..69772347f866 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -327,7 +327,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         reserved_at_80[0x18];
 	u8         log_max_destination[0x8];
 
-	u8         reserved_at_a0[0x18];
+	u8         log_max_flow_counter[0x8];
+	u8         reserved_at_a8[0x10];
 	u8         log_max_flow[0x8];
 
 	u8         reserved_at_c0[0x40];
-- 
2.13.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox