Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: Linux 4.14 - regression: broken tun/tap / bridge network with virtio - bisected
From: Andreas Hartmann @ 2017-12-21 17:05 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Michal Kubecek, Jason Wang, David Miller, Network Development
In-Reply-To: <CAF=yD-KFD3++koAi3mzYbH75D526QXZXBaKT5jfbNqniGMTL8w@mail.gmail.com>

On 12/20/2017 at 11:44 PM Willem de Bruijn wrote:
> On Wed, Dec 20, 2017 at 10:56 AM, Andreas Hartmann
> <andihartmann@01019freenet.de> wrote:
>> On 12/18/2017 at 06:11 PM Andreas Hartmann wrote:
>>> On 12/17/2017 at 11:33 PM Willem de Bruijn wrote:
>> [...]
>>>> I have been able to reproduce the hang by sending a UFO packet
>>>> between two guests running v4.13 on a host running v4.15-rc1.
>>>>
>>>> The vhost_net_ubuf_ref refcount indeed hits overflow (-1) from
>>>> vhost_zerocopy_callback being called for each segment of a
>>>> segmented UFO skb. This refcount is decremented then on each
>>>> segment, but incremented only once for the entire UFO skb.
>>>>
>>>> Before v4.14, these packets would be converted in skb_segment to
>>>> regular copy packets with skb_orphan_frags and the callback function
>>>> called once at this point. v4.14 added support for reference counted
>>>> zerocopy skb that can pass through skb_orphan_frags unmodified and
>>>> have their zerocopy state safely cloned with skb_zerocopy_clone.
>>>>
>>>> The call to skb_zerocopy_clone must come after skb_orphan_frags
>>>> to limit cloning of this state to those skbs that can do so safely.
>>>>
>>>> Please try a host with the following patch. This fixes it for me. I intend to
>>>> send it to net.
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index a592ca025fc4..d2d985418819 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -3654,8 +3654,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                  skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
>>>>                                                SKBTX_SHARED_FRAG;
>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>> -                       goto err;
>>>>
>>>>                  while (pos < offset + len) {
>>>>                          if (i >= nfrags) {
>>>> @@ -3681,6 +3679,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                          if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
>>>>                                  goto err;
>>>> +                       if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>> +                               goto err;
>>>>
>>>>                          *nskb_frag = *frag;
>>>>                          __skb_frag_ref(nskb_frag);
>>>>
>>>>
>>>> This is relatively inefficient, as it calls skb_zerocopy_clone for each frag
>>>> in the frags[] array. I will follow-up with a patch to net-next that only
>>>> checks once per skb:
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index 466581cf4cdc..a293a33604ec 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -3662,7 +3662,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                  skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
>>>>                                                SKBTX_SHARED_FRAG;
>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>> +               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>> +                   skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>>                          goto err;
>>>>
>>>>                  while (pos < offset + len) {
>>>> @@ -3676,6 +3677,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                                  BUG_ON(!nfrags);
>>>>
>>>> +                               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>> +                                   skb_zerocopy_clone(nskb, frag_skb,
>>>> +                                                      GFP_ATOMIC))
>>>> +                                       goto err;
>>>> +
>>>>                                  list_skb = list_skb->next;
>>>>                          }
>>>>
>>>> @@ -3687,9 +3693,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>                                  goto err;
>>>>                          }
>>>>
>>>> -                       if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
>>>> -                               goto err;
>>>> -
>>>
>>> I'm currently testing this one.
>>>
>>
>> Test is in progress. I'm testing w/ 4.14.7, which already contains "net:
>> accept UFO datagrams from tuntap and packet".
>>
>> At first, I tested an unpatched 4.14.7 - the problem (no more killable
>> qemu-process) did occur promptly on shutdown of the machine. This was
>> expected.
>>
>> Next, I applied the above patch (the second one). Until now, I didn't
>> face any problem any more on shutdown of VMs. Looks promising.
> 
> Thanks for testing.
> 
> I sent the first, simpler, one to net together with another fix.
> 
>    http://patchwork.ozlabs.org/patch/851715/
> 

If I'm using the second patch above (the more efficient one and not 
"[net,1/2] skbuff: orphan frags before zerocopy clone"), which I'm 
already testing here: Is it still necessary to apply this patch 
"[net,2/2] skbuff: skb_copy_ubufs must release uarg even without user 
frags"?


Thanks,
Andreas

^ permalink raw reply

* Re: Linux 4.14 - regression: broken tun/tap / bridge network with virtio - bisected
From: Willem de Bruijn @ 2017-12-21 17:11 UTC (permalink / raw)
  To: Andreas Hartmann
  Cc: Michal Kubecek, Jason Wang, David Miller, Network Development
In-Reply-To: <b10b506b-1fb1-2e85-1905-83d3ef091be1@01019freenet.de>

On Thu, Dec 21, 2017 at 12:05 PM, Andreas Hartmann
<andihartmann@01019freenet.de> wrote:
> On 12/20/2017 at 11:44 PM Willem de Bruijn wrote:
>>
>> On Wed, Dec 20, 2017 at 10:56 AM, Andreas Hartmann
>> <andihartmann@01019freenet.de> wrote:
>>>
>>> On 12/18/2017 at 06:11 PM Andreas Hartmann wrote:
>>>>
>>>> On 12/17/2017 at 11:33 PM Willem de Bruijn wrote:
>>>
>>> [...]
>>>>>
>>>>> I have been able to reproduce the hang by sending a UFO packet
>>>>> between two guests running v4.13 on a host running v4.15-rc1.
>>>>>
>>>>> The vhost_net_ubuf_ref refcount indeed hits overflow (-1) from
>>>>> vhost_zerocopy_callback being called for each segment of a
>>>>> segmented UFO skb. This refcount is decremented then on each
>>>>> segment, but incremented only once for the entire UFO skb.
>>>>>
>>>>> Before v4.14, these packets would be converted in skb_segment to
>>>>> regular copy packets with skb_orphan_frags and the callback function
>>>>> called once at this point. v4.14 added support for reference counted
>>>>> zerocopy skb that can pass through skb_orphan_frags unmodified and
>>>>> have their zerocopy state safely cloned with skb_zerocopy_clone.
>>>>>
>>>>> The call to skb_zerocopy_clone must come after skb_orphan_frags
>>>>> to limit cloning of this state to those skbs that can do so safely.
>>>>>
>>>>> Please try a host with the following patch. This fixes it for me. I
>>>>> intend to
>>>>> send it to net.
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index a592ca025fc4..d2d985418819 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -3654,8 +3654,6 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                  skb_shinfo(nskb)->tx_flags |=
>>>>> skb_shinfo(head_skb)->tx_flags &
>>>>>                                                SKBTX_SHARED_FRAG;
>>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>>> -                       goto err;
>>>>>
>>>>>                  while (pos < offset + len) {
>>>>>                          if (i >= nfrags) {
>>>>> @@ -3681,6 +3679,8 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                          if (unlikely(skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC)))
>>>>>                                  goto err;
>>>>> +                       if (skb_zerocopy_clone(nskb, frag_skb,
>>>>> GFP_ATOMIC))
>>>>> +                               goto err;
>>>>>
>>>>>                          *nskb_frag = *frag;
>>>>>                          __skb_frag_ref(nskb_frag);
>>>>>
>>>>>
>>>>> This is relatively inefficient, as it calls skb_zerocopy_clone for each
>>>>> frag
>>>>> in the frags[] array. I will follow-up with a patch to net-next that
>>>>> only
>>>>> checks once per skb:
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index 466581cf4cdc..a293a33604ec 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -3662,7 +3662,8 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                  skb_shinfo(nskb)->tx_flags |=
>>>>> skb_shinfo(head_skb)->tx_flags &
>>>>>                                                SKBTX_SHARED_FRAG;
>>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>>> +               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>>> +                   skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>>>                          goto err;
>>>>>
>>>>>                  while (pos < offset + len) {
>>>>> @@ -3676,6 +3677,11 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                                  BUG_ON(!nfrags);
>>>>>
>>>>> +                               if (skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC) ||
>>>>> +                                   skb_zerocopy_clone(nskb, frag_skb,
>>>>> +                                                      GFP_ATOMIC))
>>>>> +                                       goto err;
>>>>> +
>>>>>                                  list_skb = list_skb->next;
>>>>>                          }
>>>>>
>>>>> @@ -3687,9 +3693,6 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>                                  goto err;
>>>>>                          }
>>>>>
>>>>> -                       if (unlikely(skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC)))
>>>>> -                               goto err;
>>>>> -
>>>>
>>>>
>>>> I'm currently testing this one.
>>>>
>>>
>>> Test is in progress. I'm testing w/ 4.14.7, which already contains "net:
>>> accept UFO datagrams from tuntap and packet".
>>>
>>> At first, I tested an unpatched 4.14.7 - the problem (no more killable
>>> qemu-process) did occur promptly on shutdown of the machine. This was
>>> expected.
>>>
>>> Next, I applied the above patch (the second one). Until now, I didn't
>>> face any problem any more on shutdown of VMs. Looks promising.
>>
>>
>> Thanks for testing.
>>
>> I sent the first, simpler, one to net together with another fix.
>>
>>    http://patchwork.ozlabs.org/patch/851715/
>>
>
> If I'm using the second patch above (the more efficient one and not
> "[net,1/2] skbuff: orphan frags before zerocopy clone"), which I'm already
> testing here: Is it still necessary to apply this patch "[net,2/2] skbuff:
> skb_copy_ubufs must release uarg even without user frags"?

Not for this issue. It is an unrelated bug and not triggered by virtio_net
as configured normally.

^ permalink raw reply

* [PATCH net-next v2] xen-netback: make copy batch size configurable
From: Joao Martins @ 2017-12-21 17:24 UTC (permalink / raw)
  To: netdev; +Cc: Joao Martins, Wei Liu, Paul Durrant, xen-devel

Commit eb1723a29b9a ("xen-netback: refactor guest rx") refactored Rx
handling and as a result decreased max grant copy ops from 4352 to 64.
Before this commit it would drain the rx_queue (while there are
enough slots in the ring to put packets) then copy to all pages and write
responses on the ring. With the refactor we do almost the same albeit
the last two steps are done every COPY_BATCH_SIZE (64) copies.

For big packets, the value of 64 means copying 3 packets best case scenario
(17 copies) and worst-case only 1 packet (34 copies, i.e. if all frags
plus head cross the 4k grant boundary) which could be the case when
packets go from local backend process.

Instead of making it static to 64 grant copies, let's allow the user to
select its value (while keeping the current as default) by introducing
the `copy_batch_size` module parameter. This allows users to select
higher batch sizes (i.e. for better throughput with big packets) as it
was prior to the above mentioned commit.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
Changes since v1:
 * move rx_copy.{idx,op} reallocation to separate helper
 Addressed Paul's comments:
 * rename xenvif_copy_state#size field to batch_size
 * argument `size` should be unsigned int
 * vfree is safe with NULL
 * realloc rx_copy.{idx,op} after copy op flush
---
 drivers/net/xen-netback/common.h    |  7 +++++--
 drivers/net/xen-netback/interface.c | 16 +++++++++++++++-
 drivers/net/xen-netback/netback.c   |  5 +++++
 drivers/net/xen-netback/rx.c        | 35 ++++++++++++++++++++++++++++++++++-
 4 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index a46a1e94505d..8e4eaf3a507d 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -129,8 +129,9 @@ struct xenvif_stats {
 #define COPY_BATCH_SIZE 64
 
 struct xenvif_copy_state {
-	struct gnttab_copy op[COPY_BATCH_SIZE];
-	RING_IDX idx[COPY_BATCH_SIZE];
+	struct gnttab_copy *op;
+	RING_IDX *idx;
+	unsigned int batch_size;
 	unsigned int num;
 	struct sk_buff_head *completed;
 };
@@ -358,6 +359,7 @@ irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
 
 void xenvif_rx_action(struct xenvif_queue *queue);
 void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
+int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size);
 
 void xenvif_carrier_on(struct xenvif *vif);
 
@@ -381,6 +383,7 @@ extern unsigned int rx_drain_timeout_msecs;
 extern unsigned int rx_stall_timeout_msecs;
 extern unsigned int xenvif_max_queues;
 extern unsigned int xenvif_hash_cache_size;
+extern unsigned int xenvif_copy_batch_size;
 
 #ifdef CONFIG_DEBUG_FS
 extern struct dentry *xen_netback_dbg_root;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 78ebe494fef0..e12eb64ab0a9 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -518,6 +518,12 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 {
 	int err, i;
 
+	err = xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
+	if (err) {
+		netdev_err(queue->vif->dev, "Could not alloc rx_copy\n");
+		goto err;
+	}
+
 	queue->credit_bytes = queue->remaining_credit = ~0UL;
 	queue->credit_usec  = 0UL;
 	timer_setup(&queue->credit_timeout, xenvif_tx_credit_callback, 0);
@@ -544,7 +550,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 				 queue->mmap_pages);
 	if (err) {
 		netdev_err(queue->vif->dev, "Could not reserve mmap_pages\n");
-		return -ENOMEM;
+		goto err;
 	}
 
 	for (i = 0; i < MAX_PENDING_REQS; i++) {
@@ -556,6 +562,11 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 	}
 
 	return 0;
+
+err:
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+	return -ENOMEM;
 }
 
 void xenvif_carrier_on(struct xenvif *vif)
@@ -788,6 +799,9 @@ void xenvif_disconnect_ctrl(struct xenvif *vif)
  */
 void xenvif_deinit_queue(struct xenvif_queue *queue)
 {
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+	queue->rx_copy.batch_size = 0;
 	gnttab_free_pages(MAX_PENDING_REQS, queue->mmap_pages);
 }
 
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a27daa23c9dc..3a5e1d7ac2f4 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -96,6 +96,11 @@ unsigned int xenvif_hash_cache_size = XENVIF_HASH_CACHE_SIZE_DEFAULT;
 module_param_named(hash_cache_size, xenvif_hash_cache_size, uint, 0644);
 MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash cache");
 
+/* This is the maximum batch of grant copies on Rx */
+unsigned int xenvif_copy_batch_size = COPY_BATCH_SIZE;
+module_param_named(copy_batch_size, xenvif_copy_batch_size, uint, 0644);
+MODULE_PARM_DESC(copy_batch_size, "Maximum batch of grant copies on Rx");
+
 static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
 			       u8 status);
 
diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
index b1cf7c6f407a..07eebd75e668 100644
--- a/drivers/net/xen-netback/rx.c
+++ b/drivers/net/xen-netback/rx.c
@@ -130,6 +130,36 @@ static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue)
 	}
 }
 
+int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size)
+{
+	void *op = NULL, *idx = NULL;
+
+	if (!size || queue->rx_copy.num)
+		return -EINVAL;
+
+	op = vzalloc(size * sizeof(struct gnttab_copy));
+	if (!op)
+		goto err;
+
+	idx = vzalloc(size * sizeof(RING_IDX));
+	if (!idx)
+		goto err;
+
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+
+	queue->rx_copy.op = op;
+	queue->rx_copy.idx = idx;
+	queue->rx_copy.batch_size = size;
+	netdev_dbg(queue->vif->dev, "Reallocated rx_copy for batch size %u\n",
+		   size);
+	return 0;
+
+err:
+	vfree(op);
+	return -ENOMEM;
+}
+
 static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
 {
 	unsigned int i;
@@ -162,6 +192,9 @@ static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
 		notify_remote_via_irq(queue->rx_irq);
 
 	__skb_queue_purge(queue->rx_copy.completed);
+
+	if (unlikely(xenvif_copy_batch_size != queue->rx_copy.batch_size))
+		xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
 }
 
 static void xenvif_rx_copy_add(struct xenvif_queue *queue,
@@ -172,7 +205,7 @@ static void xenvif_rx_copy_add(struct xenvif_queue *queue,
 	struct page *page;
 	struct xen_page_foreign *foreign;
 
-	if (queue->rx_copy.num == COPY_BATCH_SIZE)
+	if (queue->rx_copy.num == queue->rx_copy.batch_size)
 		xenvif_rx_copy_flush(queue);
 
 	op = &queue->rx_copy.op[queue->rx_copy.num];
-- 
2.11.0

^ permalink raw reply related

* Re: Distress Call Please don't ignore
From: Sandra Younes @ 2017-12-21 16:43 UTC (permalink / raw)


Good Day,

Forgive my indignation if this message comes to you as a surprise and may offend your personality for contacting you without your prior consent and writing through this channel.

I came across your name and contact on the course of my personal searching when i was searching for a foreign reliable partner. I was assured of your capability and reliability after going true your profile.

I'm (Miss. Sandra) from Benghazi libya, My father of blessed memory by name late General Abdel Fattah Younes who was shot death by Islamist-linked militia within the anti-Gaddafi forces on 28th July, 2011 and after two days later my mother with my two brothers was killed one early morning by the rebels as result of civil war that is going on in my country Libya, then after the burial of my parents, my uncles conspired and sold my father's properties and left nothing for me. On a faithful morning, I opened my father's briefcase and discover a document which he has deposited ($6.250M USD) in a bank in a Turkish Bank which has a small branch in Canada with my name as the legitimate/next of kin. Meanwhile i have located the bank,and have also discussed the possiblity of transfering the fund. M
 y father left a clause to the bank that i must introduce a trusted foreign partner who would be my trustee to help me invest this fund; hence the need for your assistance,i request that you be my t
rustee and assist me in e

You will also be responsible for the investment and management of the fund for me and also you will help me get a good school where i will further my education.
I agreed to give you 40% of the $6.250M once the transfer is done. this is my true life story, I will be glad to receive your respond soonest for more details to enable us start and champion the transfer less than 14 banking days as i was informed by the bank manager.

Thanks for giving me your attention,

Yours sincerely,
Miss. Sandra Younes

^ permalink raw reply

* RE: [PATCH net-next v2] xen-netback: make copy batch size configurable
From: Paul Durrant @ 2017-12-21 17:29 UTC (permalink / raw)
  To: 'Joao Martins', netdev@vger.kernel.org
  Cc: Wei Liu, xen-devel@lists.xenproject.org
In-Reply-To: <20171221172428.32676-1-joao.m.martins@oracle.com>

> -----Original Message-----
> From: Joao Martins [mailto:joao.m.martins@oracle.com]
> Sent: 21 December 2017 17:24
> To: netdev@vger.kernel.org
> Cc: Joao Martins <joao.m.martins@oracle.com>; Wei Liu
> <wei.liu2@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: [PATCH net-next v2] xen-netback: make copy batch size
> configurable
> 
> Commit eb1723a29b9a ("xen-netback: refactor guest rx") refactored Rx
> handling and as a result decreased max grant copy ops from 4352 to 64.
> Before this commit it would drain the rx_queue (while there are
> enough slots in the ring to put packets) then copy to all pages and write
> responses on the ring. With the refactor we do almost the same albeit
> the last two steps are done every COPY_BATCH_SIZE (64) copies.
> 
> For big packets, the value of 64 means copying 3 packets best case scenario
> (17 copies) and worst-case only 1 packet (34 copies, i.e. if all frags
> plus head cross the 4k grant boundary) which could be the case when
> packets go from local backend process.
> 
> Instead of making it static to 64 grant copies, lets allow the user to
> select its value (while keeping the current as default) by introducing
> the `copy_batch_size` module parameter. This allows users to select
> the higher batches (i.e. for better throughput with big packets) as it
> was prior to the above mentioned commit.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> Changes since v1:
>  * move rx_copy.{idx,op} reallocation to separate helper
>  Addressed Paul's comments:
>  * rename xenvif_copy_state#size field to batch_size
>  * argument `size` should be unsigned int
>  * vfree is safe with NULL
>  * realloc rx_copy.{idx,op} after copy op flush
> ---
>  drivers/net/xen-netback/common.h    |  7 +++++--
>  drivers/net/xen-netback/interface.c | 16 +++++++++++++++-
>  drivers/net/xen-netback/netback.c   |  5 +++++
>  drivers/net/xen-netback/rx.c        | 35
> ++++++++++++++++++++++++++++++++++-
>  4 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-
> netback/common.h
> index a46a1e94505d..8e4eaf3a507d 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -129,8 +129,9 @@ struct xenvif_stats {
>  #define COPY_BATCH_SIZE 64
> 
>  struct xenvif_copy_state {
> -	struct gnttab_copy op[COPY_BATCH_SIZE];
> -	RING_IDX idx[COPY_BATCH_SIZE];
> +	struct gnttab_copy *op;
> +	RING_IDX *idx;
> +	unsigned int batch_size;
>  	unsigned int num;
>  	struct sk_buff_head *completed;
>  };
> @@ -358,6 +359,7 @@ irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
> 
>  void xenvif_rx_action(struct xenvif_queue *queue);
>  void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff
> *skb);
> +int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size);
> 
>  void xenvif_carrier_on(struct xenvif *vif);
> 
> @@ -381,6 +383,7 @@ extern unsigned int rx_drain_timeout_msecs;
>  extern unsigned int rx_stall_timeout_msecs;
>  extern unsigned int xenvif_max_queues;
>  extern unsigned int xenvif_hash_cache_size;
> +extern unsigned int xenvif_copy_batch_size;
> 
>  #ifdef CONFIG_DEBUG_FS
>  extern struct dentry *xen_netback_dbg_root;
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-
> netback/interface.c
> index 78ebe494fef0..e12eb64ab0a9 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -518,6 +518,12 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  {
>  	int err, i;
> 
> +	err = xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
> +	if (err) {
> +		netdev_err(queue->vif->dev, "Could not alloc rx_copy\n");
> +		goto err;
> +	}
> +
>  	queue->credit_bytes = queue->remaining_credit = ~0UL;
>  	queue->credit_usec  = 0UL;
>  	timer_setup(&queue->credit_timeout, xenvif_tx_credit_callback, 0);
> @@ -544,7 +550,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  				 queue->mmap_pages);
>  	if (err) {
>  		netdev_err(queue->vif->dev, "Could not reserve
> mmap_pages\n");
> -		return -ENOMEM;
> +		goto err;
>  	}
> 
>  	for (i = 0; i < MAX_PENDING_REQS; i++) {
> @@ -556,6 +562,11 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  	}
> 
>  	return 0;
> +
> +err:
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +	return -ENOMEM;
>  }
> 
>  void xenvif_carrier_on(struct xenvif *vif)
> @@ -788,6 +799,9 @@ void xenvif_disconnect_ctrl(struct xenvif *vif)
>   */
>  void xenvif_deinit_queue(struct xenvif_queue *queue)
>  {
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +	queue->rx_copy.batch_size = 0;
>  	gnttab_free_pages(MAX_PENDING_REQS, queue->mmap_pages);
>  }
> 
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-
> netback/netback.c
> index a27daa23c9dc..3a5e1d7ac2f4 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -96,6 +96,11 @@ unsigned int xenvif_hash_cache_size =
> XENVIF_HASH_CACHE_SIZE_DEFAULT;
>  module_param_named(hash_cache_size, xenvif_hash_cache_size, uint,
> 0644);
>  MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash
> cache");
> 
> +/* This is the maximum batch of grant copies on Rx */
> +unsigned int xenvif_copy_batch_size = COPY_BATCH_SIZE;
> +module_param_named(copy_batch_size, xenvif_copy_batch_size, uint,
> 0644);
> +MODULE_PARM_DESC(copy_batch_size, "Maximum batch of grant copies
> on Rx");
> +
>  static void xenvif_idx_release(struct xenvif_queue *queue, u16
> pending_idx,
>  			       u8 status);
> 
> diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
> index b1cf7c6f407a..07eebd75e668 100644
> --- a/drivers/net/xen-netback/rx.c
> +++ b/drivers/net/xen-netback/rx.c
> @@ -130,6 +130,36 @@ static void xenvif_rx_queue_drop_expired(struct
> xenvif_queue *queue)
>  	}
>  }
> 
> +int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size)
> +{
> +	void *op = NULL, *idx = NULL;
> +
> +	if (!size || queue->rx_copy.num)
> +		return -EINVAL;
> +
> +	op = vzalloc(size * sizeof(struct gnttab_copy));
> +	if (!op)
> +		goto err;
> +
> +	idx = vzalloc(size * sizeof(RING_IDX));
> +	if (!idx)
> +		goto err;
> +
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +
> +	queue->rx_copy.op = op;
> +	queue->rx_copy.idx = idx;
> +	queue->rx_copy.batch_size = size;
> +	netdev_dbg(queue->vif->dev, "Reallocated rx_copy for batch size
> %u\n",
> +		   size);
> +	return 0;
> +
> +err:
> +	vfree(op);
> +	return -ENOMEM;
> +}
> +
>  static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
>  {
>  	unsigned int i;
> @@ -162,6 +192,9 @@ static void xenvif_rx_copy_flush(struct xenvif_queue
> *queue)
>  		notify_remote_via_irq(queue->rx_irq);
> 
>  	__skb_queue_purge(queue->rx_copy.completed);
> +
> +	if (unlikely(xenvif_copy_batch_size != queue->rx_copy.batch_size))
> +		xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
>  }
> 
>  static void xenvif_rx_copy_add(struct xenvif_queue *queue,
> @@ -172,7 +205,7 @@ static void xenvif_rx_copy_add(struct xenvif_queue
> *queue,
>  	struct page *page;
>  	struct xen_page_foreign *foreign;
> 
> -	if (queue->rx_copy.num == COPY_BATCH_SIZE)
> +	if (queue->rx_copy.num == queue->rx_copy.batch_size)
>  		xenvif_rx_copy_flush(queue);
> 
>  	op = &queue->rx_copy.op[queue->rx_copy.num];
> --
> 2.11.0

^ permalink raw reply

* Re: [RFC PATCH net-next] tools/bpftool: use version from the kernel source tree
From: Jakub Kicinski @ 2017-12-21 17:34 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: netdev, linux-kernel, kernel-team, scientist, Alexei Starovoitov,
	Daniel Borkmann, Arnaldo Carvalho de Melo
In-Reply-To: <20171221120736.GA7054@castle.DHCP.thefacebook.com>

On Thu, 21 Dec 2017 12:07:42 +0000, Roman Gushchin wrote:
> On Wed, Dec 20, 2017 at 01:52:18PM -0800, Jakub Kicinski wrote:
> > On Wed, 20 Dec 2017 20:53:41 +0000, Roman Gushchin wrote:  
> > > On Wed, Dec 20, 2017 at 12:29:21PM -0800, Jakub Kicinski wrote:  
> > > Hm, why it's better? It's not only about the kernel version,
> > > IMO it's generally better to use includes from the source tree,
> > > rather than system-wide installed kernel headers.  
> > 
> > Right I agree the kernel headers are preferred.  I'm not entirely sure
> > why we don't use them, if it was OK to assume usr/ is there we wouldn't
> > need the tools/include/uapi/ contraption.  Maybe Arnaldo could explain?
> >   
> > > I've got about out-of-source builds, but do we support it in general?
> > > How can I build bpftool outside of the kernel tree?
> > > I've tried a bit, but failed.  
> > 
> > This is what I do:
> > 
> > make -C tools/bpf/bpftool/ W=1 O=/tmp/builds/bpftool  
> 
> This works perfectly with my patch:
> 
> $ make -C ~/linux/tools/bpf/ W=1 O=/home/guro/build/ --trace
> <...>
> echo '  CC       '/home/guro/build/main.o;gcc -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -D__EXPORTED_HEADERS__ -I/home/guro/linux/tools/include/uapi -I/home/guro/linux/tools/include -I/home/guro/linux/tools/lib/bpf -I/home/guro/linux/kernel/bpf/ -I/home/guro/linux/usr/include -DNEW_DISSASSEMBLER_SIGNATURE   -c -MMD -o /home/guro/build/main.o main.c
> <...>
> echo '  LINK     '/home/guro/build/bpftool;gcc -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -D__EXPORTED_HEADERS__ -I/home/guro/linux/tools/include/uapi -I/home/guro/linux/tools/include -I/home/guro/linux/tools/lib/bpf -I/home/guro/linux/kernel/bpf/ -I/home/guro/linux/usr/include -DNEW_DISSASSEMBLER_SIGNATURE -o /home/guro/build/bpftool /home/guro/build/common.o /home/guro/build/cgroup.o /home/guro/build/main.o /home/guro/build/json_writer.o /home/guro/build/prog.o /home/guro/build/map.o /home/guro/build/jit_disasm.o /home/guro/build/disasm.o /home/guro/build/libbpf.a -lelf -lbfd -lopcodes /home/guro/build/libbpf.a
>   LINK     /home/guro/build/bpftool
> make[1]: Leaving directory '/home/guro/linux/tools/bpf/bpftool'
> make: Leaving directory '/home/guro/linux/tools/bpf'
> 
> $ ./build/bpftool version
> ./build/bpftool v4.15.0

Argh, sorry for the confusion: you need to build the kernel out-of-source
as well.  In my case I build the kernel and bpftool out of source, and
then usr/ in the source tree doesn't actually contain the auto-generated
headers:

$ ls ~/devel/linux/usr/
gen_init_cpio.c  initramfs_data.S  Kconfig  Makefile

Only the build directory does:

$ ls /tmp/builds/usr/
built-in.o  gen_init_cpio  include  initramfs_data.cpio  initramfs_data.o  modules.builtin  modules.order

Let me reiterate, the user space headers we need should all be already
included in -I$(srctree)/tools/include/uapi, and make kernelversion is
nice because it also adds the -rc tags.

^ permalink raw reply

* [PATCH v2 net-next] net: dsa: lan9303: lan9303_csr_reg_wait cleanups
From: Egil Hjelmeland @ 2017-12-21 17:34 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland

Non-functional cleanups in lan9303_csr_reg_wait():
 - Change type of param 'mask' from int to u32.
 - Remove param 'value' (will probably never be used).
 - Reduce retries from 0x1000 to 25, consistent with lan9303_read_wait.
 - Remove comments.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>

Changes v1 -> v2:
 - Removed comments
---
 drivers/net/dsa/lan9303-core.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index f412aad58253..944901f03f8b 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -249,7 +249,6 @@ static int lan9303_read(struct regmap *regmap, unsigned int offset, u32 *reg)
 	return -EIO;
 }
 
-/* Wait a while until mask & reg == value. Otherwise return timeout. */
 static int lan9303_read_wait(struct lan9303 *chip, int offset, u32 mask)
 {
 	int i;
@@ -541,20 +540,19 @@ lan9303_alr_cache_find_mac(struct lan9303 *chip, const u8 *mac_addr)
 	return NULL;
 }
 
-/* Wait a while until mask & reg == value. Otherwise return timeout. */
-static int lan9303_csr_reg_wait(struct lan9303 *chip, int regno,
-				int mask, char value)
+static int lan9303_csr_reg_wait(struct lan9303 *chip, int regno, u32 mask)
 {
 	int i;
 
-	for (i = 0; i < 0x1000; i++) {
+	for (i = 0; i < 25; i++) {
 		u32 reg;
 
 		lan9303_read_switch_reg(chip, regno, &reg);
-		if ((reg & mask) == value)
+		if (!(reg & mask))
 			return 0;
 		usleep_range(1000, 2000);
 	}
+
 	return -ETIMEDOUT;
 }
 
@@ -564,8 +562,7 @@ static int lan9303_alr_make_entry_raw(struct lan9303 *chip, u32 dat0, u32 dat1)
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_WR_DAT_1, dat1);
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD,
 				 LAN9303_ALR_CMD_MAKE_ENTRY);
-	lan9303_csr_reg_wait(chip, LAN9303_SWE_ALR_CMD_STS, ALR_STS_MAKE_PEND,
-			     0);
+	lan9303_csr_reg_wait(chip, LAN9303_SWE_ALR_CMD_STS, ALR_STS_MAKE_PEND);
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD, 0);
 
 	return 0;
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH 1/3] net: Fix possible race in peernet2id_alloc()
From: Eric W. Biederman @ 2017-12-21 17:39 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: netdev, davem, eric.dumazet
In-Reply-To: <151386201910.3724.7199367937841370542.stgit@localhost.localdomain>

Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> peernet2id_alloc() is racy without rtnl_lock() as atomic_read(&peer->count)
> under net->nsid_lock does not guarantee, peer is alive:
>
> rcu_read_lock()
> peernet2id_alloc()                            ..
>   spin_lock_bh(&net->nsid_lock)               ..
>   atomic_read(&peer->count) == 1              ..
>   ..                                          put_net()
>   ..                                            cleanup_net()
>   ..                                              for_each_net(tmp)
>   ..                                                spin_lock_bh(&tmp->nsid_lock)
>   ..                                                __peernet2id(tmp, net) == -1
>   ..                                                    ..
>   ..                                                    ..
>     __peernet2id_alloc(alloc == true)                   ..
>   ..                                                    ..
> rcu_read_unlock()                                       ..
> ..                                                synchronize_rcu()
> ..                                                kmem_cache_free(net)
>
> After the above situation, net::netns_id contains id pointing to freed memory,
> and any other dereferencing by the id will operate with this freed memory.
>
> Currently, peernet2id_alloc() is used under rtnl_lock() everywhere except
> ovs_vport_cmd_fill_info(), and this race can't occur. But peernet2id_alloc()
> is generic interface, and better we fix it before someone really starts
> use it in wrong context.

So it comes down to this piece of code from ovs and just let me say ick.
	if (!net_eq(net, dev_net(vport->dev))) {
		int id = peernet2id_alloc(net, dev_net(vport->dev));

		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
			goto nla_put_failure;
	}

Without the rtnl lock dev_net can change between the test and the
call of peernet2id_alloc.

At first glance it looks like the bug is that we are running a control
path of the networking stack without the rtnl lock. So it may be that
ASSERT_RTNL() is the better fix.

Given that it would be nice to reduce the scope of the rtnl lock this
might not be a bad direction.  Let me see.

Is rtnl_notify safe without the rtnl lock?


>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  net/core/net_namespace.c |   23 +++++++++++++++++++----
>  1 file changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 60a71be75aea..6a4eab438221 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -221,17 +221,32 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
>   */
>  int peernet2id_alloc(struct net *net, struct net *peer)
>  {
> -	bool alloc;
> +	bool alloc = false, alive = false;
>  	int id;

        ^^^ Perhaps we want "ASSERT_RTNL();" here?
>  
> -	if (atomic_read(&net->count) == 0)
> -		return NETNSA_NSID_NOT_ASSIGNED;

Moving this hunk is of no benefit.  The code must be called with a valid
reference to net.   Which means net->count is a fancy way of testing to
see if the code is in cleanup_net.  In all other cases net->count should
be non-zero and it should remain that way because our caller must
keep a reference.

>  	spin_lock_bh(&net->nsid_lock);
> -	alloc = atomic_read(&peer->count) == 0 ? false : true;
> +	/* Spinlock guarantees we never hash a peer to net->netns_ids
> +	 * after idr_destroy(&net->netns_ids) occurs in cleanup_net().
> +	 */
> +	if (atomic_read(&net->count) == 0) {
> +		id = NETNSA_NSID_NOT_ASSIGNED;
> +		goto unlock;
> +	}
> +	/*
> +	 * When peer is obtained from RCU lists, we may race with
> +	 * its cleanup. Check whether it's alive, and this guarantees
> +	 * we never hash a peer back to net->netns_ids, after it has
> +	 * just been idr_remove()'d from there in cleanup_net().
> +	 */
> +	if (maybe_get_net(peer))
> +		alive = alloc = true;

Yes this does seem reasonable.  The more obvious looking code which
would return NETNSA_NSID_NOT_ASSIGNED if the peer has a count of 0, is
silly as it would make it appear that a peer is momentarily outside
of a network namespace when the peer is in fact moving from one network
namespace to another.
        
>  	id = __peernet2id_alloc(net, peer, &alloc);
> +unlock:
>  	spin_unlock_bh(&net->nsid_lock);
>  	if (alloc && id >= 0)
>  		rtnl_net_notifyid(net, RTM_NEWNSID, id);
                ^^^^^^
                Is this safe without the rtnl lock?
> +	if (alive)
> +		put_net(peer);
>  	return id;
>  }
>  EXPORT_SYMBOL_GPL(peernet2id_alloc);

Eric

^ permalink raw reply

* Re: [PATCHv4 net-next 00/14] net: sched: sch: introduce extack support
From: David Miller @ 2017-12-21 17:42 UTC (permalink / raw)
  To: aring; +Cc: jhs, xiyou.wangcong, jiri, netdev, kernel, dsahern
In-Reply-To: <20171220173524.25874-1-aring@mojatatu.com>

From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:10 -0500

> This patch series basically adds support for extack in common qdisc handling.
> Additionally, it adds an extack pointer to the common qdisc callback handling;
> this allows each qdisc implementation to set the extack message for each
> failure over netlink.

Series applied.

^ permalink raw reply

* Re: [PATCH net v3] openvswitch: Fix pop_vlan action for double tagged frames
From: David Miller @ 2017-12-21 18:05 UTC (permalink / raw)
  To: e; +Cc: netdev, ovs-dev, jbenc
In-Reply-To: <20171220200922.29415-1-e@erig.me>

From: Eric Garver <e@erig.me>
Date: Wed, 20 Dec 2017 15:09:22 -0500

> skb_vlan_pop() expects skb->protocol to be a valid TPID for double
> tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
> shift the true ethertype into position for us.
> 
> Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
> Signed-off-by: Eric Garver <e@erig.me>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net v3] openvswitch: Fix pop_vlan action for double tagged frames
From: David Miller @ 2017-12-21 18:05 UTC (permalink / raw)
  To: e; +Cc: netdev, ovs-dev, jbenc
In-Reply-To: <20171220200922.29415-1-e@erig.me>

From: Eric Garver <e@erig.me>
Date: Wed, 20 Dec 2017 15:09:22 -0500

> skb_vlan_pop() expects skb->protocol to be a valid TPID for double
> tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
> shift the true ethertype into position for us.
> 
> Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
> Signed-off-by: Eric Garver <e@erig.me>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net V3] net: reevalulate autoflowlabel setting after sysctl setting
From: David Miller @ 2017-12-21 18:07 UTC (permalink / raw)
  To: shli; +Cc: netdev, Kernel-team, shli, kafai, eric.dumazet, tom
In-Reply-To: <321216a522a3b46e77125a5b9df41c2b64821cf3.1513799711.git.shli@fb.com>

From: Shaohua Li <shli@kernel.org>
Date: Wed, 20 Dec 2017 12:10:21 -0800

> From: Shaohua Li <shli@fb.com>
> 
> sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2.
> If sockopt doesn't set autoflowlabel, outcome packets from the hosts are
> supposed to not include flowlabel. This is true for normal packet, but
> not for reset packet.
> 
> The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if
> we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't
> changed, so the sock will keep the old behavior in terms of auto
> flowlabel. Reset packet is suffering from this problem, because reset
> packet is sent from a special control socket, which is created at boot
> time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control
> socket will always have its ipv6_pinfo.autoflowlabel set, even after
> user set sysctl.ipv6.auto_flowlabels to 1, so reset packet will always
> have flowlabel. Normal sock created before sysctl setting suffers from
> the same issue. We can't even turn off autoflowlabel unless we kill all
> socks in the hosts.
> 
> To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the
> autoflowlabel setting from user, otherwise we always call
> ip6_default_np_autolabel() which has the new settings of sysctl.
> 
> Note, this changes behavior a little bit. Before commit 42240901f7c4
> (ipv6: Implement different admin modes for automatic flow labels), the
> autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes,
> existing connection will change autoflowlabel behavior. After that
> commit, autoflowlabel behavior is sticky in the whole life of the sock.
> With this patch, the behavior isn't sticky again.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Tom Herbert <tom@quantonium.net>
> Signed-off-by: Shaohua Li <shli@fb.com>

This looks a lot better, applied, thanks.

^ permalink raw reply

* Re: [PATCH v3 next-queue 00/10] ixgbe: Add ipsec offload
From: Shannon Nelson @ 2017-12-21 17:55 UTC (permalink / raw)
  To: Yanjun Zhu, intel-wired-lan, jeffrey.t.kirsher
  Cc: steffen.klassert, sowmini.varadhan, netdev
In-Reply-To: <e253e271-eda9-8707-af57-3a5cf33cb097@oracle.com>

On 12/20/2017 11:09 PM, Yanjun Zhu wrote:
> On 2017/12/21 14:39, Yanjun Zhu wrote:
>> On 2017/12/20 7:59, Shannon Nelson wrote:
>>> This is an implementation of the ipsec hardware offload feature for
>>> the ixgbe driver and Intel's 10Gbe series NICs: x540, x550, 82599.
>> Hi, Nelson
>>
>> I notice that the ipsec feature is based on x540, x550, 82599. But 
>> this ixgbe driver
>> will also work with 82598.
>>
>> Does this ipsec feature also work with 82598?
> Sorry. I mean, after these ipsec patches are applied, whether ipsec 
> offload enabled or not,
> can this ixgbe driver still work well with 82598?

Hmm... I don't have one to test on, but I suspect the 82598 might not be 
happy with this.  I'll send a followup patch to catch this case.

Thanks!
sln


> 
> Zhu Yanjun
>>
>> Thanks a lot.
>> Zhu Yanjun
>>> These patches apply to net-next v4.14 as well as Jeff Kirsher's 
>>> next-queue
>>> v4.15-rc1-206-ge47375b.
>>>
>>> The ixgbe NICs support ipsec offload for 1024 Rx and 1024 Tx Security
>>> Associations (SAs), using up to 128 inbound IP addresses, and using the
>>> rfc4106(gcm(aes)) encryption.  This code does not yet support IPv6,
>>> checksum offload, or TSO in conjunction with the ipsec offload - those
>>> will be added in the future.
>>>
>>> This code shows improvements in both packet throughput and CPU 
>>> utilization.
>>> For example, here are some quicky numbers that show the magnitude of the
>>> performance gain on a single run of "iperf -c <dest>" with the ipsec
>>> offload on both ends of a point-to-point connection:
>>>
>>>     9.4 Gbps - normal case
>>>     7.6 Gbps - ipsec with offload
>>>     343 Mbps - ipsec no offload
>>>
>>> To set up a similar test case, you first need to be sure you have a 
>>> recent
>>> version of iproute2 that supports the ipsec offload tag, probably 
>>> something
>>> from ip 4.12 or newer would be best.  I have a shell script that builds
>>> up the appropriate commands for me, but here are the resulting commands
>>> for all tcp traffic between 14.0.0.52 and 14.0.0.70:
>>>
>>> For the left side (14.0.0.52):
>>>    ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp tmpl \
>>>       proto esp src 14.0.0.52 dst 14.0.0.70 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x p add dir in src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp tmpl \
>>>       proto esp dst 14.0.0.52 src 14.0.0.70 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp offload dev 
>>> eth4 dir out
>>>    ip x s add proto esp dst 14.0.0.52 src 14.0.0.70 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp offload dev 
>>> eth4 dir in
>>>   For the right side (14.0.0.70):
>>>    ip x p add dir out src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp tmpl \
>>>       proto esp src 14.0.0.70 dst 14.0.0.52 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x p add dir in src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp tmpl \
>>>       proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x s add proto esp src 14.0.0.70 dst 14.0.0.52 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp offload dev 
>>> eth4 dir out
>>>    ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp offload dev 
>>> eth4 dir in
>>>
>>> In both cases, the command "ip x s flush ; ip x p flush" will clean
>>> it all out and remove the offloads.
>>>
>>> Lastly, thanks to Alex Duyck for his early comments.
>>>
>>> Please see the individual patches for specific update info.
>>>
>>> v3: fixes after comments from those wonderfully pesky kbuild robots
>>> v2: fixes after comments from Alex
>>>
>>> Shannon Nelson (10):
>>>    ixgbe: clean up ipsec defines
>>>    ixgbe: add ipsec register access routines
>>>    ixgbe: add ipsec engine start and stop routines
>>>    ixgbe: add ipsec data structures
>>>    ixgbe: add ipsec offload add and remove SA
>>>    ixgbe: restore offloaded SAs after a reset
>>>    ixgbe: process the Rx ipsec offload
>>>    ixgbe: process the Tx ipsec offload
>>>    ixgbe: ipsec offload stats
>>>    ixgbe: register ipsec offload with the xfrm subsystem
>>>
>>>   drivers/net/ethernet/intel/ixgbe/Makefile        |   1 +
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe.h         |  33 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   2 +
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c   | 923 
>>> +++++++++++++++++++++++
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h   |  92 +++
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c     |   4 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  39 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_type.h    |  22 +-
>>>   8 files changed, 1093 insertions(+), 23 deletions(-)
>>>   create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
>>>   create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h
>>>
>>
>>
> 

^ permalink raw reply

* Re: [PATCH v3 1/3] net: ibm: emac: replace custom rgmii_mode_name with phy_modes
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <a9482f4f4037f6eb732de327290a432539648bcd.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:48 +0100

> phy_modes() in the common phy.h already defines the same phy mode
> names in lower case. The deleted rgmii_mode_name() is used only
> in one place and for a "notice-level" printk. Hence, it will not
> be missed.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v3 2/3] net: ibm: emac: replace custom PHY_MODE_* macros
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <2cb74d50c22d01873d1d976ec384917dc799be08.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:49 +0100

> The ibm_emac driver predates the PHY_INTERFACE_MODE_*
> enums by a few years.
> 
> And while the driver has been retrofitted to use the PHYLIB,
> the old definitions have stuck around to this day.
> 
> This patch replaces all occurrences of PHY_MODE_* with
> the respective equivalent PHY_INTERFACE_MODE_* enum.
> And finally, it purges the old macros for good.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v3 3/3] net: ibm: emac: support RGMII-[RX|TX]ID phymode
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <de458d66be6e804a45e1bd96e57aa5907bb98e03.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:50 +0100

> The RGMII spec allows compliance for devices that implement an internal
> delay on TXC and/or RXC inside the transmitter. This patch adds the
> necessary RGMII_[RX|TX]ID mode code to handle such PHYs with the
> emac driver.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* [PATCH next-queue] ixgbe: no ipsec offload for 82598
From: Shannon Nelson @ 2017-12-21 18:21 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: steffen.klassert, netdev

Don't try to set up ipsec offload on the oldest part of
the ixgbe family.

Suggested-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 424dbf7..12c7132 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -863,6 +863,9 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 	struct ixgbe_ipsec *ipsec;
 	size_t size;
 
+	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
+		return;
+
 	ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL);
 	if (!ipsec)
 		goto err1;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next] tcp: md5: Handle RCU dereference of md5sig_info
From: Mat Martineau @ 2017-12-21 18:29 UTC (permalink / raw)
  To: netdev; +Cc: Mat Martineau
In-Reply-To: <20171221182910.4785-1-mathew.j.martineau@linux.intel.com>

Dereference tp->md5sig_info in tcp_v4_destroy_sock() the same way it is
done in the adjacent call to tcp_clear_md5_list().

Resolves this sparse warning:

net/ipv4/tcp_ipv4.c:1914:17: warning: incorrect type in argument 1 (different address spaces)
net/ipv4/tcp_ipv4.c:1914:17:    expected struct callback_head *head
net/ipv4/tcp_ipv4.c:1914:17:    got struct callback_head [noderef] <asn:4>*<noident>

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dd945b114215..5d203248123e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	/* Clean up the MD5 key list, if any */
 	if (tp->md5sig_info) {
 		tcp_clear_md5_list(sk);
-		kfree_rcu(tp->md5sig_info, rcu);
+		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
 		tp->md5sig_info = NULL;
 	}
 #endif
-- 
2.15.1

^ permalink raw reply related

* [PATCH net] tcp: Avoid preprocessor directives in tracepoint macro args
From: Mat Martineau @ 2017-12-21 18:29 UTC (permalink / raw)
  To: netdev; +Cc: Mat Martineau, David Ahern

Using a preprocessor directive to check for CONFIG_IPV6 in the middle of
a DECLARE_EVENT_CLASS macro's arg list causes sparse to report a series
of errors:

./include/trace/events/tcp.h:68:1: error: directive in argument list
./include/trace/events/tcp.h:75:1: error: directive in argument list
./include/trace/events/tcp.h:144:1: error: directive in argument list
./include/trace/events/tcp.h:151:1: error: directive in argument list
./include/trace/events/tcp.h:216:1: error: directive in argument list
./include/trace/events/tcp.h:223:1: error: directive in argument list
./include/trace/events/tcp.h:274:1: error: directive in argument list
./include/trace/events/tcp.h:281:1: error: directive in argument list

Once sparse finds an error, it stops printing warnings for the file it
is checking. This masks any sparse warnings that would normally be
reported for the core TCP code.

Instead, handle the preprocessor conditionals in a couple of auxiliary
macros. This also has the benefit of reducing duplicate code.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 include/trace/events/tcp.h | 97 ++++++++++++++++++----------------------------
 1 file changed, 37 insertions(+), 60 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..ab34c561f26b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -25,6 +25,35 @@
 		tcp_state_name(TCP_CLOSING),		\
 		tcp_state_name(TCP_NEW_SYN_RECV))
 
+#define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
+	do {							\
+		struct in6_addr *pin6;				\
+								\
+		pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+		ipv6_addr_set_v4mapped(saddr, pin6);		\
+		pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+		ipv6_addr_set_v4mapped(daddr, pin6);		\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)		\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct in6_addr *pin6;				\
+									\
+			pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+			*pin6 = saddr6;					\
+			pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+			*pin6 = daddr6;					\
+		} else {						\
+			TP_STORE_V4MAPPED(__entry, saddr, daddr);	\
+		}							\
+	} while (0)
+#else
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)	\
+	TP_STORE_V4MAPPED(__entry, saddr, daddr)
+#endif
+
 /*
  * tcp event with arguments sk and skb
  *
@@ -50,7 +79,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skbaddr = skb;
@@ -65,20 +93,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -127,7 +143,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -141,20 +156,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -197,7 +200,6 @@ TRACE_EVENT(tcp_set_state,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -213,20 +215,8 @@ TRACE_EVENT(tcp_set_state,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
@@ -256,7 +246,6 @@ TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_fast_assign(
 		struct inet_request_sock *ireq = inet_rsk(req);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -271,20 +260,8 @@ TRACE_EVENT(tcp_retransmit_synack,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 = ireq->ir_rmt_addr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = ireq->ir_v6_loc_addr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = ireq->ir_v6_rmt_addr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
+			      ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
-- 
2.15.1

^ permalink raw reply related

* Re: [Patch net] net_sched: fix a missing rcu barrier in mini_qdisc_pair_swap()
From: Cong Wang @ 2017-12-21 19:01 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: Linux Kernel Network Developers, Jiri Pirko, John Fastabend
In-Reply-To: <20171221090313.GB1930@nanopsycho>

On Thu, Dec 21, 2017 at 1:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Thu, Dec 21, 2017 at 08:26:24AM CET, xiyou.wangcong@gmail.com wrote:
>>The rcu_barrier_bh() in mini_qdisc_pair_swap() is to wait for
>>flying RCU callback installed by a previous mini_qdisc_pair_swap(),
>>however we miss it on the tp_head==NULL path, which leads to that
>>the RCU callback still uses miniq_old->rcu after it is freed together
>>with qdisc in qdisc_graft(). So just add it on that path too.
>>
>>Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath ")
>
> This fixes:
> 752fbcc33405 ("net_sched: no need to free qdisc in RCU callback")
>
> Before that, the issue was not there as the qdisc struct got removed
> after a grace period.


This is nonsense. You have to read the stack trace from Jakub again
and tell me why you keep believing any RCU reader is involved.

I am pretty sure no one reported any crash between commit
752fbcc33405 and 46209401f8f6.


>
>
>>Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>Tested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>Cc: Jiri Pirko <jiri@mellanox.com>
>>Cc: John Fastabend <john.fastabend@gmail.com>
>>Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
>>---
>> net/sched/sch_generic.c | 4 +++-
>> 1 file changed, 3 insertions(+), 1 deletion(-)
>>
>>diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
>>index cd1b200acae7..661c7144b53a 100644
>>--- a/net/sched/sch_generic.c
>>+++ b/net/sched/sch_generic.c
>>@@ -1040,6 +1040,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
>>
>>       if (!tp_head) {
>>               RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
>>+              /* Wait for flying RCU callback before it is freed. */
>>+              rcu_barrier_bh();
>
>
>>               return;
>>       }
>>
>>@@ -1055,7 +1057,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
>>       rcu_assign_pointer(*miniqp->p_miniq, miniq);
>>
>>       if (miniq_old)
>>-              /* This is counterpart of the rcu barrier above. We need to
>>+              /* This is counterpart of the rcu barriers above. We need to
>
> This is incorrect. Here we block in order to not use the same miniq
> again in scenario
>
> miniq1 (X)
> miniq2
> miniq1 (yet there are reader using X)
>
> This call_rcu has 0 relation to the barrier you are adding.


Seriously? It is this call_rcu still flying after we free the qdisc.
Did you seriously look into the stack trace from Jakub?


>
>
> But again, why don't we just free qdisc in call_rcu and avoid the
> barrier?


Nonsense again. Why should qdisc code be adjusted for your
miniq code? It is your own responsibility to take care of this shit.
Don't spread it out of miniq.

^ permalink raw reply

* Re: [net-next: PATCH 0/8] Armada 7k/8k PP2 ACPI support
From: Antoine Tenart @ 2017-12-21 19:21 UTC (permalink / raw)
  To: Marcin Wojtas
  Cc: linux-kernel, linux-arm-kernel, netdev, davem, linux,
	rafael.j.wysocki, andrew, f.fainelli, antoine.tenart,
	thomas.petazzoni, gregory.clement, ezequiel.garcia, nadavh, neta,
	ard.biesheuvel, jaz, tn
In-Reply-To: <1513588684-15647-1-git-send-email-mw@semihalf.com>

Hi Marcin,

On Mon, Dec 18, 2017 at 10:17:56AM +0100, Marcin Wojtas wrote:
> 
> Marcin Wojtas (8):
>   device property: Introduce fwnode_get_mac_address()
>   device property: Introduce fwnode_get_phy_mode()
>   mdio_bus: Introduce fwnode MDIO helpers
>   net: mvmdio: add ACPI support
>   net: mvpp2: simplify maintaining enabled ports' list
>   net: mvpp2: use device_*/fwnode_* APIs instead of of_*
>   net: mvpp2: handle PHY with its fwnode
>   net: mvpp2: enable ACPI support in the driver


I tested your series on a mcbin, using the dt way. It still worked. If
it is relevant, you can add on the mvpp2 related patches:

Tested-by: Antoine Tenart <antoine.tenart@free-electrons.com>

Thanks!

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* [PATCH v5 net-next 0/7] net: ILA notification mechanism and fixes
From: Tom Herbert @ 2017-12-21 19:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert

This patch set adds support to get netlink notifications for ILA 
routes when a route is used.

This patch set contains:

- General infrastructure for route notifications
- The ILA route notification mechanism
- Add net to ila build_state
- Add flush command to ila_xlat
- Fix use of rhashtable for latest fixes

Route notifications will be used in conjunction with populating
ILA forwarding caches. There are three methods described in
the ILA Mapping Protocol. These are redirects, request/reply,
and push. The ILA route mechanism is relevant to the first two methods.

  - ILA router secure redirect mechanism-- This is used on an ILA
    router where a notification is sent when an ILA host route is
    used. The purpose of this notification is to send an
    ILA redirect towards the ILA forwarding node of a source to
    inform it of a direct ILA route. When the forwarding node
    receives the redirect it can populate its cache so that
    subsequent packets take the direct path. This is the
    RECOMMENDED method.

  - Cache address resolution-- This is used to perform request/reply
    address resolution on a route. As noted on netdev list, a
    request/reply mechanism is susceptible to DOS attacks.
    For this reason, this method is NOT RECOMMENDED as the
    primary means to populate an ILA cache.

ILAMP is described in 
https://www.ietf.org/internet-drafts/draft-herbert-ila-ilamp-00.txt

Tested:

Ran ILA traffic, set up ILA notify routes and observed correct
routing message via ip monitor.

v5:
 - Fix some compiler and sparse warnings
 - Generalize route notify with RTM_NOTIFYROUTE,
   RTNLGRP_ROUTE_NOTIFY (suggested by Roopa)

v4:
 - Remove front end cache per davem feedback
 - Eliminate separate LWT type just use ILA LWT already in place

v3:
 - Removed rhashtable changes to their own patch set
 - Restructure ILA code to be more amenable to changes
 - Remove extra call back functions in resolution interface

Changes from initial RFC:

 - Added net argument to LWT build_state
 - Made resolve timeout an attribute of the LWT encap route
 - Changed ILA notifications to be regular routing messages of event
   RTM_ADDR_RESOLVE, family RTNL_FAMILY_ILA, and group
   RTNLGRP_ILA_NOTIFY

Tom Herbert (7):
  lwt: Add net to build_state argument
  rtnetlink: Add notify route message types
  ila: Fix use of rhashtable walk in ila_xlat.c
  ila: Call library function alloc_bucket_locks
  ila: Create main ila source file
  ila: Flush netlink command to clear xlat table
  ila: Route notify

 include/net/lwtunnel.h         |   6 +-
 include/uapi/linux/ila.h       |   3 +
 include/uapi/linux/rtnetlink.h |   6 +
 net/core/lwt_bpf.c             |   2 +-
 net/core/lwtunnel.c            |   4 +-
 net/ipv4/fib_semantics.c       |  13 +-
 net/ipv4/ip_tunnel_core.c      |   4 +-
 net/ipv6/ila/Makefile          |   2 +-
 net/ipv6/ila/ila.h             |  27 +++-
 net/ipv6/ila/ila_common.c      |  30 -----
 net/ipv6/ila/ila_lwt.c         | 275 ++++++++++++++++++++++++++------------
 net/ipv6/ila/ila_main.c        | 121 +++++++++++++++++
 net/ipv6/ila/ila_xlat.c        | 290 ++++++++++++++++++++---------------------
 net/ipv6/route.c               |   2 +-
 net/ipv6/seg6_iptunnel.c       |   2 +-
 net/ipv6/seg6_local.c          |   5 +-
 net/mpls/mpls_iptunnel.c       |   2 +-
 17 files changed, 511 insertions(+), 283 deletions(-)
 create mode 100644 net/ipv6/ila/ila_main.c

-- 
2.11.0

^ permalink raw reply

* [PATCH v5 net-next 1/7] lwt: Add net to build_state argument
From: Tom Herbert @ 2017-12-21 19:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171221193332.15303-1-tom@quantonium.net>

Users of LWT need to know net if they want to have per net operations
in LWT.

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/lwtunnel.h    |  6 +++---
 net/core/lwt_bpf.c        |  2 +-
 net/core/lwtunnel.c       |  4 ++--
 net/ipv4/fib_semantics.c  | 13 ++++++++-----
 net/ipv4/ip_tunnel_core.c |  4 ++--
 net/ipv6/ila/ila_lwt.c    |  2 +-
 net/ipv6/route.c          |  2 +-
 net/ipv6/seg6_iptunnel.c  |  2 +-
 net/ipv6/seg6_local.c     |  5 +++--
 net/mpls/mpls_iptunnel.c  |  2 +-
 10 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index d747ef975cd8..da5e51e0d122 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -34,7 +34,7 @@ struct lwtunnel_state {
 };
 
 struct lwtunnel_encap_ops {
-	int (*build_state)(struct nlattr *encap,
+	int (*build_state)(struct net *net, struct nlattr *encap,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack);
@@ -113,7 +113,7 @@ int lwtunnel_valid_encap_type(u16 encap_type,
 			      struct netlink_ext_ack *extack);
 int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
 				   struct netlink_ext_ack *extack);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
 			 struct nlattr *encap,
 			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws,
@@ -192,7 +192,7 @@ static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
 	return 0;
 }
 
-static inline int lwtunnel_build_state(u16 encap_type,
+static inline int lwtunnel_build_state(struct net *net, u16 encap_type,
 				       struct nlattr *encap,
 				       unsigned int family, const void *cfg,
 				       struct lwtunnel_state **lws,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e7e626fb87bb..3a3ac13fcf06 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -238,7 +238,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
 	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 0b171756453c..b3f2f77dfe72 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -103,7 +103,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
 }
 EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
 
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
 			 struct nlattr *encap, unsigned int family,
 			 const void *cfg, struct lwtunnel_state **lws,
 			 struct netlink_ext_ack *extack)
@@ -124,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type,
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
 		found = true;
-		ret = ops->build_state(encap, family, cfg, lws, extack);
+		ret = ops->build_state(net, encap, family, cfg, lws, extack);
 		if (ret)
 			module_put(ops->owner);
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f04d944f8abe..4979e5c6b9b8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -523,6 +523,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			if (nla) {
 				struct lwtunnel_state *lwtstate;
 				struct nlattr *nla_entype;
+				struct net *net = cfg->fc_nlinfo.nl_net;
 
 				nla_entype = nla_find(attrs, attrlen,
 						      RTA_ENCAP_TYPE);
@@ -533,7 +534,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 					goto err_inval;
 				}
 
-				ret = lwtunnel_build_state(nla_get_u16(
+				ret = lwtunnel_build_state(net, nla_get_u16(
 							   nla_entype),
 							   nla,  AF_INET, cfg,
 							   &lwtstate, extack);
@@ -607,7 +608,7 @@ static void fib_rebalance(struct fib_info *fi)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
 			   struct nlattr *encap,
 			   const struct fib_nh *nh,
 			   const struct fib_config *cfg,
@@ -619,7 +620,7 @@ static int fib_encap_match(u16 encap_type,
 	if (encap_type == LWTUNNEL_ENCAP_NONE)
 		return 0;
 
-	ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+	ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
 				   cfg, &lwtstate, extack);
 	if (!ret) {
 		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -632,6 +633,7 @@ static int fib_encap_match(u16 encap_type,
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		 struct netlink_ext_ack *extack)
 {
+	struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	struct rtnexthop *rtnh;
 	int remaining;
@@ -642,7 +644,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if (cfg->fc_encap) {
-			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
+			if (fib_encap_match(net, cfg->fc_encap_type,
+					    cfg->fc_encap,
 					    fi->fib_nh, cfg, extack))
 				return 1;
 		}
@@ -1180,7 +1183,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 					       "LWT encap type not specified");
 				goto err_inval;
 			}
-			err = lwtunnel_build_state(cfg->fc_encap_type,
+			err = lwtunnel_build_state(net, cfg->fc_encap_type,
 						   cfg->fc_encap, AF_INET, cfg,
 						   &lwtstate, extack);
 			if (err)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..32e05aa6117d 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -228,7 +228,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 };
 
-static int ip_tun_build_state(struct nlattr *attr,
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
 			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts,
 			      struct netlink_ext_ack *extack)
@@ -327,7 +327,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 };
 
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
 			       unsigned int family, const void *cfg,
 			       struct lwtunnel_state **ts,
 			       struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 3d56a2fb6f86..9f1e46a1468e 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
 };
 
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b3f4d19b3ca5..0e0cc97e8f42 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2565,7 +2565,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (cfg->fc_encap) {
 		struct lwtunnel_state *lwtstate;
 
-		err = lwtunnel_build_state(cfg->fc_encap_type,
+		err = lwtunnel_build_state(net, cfg->fc_encap_type,
 					   cfg->fc_encap, AF_INET6, cfg,
 					   &lwtstate, extack);
 		if (err)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index bd6cc688bd19..a6cf2fba15f3 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -359,7 +359,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return err;
 }
 
-static int seg6_build_state(struct nlattr *nla,
+static int seg6_build_state(struct net *net, struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts,
 			    struct netlink_ext_ack *extack)
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 825b8e01f947..45dc670c5a93 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -779,8 +779,9 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
 	return 0;
 }
 
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
-				  const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+				  unsigned int family, const void *cfg,
+				  struct lwtunnel_state **ts,
 				  struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[SEG6_LOCAL_MAX + 1];
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 6e558a419f60..c947310cc04f 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -157,7 +157,7 @@ static int mpls_xmit(struct sk_buff *skb)
 	return -EINVAL;
 }
 
-static int mpls_build_state(struct nlattr *nla,
+static int mpls_build_state(struct net *net, struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts,
 			    struct netlink_ext_ack *extack)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v5 net-next 2/7] rtnetlink: Add notify route message types
From: Tom Herbert @ 2017-12-21 19:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171221193332.15303-1-tom@quantonium.net>

Add notify route message and notify rtnl group. This is used to send
a notification about a route. For example, this will be used with ILA
to notify a daemon to send an ILA redirect.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/uapi/linux/rtnetlink.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 843e29aa3cac..ee955c7ca48a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -150,6 +150,9 @@ enum {
 	RTM_NEWCACHEREPORT = 96,
 #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT
 
+	RTM_NOTIFYROUTE = 98,
+#define RTM_NOTIFYROUTE RTM_NOTIFYROUTE
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -677,6 +680,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_MROUTE_R	RTNLGRP_IPV4_MROUTE_R
 	RTNLGRP_IPV6_MROUTE_R,
 #define RTNLGRP_IPV6_MROUTE_R	RTNLGRP_IPV6_MROUTE_R
+	RTNLGRP_ROUTE_NOTIFY,
+#define RTNLGRP_ROUTE_NOTIFY	RTNLGRP_ROUTE_NOTIFY
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v5 net-next 3/7] ila: Fix use of rhashtable walk in ila_xlat.c
From: Tom Herbert @ 2017-12-21 19:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171221193332.15303-1-tom@quantonium.net>

Perform better EAGAIN handling, handle case where ila_dump_info
fails and we missed objects in the dump, and add a skip index
to skip over ila entries in a list on a rhashtable node that have
already been visited (by a previous call to ila_nl_dump).

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv6/ila/ila_xlat.c | 70 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 44c39c5f0638..887dd5b785b5 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -474,24 +474,31 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
 
 struct ila_dump_iter {
 	struct rhashtable_iter rhiter;
+	int skip;
 };
 
 static int ila_nl_dump_start(struct netlink_callback *cb)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
-	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+	struct ila_dump_iter *iter;
+	int ret;
 
-	if (!iter) {
-		iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-		if (!iter)
-			return -ENOMEM;
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
 
-		cb->args[0] = (long)iter;
+	ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+				   GFP_KERNEL);
+	if (ret) {
+		kfree(iter);
+		return ret;
 	}
 
-	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
-				    GFP_KERNEL);
+	iter->skip = 0;
+	cb->args[0] = (long)iter;
+
+	return ret;
 }
 
 static int ila_nl_dump_done(struct netlink_callback *cb)
@@ -509,20 +516,45 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
 	struct rhashtable_iter *rhiter = &iter->rhiter;
+	int skip = iter->skip;
 	struct ila_map *ila;
 	int ret;
 
 	rhashtable_walk_start(rhiter);
 
-	for (;;) {
-		ila = rhashtable_walk_next(rhiter);
+	/* Get first entry */
+	ila = rhashtable_walk_peek(rhiter);
+
+	if (ila && !IS_ERR(ila) && skip) {
+		/* Skip over visited entries */
+
+		while (ila && skip) {
+			/* Skip over any ila entries in this list that we
+			 * have already dumped.
+			 */
+			ila = rcu_access_pointer(ila->next);
+			skip--;
+		}
+	}
 
+	skip = 0;
+
+	for (;;) {
 		if (IS_ERR(ila)) {
-			if (PTR_ERR(ila) == -EAGAIN)
-				continue;
 			ret = PTR_ERR(ila);
-			goto done;
+			if (ret == -EAGAIN) {
+				/* Table has changed and iter has reset. Return
+				 * -EAGAIN to the application even if we have
+				 * written data to the skb. The application
+				 * needs to deal with this.
+				 */
+
+				goto out_ret;
+			} else {
+				break;
+			}
 		} else if (!ila) {
+			ret = 0;
 			break;
 		}
 
@@ -531,15 +563,21 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					     cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					     skb, ILA_CMD_GET);
 			if (ret)
-				goto done;
+				goto out;
 
+			skip++;
 			ila = rcu_access_pointer(ila->next);
 		}
+
+		skip = 0;
+		ila = rhashtable_walk_next(rhiter);
 	}
 
-	ret = skb->len;
+out:
+	iter->skip = skip;
+	ret = (skb->len ? : ret);
 
-done:
+out_ret:
 	rhashtable_walk_stop(rhiter);
 	return ret;
 }
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox