Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next V3 7/9] tun: support receiving skb through msg_control
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

This patch makes tun_recvmsg() can receive from skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3cbfc5c..f8041f9c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1510,9 +1510,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			   struct iov_iter *to,
-			   int noblock)
+			   int noblock, struct sk_buff *skb)
 {
-	struct sk_buff *skb;
 	ssize_t ret;
 	int err;
 
@@ -1521,10 +1520,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	if (!iov_iter_count(to))
 		return 0;
 
-	/* Read frames from ring */
-	skb = tun_ring_recv(tfile, noblock, &err);
-	if (!skb)
-		return err;
+	if (!skb) {
+		/* Read frames from ring */
+		skb = tun_ring_recv(tfile, noblock, &err);
+		if (!skb)
+			return err;
+	}
 
 	ret = tun_put_user(tun, tfile, skb, to);
 	if (unlikely(ret < 0))
@@ -1544,7 +1545,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 	if (!tun)
 		return -EBADFD;
-	ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK);
+	ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
@@ -1646,7 +1647,8 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
+	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+			  m->msg_control);
 	if (ret > (ssize_t)total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 6/9] tap: export skb_array
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

This patch exports skb_array through tap_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tap.c      | 13 +++++++++++++
 include/linux/if_tap.h |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d4173d..abdaf86 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1193,6 +1193,19 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+struct skb_array *tap_get_skb_array(struct file *file)
+{
+	struct tap_queue *q;
+
+	if (file->f_op != &tap_fops)
+		return ERR_PTR(-EINVAL);
+	q = file->private_data;
+	if (!q)
+		return ERR_PTR(-EBADFD);
+	return &q->skb_array;
+}
+EXPORT_SYMBOL_GPL(tap_get_skb_array);
+
 int tap_queue_resize(struct tap_dev *tap)
 {
 	struct net_device *dev = tap->dev;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 3482c3c..4837157 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -3,6 +3,7 @@
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+struct skb_array *tap_get_skb_array(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -12,6 +13,10 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tap_get_skb_array(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TAP */
 
 #include <net/sock.h>
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 5/9] tun: export skb_array
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

This patch exports skb_array through tun_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c      | 13 +++++++++++++
 include/linux/if_tun.h |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bbd707b..3cbfc5c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2626,6 +2626,19 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+struct skb_array *tun_get_skb_array(struct file *file)
+{
+	struct tun_file *tfile;
+
+	if (file->f_op != &tun_fops)
+		return ERR_PTR(-EINVAL);
+	tfile = file->private_data;
+	if (!tfile)
+		return ERR_PTR(-EBADFD);
+	return &tfile->tx_array;
+}
+EXPORT_SYMBOL_GPL(tun_get_skb_array);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index ed6da2e..bf9bdf4 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,7 @@
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+struct skb_array *tun_get_skb_array(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -28,5 +29,9 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tun_get_skb_array(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 4/9] skb_array: introduce batch dequeuing
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/skb_array.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 79850b6..35226cd 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -97,21 +97,46 @@ static inline struct sk_buff *skb_array_consume(struct skb_array *a)
 	return ptr_ring_consume(&a->ring);
 }
 
+static inline int skb_array_consume_batched(struct skb_array *a,
+					    struct sk_buff **array, int n)
+{
+	return ptr_ring_consume_batched(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
 {
 	return ptr_ring_consume_irq(&a->ring);
 }
 
+static inline int skb_array_consume_batched_irq(struct skb_array *a,
+						struct sk_buff **array, int n)
+{
+	return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
 {
 	return ptr_ring_consume_any(&a->ring);
 }
 
+static inline int skb_array_consume_batched_any(struct skb_array *a,
+						struct sk_buff **array, int n)
+{
+	return ptr_ring_consume_batched_any(&a->ring, (void **)array, n);
+}
+
+
 static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
 {
 	return ptr_ring_consume_bh(&a->ring);
 }
 
+static inline int skb_array_consume_batched_bh(struct skb_array *a,
+					       struct sk_buff **array, int n)
+{
+	return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n);
+}
+
 static inline int __skb_array_len_with_tag(struct sk_buff *skb)
 {
 	if (likely(skb)) {
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 3/9] ptr_ring: introduce batch dequeuing
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

This patch introduce a batched version of consuming, consumer can
dequeue more than one pointers from the ring at a time. We don't care
about the reorder of reading here so no need for compiler barrier.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/ptr_ring.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 5476f68..9db39e6 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -247,6 +247,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
 	return ptr;
 }
 
+static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
+					     void **array, int n)
+{
+	void *ptr;
+	int i;
+
+	for (i = 0; i < n; i++) {
+		ptr = __ptr_ring_consume(r);
+		if (!ptr)
+			break;
+		array[i] = ptr;
+	}
+
+	return i;
+}
+
 /*
  * Note: resize (below) nests producer lock within consumer lock, so if you
  * call this in interrupt or BH context, you must disable interrupts/BH when
@@ -297,6 +313,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
 	return ptr;
 }
 
+static inline int ptr_ring_consume_batched(struct ptr_ring *r,
+					   void **array, int n)
+{
+	int ret;
+
+	spin_lock(&r->consumer_lock);
+	ret = __ptr_ring_consume_batched(r, array, n);
+	spin_unlock(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
+					       void **array, int n)
+{
+	int ret;
+
+	spin_lock_irq(&r->consumer_lock);
+	ret = __ptr_ring_consume_batched(r, array, n);
+	spin_unlock_irq(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
+					       void **array, int n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&r->consumer_lock, flags);
+	ret = __ptr_ring_consume_batched(r, array, n);
+	spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+	return ret;
+}
+
+static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
+					      void **array, int n)
+{
+	int ret;
+
+	spin_lock_bh(&r->consumer_lock);
+	ret = __ptr_ring_consume_batched(r, array, n);
+	spin_unlock_bh(&r->consumer_lock);
+
+	return ret;
+}
+
 /* Cast to structure type and call a function without discarding from FIFO.
  * Function must return a value.
  * Callers must take consumer_lock.
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 2/9] skb_array: introduce skb_array_unconsume
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/skb_array.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade..79850b6 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -156,6 +156,12 @@ static void __skb_array_destroy_skb(void *ptr)
 	kfree_skb(ptr);
 }
 
+static inline void skb_array_unconsume(struct skb_array *a,
+				       struct sk_buff **skbs, int n)
+{
+	ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb);
+}
+
 static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 {
 	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 1/9] ptr_ring: add ptr_ring_unconsume
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang
In-Reply-To: <1494383881-6811-1-git-send-email-jasowang@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>

Applications that consume a batch of entries in one go
can benefit from ability to return some of them back
into the ring.

Add an API for that - assuming there's space. If there's no space
naturally can't do this and have to drop entries, but this implies ring
is full so we'd likely drop some anyway.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/ptr_ring.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6c70444..5476f68 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -359,6 +359,61 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
 	return 0;
 }
 
+/*
+ * Return entries into ring. Destroy entries that don't fit.
+ *
+ * Note: this is expected to be a rare slow path operation.
+ *
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
+static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
+				      void (*destroy)(void *))
+{
+	unsigned long flags;
+	int head;
+
+	spin_lock_irqsave(&r->consumer_lock, flags);
+	spin_lock(&r->producer_lock);
+
+	if (!r->size)
+		goto done;
+
+	/*
+	 * Clean out buffered entries (for simplicity). This way following code
+	 * can test entries for NULL and if not assume they are valid.
+	 */
+	head = r->consumer_head - 1;
+	while (likely(head >= r->consumer_tail))
+		r->queue[head--] = NULL;
+	r->consumer_tail = r->consumer_head;
+
+	/*
+	 * Go over entries in batch, start moving head back and copy entries.
+	 * Stop when we run into previously unconsumed entries.
+	 */
+	while (n) {
+		head = r->consumer_head - 1;
+		if (head < 0)
+			head = r->size - 1;
+		if (r->queue[head]) {
+			/* This batch entry will have to be destroyed. */
+			goto done;
+		}
+		r->queue[head] = batch[--n];
+		r->consumer_tail = r->consumer_head = head;
+	}
+
+done:
+	/* Destroy all entries left in the batch. */
+	while (n)
+		destroy(batch[--n]);
+	spin_unlock(&r->producer_lock);
+	spin_unlock_irqrestore(&r->consumer_lock, flags);
+}
+
 static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
 					   int size, gfp_t gfp,
 					   void (*destroy)(void *))
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V3 0/9] vhost_net rx batch dequeuing
From: Jason Wang @ 2017-05-10  2:37 UTC (permalink / raw)
  To: netdev, linux-kernel, mst; +Cc: Jason Wang

This series tries to implement rx batching for vhost-net. This is done
by batching the dequeuing from skb_array which was exported by
underlayer socket and pass the sbk back through msg_control to finish
userspace copying. This is also the requirement for more batching
implemention on rx path.

Tests shows at most 8.8% improvment bon rx pps on top of batch zeroing.

Please review.

Thanks

Changes from V2:
- rebase to net-next HEAD
- use unconsume helpers to put skb back on releasing
- introduce and use vhost_net internal buffer helpers
- renew performance numbers on top of batch zeroing

Changes from V1:
- switch to use for() in __ptr_ring_consume_batched()
- rename peek_head_len_batched() to fetch_skbs()
- use skb_array_consume_batched() instead of
  skb_array_consume_batched_bh() since no consumer run in bh
- drop the lockless peeking patch since skb_array could be resized, so
  it's not safe to call lockless one

Jason Wang (8):
  skb_array: introduce skb_array_unconsume
  ptr_ring: introduce batch dequeuing
  skb_array: introduce batch dequeuing
  tun: export skb_array
  tap: export skb_array
  tun: support receiving skb through msg_control
  tap: support receiving skb from msg_control
  vhost_net: try batch dequing from skb array

Michael S. Tsirkin (1):
  ptr_ring: add ptr_ring_unconsume

 drivers/net/tap.c         |  25 ++++++++--
 drivers/net/tun.c         |  31 ++++++++----
 drivers/vhost/net.c       | 117 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/if_tap.h    |   5 ++
 include/linux/if_tun.h    |   5 ++
 include/linux/ptr_ring.h  | 120 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/skb_array.h |  31 ++++++++++++
 7 files changed, 316 insertions(+), 18 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualizaiton (Was: "Re: net_sched strange in 4.11")
From: Jason Wang @ 2017-05-10  2:18 UTC (permalink / raw)
  To: Stefan Hajnoczi, Anton Ivanov; +Cc: David S. Miller, netdev, Michael S. Tsirkin
In-Reply-To: <20170509151109.GH16825@stefanha-x1.localdomain>



On 2017年05月09日 23:11, Stefan Hajnoczi wrote:
> On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:
>> I have figured it out. Two issues.
>>
>> 1) skb->xmit_more is hardly ever set under virtualization because the qdisc
>> is usually bypassed because of TCQ_F_CAN_BYPASS. Once TCQ_F_CAN_BYPASS is
>> set a virtual NIC driver is not likely see skb->xmit_more (this answers my
>> "how does this work at all" question).
>>
>> 2) If that flag is turned off (I patched sched_generic to turn it off in
>> pfifo_fast while testing), DQL keeps xmit_more from being set. If the driver
>> is not DQL enabled xmit_more is never ever set. If the driver is DQL enabled
>> the queue is adjusted to ensure xmit_more stops happening within 10-15 xmit
>> cycles.
>>
>> That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc. There,
>> the BIG cost is telling the hypervisor that it needs to "kick" the packets.
>> The cost of putting them into the vNIC buffers is negligible. You want
>> xmit_more to happen - it makes between 50% and 300% (depending on vNIC
>> design) difference. If there is no xmit_more the vNIC will immediately
>> "kick" the hypervisor and try to signal that  the packet needs to move
>> straight away (as for example in virtio_net).

How do you measure the performance? TCP or just measure pps?

>>
>> In addition to that, the perceived line rate is proportional to this cost,
>> so I am not sure that the current dql math holds. In fact, I think it does
>> not - it is trying to adjust something which influences the perceived line
>> rate.
>>
>> So - how do we turn BOTH bypass and DQL adjustment while under
>> virtualization and set them to be "always qdisc" + "always xmit_more
>> allowed"

Virtio-net net does not support BQL. Before commit ea7735d97ba9 
("virtio-net: move free_old_xmit_skbs"), it's even impossible to support 
that since we don't have tx interrupt for each packet.  I haven't 
measured the impact of xmit_more, maybe I was wrong but I think it may 
help in some cases since it may improve the batching on host more or less.

Thanks

>>
>> A.
>>
>> P.S. Cc-ing virtio maintainer
> CCing Michael Tsirkin and Jason Wang, who are the core virtio and
> virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
> to this discussion.)
>
>> A.
>>
>>
>> On 08/05/17 08:15, Anton Ivanov wrote:
>>> Hi all,
>>>
>>> I was revising some of my old work for UML to prepare it for submission
>>> and I noticed that skb->xmit_more does not seem to be set any more.
>>>
>>> I traced the issue as far as net/sched/sched_generic.c
>>>
>>> try_bulk_dequeue_skb() is never invoked (the drivers I am working on are
>>> dql enabled so that is not the problem).
>>>
>>> More interestingly, if I put a breakpoint and debug output into
>>> dequeue_skb() around line 147 - right before the bulk: tag that skb
>>> there is always NULL. ???
>>>
>>> Similarly, debug in pfifo_fast_dequeue shows only NULLs being dequeued.
>>> Again - ???
>>>
>>> First and foremost, I apologize for the silly question, but how can this
>>> work at all? I see the skbs showing up at the driver level, why are
>>> NULLs being returned at qdisc dequeue and where do the skbs at the
>>> driver level come from?
>>>
>>> Second, where should I look to fix it?
>>>
>>> A.
>>>
>>
>> -- 
>> Anton R. Ivanov
>>
>> Cambridge Greys Limited, England company No 10273661
>> http://www.cambridgegreys.com/
>>

^ permalink raw reply

* Re: [PATCH v2 net-next] bnxt: add dma mapping attributes
From: Michael Chan @ 2017-05-10  2:16 UTC (permalink / raw)
  To: Shannon Nelson; +Cc: David Miller, Netdev, sparclinux
In-Reply-To: <1494379812-282087-1-git-send-email-shannon.nelson@oracle.com>

On Tue, May 9, 2017 at 6:30 PM, Shannon Nelson
<shannon.nelson@oracle.com> wrote:
> On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
> in our Rx path dma mapping in order to get the expected performance out
> of the receive path.  Adding it to the Tx path has little effect, so
> that's not a part of this patch.
>
> Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
> Reviewed-by: Tushar Dave <tushar.n.dave@oracle.com>
> Reviewed-by: Tom Saeger <tom.saeger@oracle.com>
> ---
> v2: simplify by getting rid of the driver-specific ifdef and define

Acked-by: Michael Chan <michael.chan@broadcom.com>

^ permalink raw reply

* Re: [PATCH RFC v2] ptr_ring: add ptr_ring_unconsume
From: Jason Wang @ 2017-05-10  2:01 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: linux-kernel, netdev
In-Reply-To: <20170509162622-mutt-send-email-mst@kernel.org>



On 2017年05月09日 21:26, Michael S. Tsirkin wrote:
> On Wed, Apr 26, 2017 at 05:09:42PM +0800, Jason Wang wrote:
>>
>> On 2017年04月25日 00:01, Michael S. Tsirkin wrote:
>>> Applications that consume a batch of entries in one go
>>> can benefit from ability to return some of them back
>>> into the ring.
>>>
>>> Add an API for that - assuming there's space. If there's no space
>>> naturally can't do this and have to drop entries, but this implies ring
>>> is full so we'd likely drop some anyway.
>>>
>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>> ---
>>>
>>> Jason, if you add this and unconsume the outstanding packets
>>> on backend disconnect, vhost close and reset, I think
>>> we should apply your patch even if we don't yet know 100%
>>> why it helps.
>>>
>>> changes from v1:
>>> - fix up coding style issues reported by Sergei Shtylyov
>>>
>>>
>>>    include/linux/ptr_ring.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 56 insertions(+)
>>>
>>> diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
>>> index 783e7f5..902afc2 100644
>>> --- a/include/linux/ptr_ring.h
>>> +++ b/include/linux/ptr_ring.h
>>> @@ -457,6 +457,62 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
>>>    	return 0;
>>>    }
>>> +/*
>>> + * Return entries into ring. Destroy entries that don't fit.
>>> + *
>>> + * Note: this is expected to be a rare slow path operation.
>>> + *
>>> + * Note: producer lock is nested within consumer lock, so if you
>>> + * resize you must make sure all uses nest correctly.
>>> + * In particular if you consume ring in interrupt or BH context, you must
>>> + * disable interrupts/BH when doing so.
>>> + */
>>> +static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
>>> +				      void (*destroy)(void *))
>>> +{
>>> +	unsigned long flags;
>>> +	int head;
>>> +
>>> +	spin_lock_irqsave(&r->consumer_lock, flags);
>>> +	spin_lock(&r->producer_lock);
>>> +
>>> +	if (!r->size)
>>> +		goto done;
>>> +
>>> +	/*
>>> +	 * Clean out buffered entries (for simplicity). This way following code
>>> +	 * can test entries for NULL and if not assume they are valid.
>>> +	 */
>>> +	head = r->consumer_head - 1;
>>> +	while (likely(head >= r->consumer_tail))
>>> +		r->queue[head--] = NULL;
>>> +	r->consumer_tail = r->consumer_head;
>>> +
>>> +	/*
>>> +	 * Go over entries in batch, start moving head back and copy entries.
>>> +	 * Stop when we run into previously unconsumed entries.
>>> +	 */
>>> +	while (n--) {
>>> +		head = r->consumer_head - 1;
>>> +		if (head < 0)
>>> +			head = r->size - 1;
>>> +		if (r->queue[head]) {
>>> +			/* This batch entry will have to be destroyed. */
>>> +			++n;
>>> +			goto done;
>>> +		}
>>> +		r->queue[head] = batch[n];
>>> +		r->consumer_tail = r->consumer_head = head;
>> Looks like something wrong here (bad page state reported), uncomment the
>> above while() solving the issue. But after staring it for a while I didn't
>> find anything interesting, maybe you have some idea on this?
>>
>> Thanks
>>
>>
>>> +	}
>>> +
>>> +done:
>>> +	/* Destroy all entries left in the batch. */
>>> +	while (n--)
>>> +		destroy(batch[n]);
>>> +	spin_unlock(&r->producer_lock);
>>> +	spin_unlock_irqrestore(&r->consumer_lock, flags);
>>> +}
>>> +
>>>    static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
>>>    					   int size, gfp_t gfp,
>>>    					   void (*destroy)(void *))
> What's our plan here? I can't delay pull request much longer.
>

I'm waiting for net-next to be opened (since the series touches tun/tap).

Let me post a new version soon.

Thanks

^ permalink raw reply

* [PATCH net 1/2] xdp: add flag to enforce driver mode
From: Daniel Borkmann @ 2017-05-10  1:31 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <cover.1494379046.git.daniel@iogearbox.net>

After commit b5cdae3291f7 ("net: Generic XDP") we automatically fall
back to a generic XDP variant if the driver does not support native
XDP. Allow for an option where the user can specify that always the
native XDP variant should be selected and in case it's not supported
by a driver, just bail out.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_link.h       | 6 ++++--
 net/core/dev.c                     | 2 ++
 net/core/rtnetlink.c               | 5 +++++
 samples/bpf/xdp1_user.c            | 8 ++++++--
 samples/bpf/xdp_tx_iptunnel_user.c | 7 ++++++-
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8e56ac7..549ac8a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -888,9 +888,11 @@ enum {
 /* XDP section */
 
 #define XDP_FLAGS_UPDATE_IF_NOEXIST	(1U << 0)
-#define XDP_FLAGS_SKB_MODE		(2U << 0)
+#define XDP_FLAGS_SKB_MODE		(1U << 1)
+#define XDP_FLAGS_DRV_MODE		(1U << 2)
 #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
-					 XDP_FLAGS_SKB_MODE)
+					 XDP_FLAGS_SKB_MODE | \
+					 XDP_FLAGS_DRV_MODE)
 
 enum {
 	IFLA_XDP_UNSPEC,
diff --git a/net/core/dev.c b/net/core/dev.c
index d07aa5f..61443f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6872,6 +6872,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	ASSERT_RTNL();
 
 	xdp_op = ops->ndo_xdp;
+	if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
+		return -EOPNOTSUPP;
 	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
 		xdp_op = generic_xdp_install;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610..dda9f16 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2199,6 +2199,11 @@ static int do_setlink(const struct sk_buff *skb,
 				err = -EINVAL;
 				goto errout;
 			}
+			if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
+			    (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+				err = -EINVAL;
+				goto errout;
+			}
 		}
 
 		if (xdp[IFLA_XDP_FD]) {
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 378850c..17be9ea 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -62,13 +62,14 @@ static void usage(const char *prog)
 	fprintf(stderr,
 		"usage: %s [OPTS] IFINDEX\n\n"
 		"OPTS:\n"
-		"    -S    use skb-mode\n",
+		"    -S    use skb-mode\n"
+		"    -N    enforce native mode\n",
 		prog);
 }
 
 int main(int argc, char **argv)
 {
-	const char *optstr = "S";
+	const char *optstr = "SN";
 	char filename[256];
 	int opt;
 
@@ -77,6 +78,9 @@ int main(int argc, char **argv)
 		case 'S':
 			xdp_flags |= XDP_FLAGS_SKB_MODE;
 			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
 		default:
 			usage(basename(argv[0]));
 			return 1;
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 92b8bde..631cdcc 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -79,6 +79,8 @@ static void usage(const char *cmd)
 	printf("    -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
 	printf("    -T <stop-after-X-seconds> Default: 0 (forever)\n");
 	printf("    -P <IP-Protocol> Default is TCP\n");
+	printf("    -S use skb-mode\n");
+	printf("    -N enforce native mode\n");
 	printf("    -h Display this help\n");
 }
 
@@ -138,7 +140,7 @@ int main(int argc, char **argv)
 {
 	unsigned char opt_flags[256] = {};
 	unsigned int kill_after_s = 0;
-	const char *optstr = "i:a:p:s:d:m:T:P:Sh";
+	const char *optstr = "i:a:p:s:d:m:T:P:SNh";
 	int min_port = 0, max_port = 0;
 	struct iptnl_info tnl = {};
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
@@ -206,6 +208,9 @@ int main(int argc, char **argv)
 		case 'S':
 			xdp_flags |= XDP_FLAGS_SKB_MODE;
 			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
 		default:
 			usage(argv[0]);
 			return 1;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net 0/2] Two generic xdp related follow-ups
From: Daniel Borkmann @ 2017-05-10  1:31 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, john.fastabend, netdev, Daniel Borkmann

Two follow-ups for the generic XDP API, would be great if
both could still be considered, since the XDP API is not
frozen yet. For details please see individual patches.

Thanks!

Daniel Borkmann (2):
  xdp: add flag to enforce driver mode
  xdp: disallow use of native and generic hook at once

 include/linux/netdevice.h          | 15 ++++++++--
 include/uapi/linux/if_link.h       |  6 ++--
 net/core/dev.c                     | 57 +++++++++++++++++++++++++-------------
 net/core/rtnetlink.c               | 19 +++++++------
 samples/bpf/xdp1_user.c            |  8 ++++--
 samples/bpf/xdp_tx_iptunnel_user.c |  7 ++++-
 6 files changed, 77 insertions(+), 35 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net 2/2] xdp: disallow use of native and generic hook at once
From: Daniel Borkmann @ 2017-05-10  1:31 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <cover.1494379046.git.daniel@iogearbox.net>

While working on the iproute2 generic XDP frontend, I noticed that
as of right now it's possible to have native *and* generic XDP
programs loaded both at the same time for the case when a driver
supports native XDP.

The intended model for generic XDP from b5cdae3291f7 ("net: Generic
XDP") is, however, that only one out of the two can be present at
once which is also indicated as such in the XPD netlink dump part.
The main rationale for generic XDP is to ease accessibility (in
case a driver does not yet have XDP support) and to generically
provide a semantical model as an example for driver developers
wanting to add XDP support. The generic XDP option for an XDP
aware driver can still be useful for comparing and testing both
implementations.

However, it is not intended to have a second XDP processing stage
or layer with exactly the same functionality of the first native
stage. Only reason would be to have a partial fallback for future
XDP features that are not supported yet in the native implementation
and we probably also shouldn't strive for such fallback and instead
encourage native feature support in the first place. Given there's
currently no such fallback issue or use case, lets not go there
yet if we don't need to.

Therefore, change semantics for loading XDP and bail out if the
user tries to load a generic XDP program when a native one is
present and vice versa. Another alternative to bailing out would
be to handle the transition from one flavor to another gracefully,
but that would require to bring the device down, exchange both
types of programs, and bring it up again in order to avoid a tiny
window where a packet could hit both hooks. Given this complicates
the logic a bit more for just a debugging feature, I went with the
simpler variant.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h | 15 +++++++++++--
 net/core/dev.c            | 55 +++++++++++++++++++++++++++++++----------------
 net/core/rtnetlink.c      | 14 +++++-------
 3 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c23bd2..abedec7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3296,11 +3296,22 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
-		      int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
+
+typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+		      int fd, u32 flags);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op);
+
+static inline bool dev_xdp_attached(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	return ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp);
+}
+
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 61443f0..d25f847 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6851,6 +6851,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+
+	/* Query must always succeed. */
+	WARN_ON(xdp_op(dev, &xdp) < 0);
+	return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+			   struct netlink_ext_ack *extack,
+			   struct bpf_prog *prog)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_SETUP_PROG;
+	xdp.extack = extack;
+	xdp.prog = prog;
+
+	return xdp_op(dev, &xdp);
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -6863,43 +6889,34 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
-	int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct bpf_prog *prog = NULL;
-	struct netdev_xdp xdp;
+	xdp_op_t xdp_op, xdp_chk;
 	int err;
 
 	ASSERT_RTNL();
 
-	xdp_op = ops->ndo_xdp;
+	xdp_op = xdp_chk = ops->ndo_xdp;
 	if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
 		return -EOPNOTSUPP;
 	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
 		xdp_op = generic_xdp_install;
+	if (xdp_op == xdp_chk)
+		xdp_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
-			memset(&xdp, 0, sizeof(xdp));
-			xdp.command = XDP_QUERY_PROG;
-
-			err = xdp_op(dev, &xdp);
-			if (err < 0)
-				return err;
-			if (xdp.prog_attached)
-				return -EBUSY;
-		}
+		if (xdp_chk && __dev_xdp_attached(dev, xdp_chk))
+			return -EEXIST;
+		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+		    __dev_xdp_attached(dev, xdp_op))
+			return -EBUSY;
 
 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
 
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_SETUP_PROG;
-	xdp.extack = extack;
-	xdp.prog = prog;
-
-	err = xdp_op(dev, &xdp);
+	err = dev_xdp_install(dev, xdp_op, extack, prog);
 	if (err < 0 && prog)
 		bpf_prog_put(prog);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dda9f16..99320f0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1251,24 +1251,20 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	struct nlattr *xdp;
 	u32 xdp_flags = 0;
-	u8 val = 0;
 	int err;
+	u8 val;
 
 	xdp = nla_nest_start(skb, IFLA_XDP);
 	if (!xdp)
 		return -EMSGSIZE;
+
 	if (rcu_access_pointer(dev->xdp_prog)) {
 		xdp_flags = XDP_FLAGS_SKB_MODE;
 		val = 1;
-	} else if (dev->netdev_ops->ndo_xdp) {
-		struct netdev_xdp xdp_op = {};
-
-		xdp_op.command = XDP_QUERY_PROG;
-		err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
-		if (err)
-			goto err_cancel;
-		val = xdp_op.prog_attached;
+	} else {
+		val = dev_xdp_attached(dev);
 	}
+
 	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
 	if (err)
 		goto err_cancel;
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next] bnxt: add dma mapping attributes
From: Shannon Nelson @ 2017-05-10  1:30 UTC (permalink / raw)
  To: davem, netdev; +Cc: sparclinux, michael.chan, Shannon Nelson

On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
in our Rx path dma mapping in order to get the expected performance out
of the receive path.  Adding it to the Tx path has little effect, so
that's not a part of this patch.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Tushar Dave <tushar.n.dave@oracle.com>
Reviewed-by: Tom Saeger <tom.saeger@oracle.com>
---
v2: simplify by getting rid of the driver-specific ifdef and define

 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   56 +++++++++++++++++-----------
 1 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f1e54b..687f852 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -582,7 +582,8 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 	if (!page)
 		return NULL;
 
-	*mapping = dma_map_page(dev, page, 0, PAGE_SIZE, bp->rx_dir);
+	*mapping = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, bp->rx_dir,
+				      DMA_ATTR_WEAK_ORDERING);
 	if (dma_mapping_error(dev, *mapping)) {
 		__free_page(page);
 		return NULL;
@@ -601,8 +602,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 	if (!data)
 		return NULL;
 
-	*mapping = dma_map_single(&pdev->dev, data + bp->rx_dma_offset,
-				  bp->rx_buf_use_size, bp->rx_dir);
+	*mapping = dma_map_single_attrs(&pdev->dev, data + bp->rx_dma_offset,
+					bp->rx_buf_use_size, bp->rx_dir,
+					DMA_ATTR_WEAK_ORDERING);
 
 	if (dma_mapping_error(&pdev->dev, *mapping)) {
 		kfree(data);
@@ -705,8 +707,9 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
 			return -ENOMEM;
 	}
 
-	mapping = dma_map_page(&pdev->dev, page, offset, BNXT_RX_PAGE_SIZE,
-			       PCI_DMA_FROMDEVICE);
+	mapping = dma_map_page_attrs(&pdev->dev, page, offset,
+				     BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE,
+				     DMA_ATTR_WEAK_ORDERING);
 	if (dma_mapping_error(&pdev->dev, mapping)) {
 		__free_page(page);
 		return -EIO;
@@ -799,7 +802,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
 		return NULL;
 	}
 	dma_addr -= bp->rx_dma_offset;
-	dma_unmap_page(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir);
+	dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir,
+			     DMA_ATTR_WEAK_ORDERING);
 
 	if (unlikely(!payload))
 		payload = eth_get_headlen(data_ptr, len);
@@ -841,8 +845,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
 	}
 
 	skb = build_skb(data, 0);
-	dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
-			 bp->rx_dir);
+	dma_unmap_single_attrs(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
+			       bp->rx_dir, DMA_ATTR_WEAK_ORDERING);
 	if (!skb) {
 		kfree(data);
 		return NULL;
@@ -909,8 +913,9 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
 			return NULL;
 		}
 
-		dma_unmap_page(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
-			       PCI_DMA_FROMDEVICE);
+		dma_unmap_page_attrs(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
+				     PCI_DMA_FROMDEVICE,
+				     DMA_ATTR_WEAK_ORDERING);
 
 		skb->data_len += frag_len;
 		skb->len += frag_len;
@@ -1329,8 +1334,9 @@ static void bnxt_abort_tpa(struct bnxt *bp, struct bnxt_napi *bnapi,
 		tpa_info->mapping = new_mapping;
 
 		skb = build_skb(data, 0);
-		dma_unmap_single(&bp->pdev->dev, mapping, bp->rx_buf_use_size,
-				 bp->rx_dir);
+		dma_unmap_single_attrs(&bp->pdev->dev, mapping,
+				       bp->rx_buf_use_size, bp->rx_dir,
+				       DMA_ATTR_WEAK_ORDERING);
 
 		if (!skb) {
 			kfree(data);
@@ -1971,9 +1977,11 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
 				if (!data)
 					continue;
 
-				dma_unmap_single(&pdev->dev, tpa_info->mapping,
-						 bp->rx_buf_use_size,
-						 bp->rx_dir);
+				dma_unmap_single_attrs(&pdev->dev,
+						       tpa_info->mapping,
+						       bp->rx_buf_use_size,
+						       bp->rx_dir,
+						       DMA_ATTR_WEAK_ORDERING);
 
 				tpa_info->data = NULL;
 
@@ -1993,13 +2001,15 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
 
 			if (BNXT_RX_PAGE_MODE(bp)) {
 				mapping -= bp->rx_dma_offset;
-				dma_unmap_page(&pdev->dev, mapping,
-					       PAGE_SIZE, bp->rx_dir);
+				dma_unmap_page_attrs(&pdev->dev, mapping,
+						     PAGE_SIZE, bp->rx_dir,
+						     DMA_ATTR_WEAK_ORDERING);
 				__free_page(data);
 			} else {
-				dma_unmap_single(&pdev->dev, mapping,
-						 bp->rx_buf_use_size,
-						 bp->rx_dir);
+				dma_unmap_single_attrs(&pdev->dev, mapping,
+						       bp->rx_buf_use_size,
+						       bp->rx_dir,
+						       DMA_ATTR_WEAK_ORDERING);
 				kfree(data);
 			}
 		}
@@ -2012,8 +2022,10 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
 			if (!page)
 				continue;
 
-			dma_unmap_page(&pdev->dev, rx_agg_buf->mapping,
-				       BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE);
+			dma_unmap_page_attrs(&pdev->dev, rx_agg_buf->mapping,
+					     BNXT_RX_PAGE_SIZE,
+					     PCI_DMA_FROMDEVICE,
+					     DMA_ATTR_WEAK_ORDERING);
 
 			rx_agg_buf->page = NULL;
 			__clear_bit(j, rxr->rx_agg_bmap);
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH v1] ACPI: Switch to use generic UUID API
From: Zhang Rui @ 2017-05-10  1:20 UTC (permalink / raw)
  To: Andy Shevchenko, linux-acpi, tpmdd-devel, intel-gfx, nouveau,
	linux-input, iommu, linux-mmc, netdev, linux-pci, linux-usb,
	alsa-devel, linux-kernel
  Cc: Rafael J . Wysocki, Mika Westerberg, Borislav Petkov,
	Dan Williams, Amir Goldstein, Jarkko Sakkinen, Jani Nikula,
	Ben Skeggs, Benjamin Tissoires, Joerg Roedel, Adrian Hunter,
	Yisen Zhuang, Bjorn Helgaas, Felipe Balbi, Mathias Nyman,
	Heikki Krogerus, Liam Girdwood, Mark Brown
In-Reply-To: <20170504092151.88646-1-andriy.shevchenko@linux.intel.com>

On Thu, 2017-05-04 at 12:21 +0300, Andy Shevchenko wrote:
> acpi_evaluate_dsm() and friends take a pointer to a raw buffer of 16
> bytes. Instead we convert them to use uuid_le type. At the same time
> we
> convert current users.
> 
> acpi_str_to_uuid() becomes useless after the conversion and it's safe
> to
> get rid of it.
> 
> The conversion fixes a potential bug in int340x_thermal as well since
> we have to use memcmp() on binary data.
> 
> Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Amir Goldstein <amir73il@gmail.com>
> Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: Ben Skeggs <bskeggs@redhat.com>
> Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Zhang Rui <rui.zhang@intel.com>
> Cc: Felipe Balbi <balbi@kernel.org>
> Cc: Mathias Nyman <mathias.nyman@intel.com>
> Cc: Heikki Krogerus <heikki.krogerus@linux.intel.com>
> Cc: Liam Girdwood <lgirdwood@gmail.com>
> Cc: Mark Brown <broonie@kernel.org>
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> ---
> 
> diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c
> b/drivers/thermal/int340x_thermal/int3400_thermal.c
> index 9413c4abf0b9..c0eb3bb19b23 100644
> --- a/drivers/thermal/int340x_thermal/int3400_thermal.c
> +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c
> @@ -23,7 +23,7 @@ enum int3400_thermal_uuid {
>  	INT3400_THERMAL_MAXIMUM_UUID,
>  };
>  
> -static u8 *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = {
> +static const char
> *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = {
>  	"42A441D6-AE6A-462b-A84B-4A8CE79027D3",
>  	"3A95C389-E4B8-4629-A526-C52C88626BAE",
>  	"97C68AE7-15FA-499c-B8C9-5DA81D606E0A",
> @@ -141,10 +141,10 @@ static int int3400_thermal_get_uuids(struct
> int3400_thermal_priv *priv)
>  		}
>  
>  		for (j = 0; j < INT3400_THERMAL_MAXIMUM_UUID; j++) {
> -			u8 uuid[16];
> +			uuid_le u;
>  
> -			acpi_str_to_uuid(int3400_thermal_uuids[j],
> uuid);
> -			if (!strncmp(uuid, objb->buffer.pointer,
> 16)) {
> +			uuid_le_to_bin(int3400_thermal_uuids[j],
> &u);
> +			if (!uuid_le_cmp(*(uuid_le *)objb-
> >buffer.pointer), u) {
>  				priv->uuid_bitmap |= (1 << j);
>  				break;
>  			}
thanks for the fix.

Acked-by: Zhang Rui <rui.zhang@intel.com>

-rui

^ permalink raw reply

* Re: [PATCH net-next] bnxt: add dma mapping attributes
From: Shannon Nelson @ 2017-05-10  1:17 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, sparclinux, Michael Chan
In-Reply-To: <20170509.204937.115947896476049334.davem@davemloft.net>

On 5/9/2017 5:49 PM, David Miller wrote:
> From: Shannon Nelson <shannon.nelson@oracle.com>
> Date: Tue,  9 May 2017 13:37:33 -0700
>
>> @@ -66,6 +66,12 @@
>>  MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
>>  MODULE_VERSION(DRV_MODULE_VERSION);
>>
>> +#ifdef CONFIG_SPARC
>> +#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
>> +#else
>> +#define BNXT_DMA_ATTRS	0
>> +#endif
>> +
>
> I do not what this ifdef crap showing up in every driver just
> to improve performance on one platform.
>
> This needs to be generically done somehow in a way that
> drivers get the correct setting automatically and without
> ifdef checks.
>

Yes, these do get ugly, and as Michael pointed out this really is an 
advisory bit and should just be ignored by platforms that don't care. 
I'll just respin this using the DMA_ATTR_WEAK_ORDERING bit directly 
rather than obscuring it with a BNXT define.

sln

^ permalink raw reply

* Re:Re: [PATCH net v2] driver: vrf: Fix one possible use-after-free issue
From: Gao Feng @ 2017-05-10  1:00 UTC (permalink / raw)
  To: David Miller; +Cc: dsa, shm, fw, netdev
In-Reply-To: <20170509.143736.1436700294491473750.davem@davemloft.net>


At 2017-05-10 02:37:36, "David Miller" <davem@davemloft.net> wrote:
>From: gfree.wind@vip.163.com
>Date: Tue,  9 May 2017 18:27:33 +0800
>
>> @@ -989,6 +989,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
>>  
>>  static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>>  {
>> +	kfree_skb(skb);
>>  	return 0;
>>  }
>>  
>> @@ -998,7 +999,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
>>  {
>>  	struct net *net = dev_net(dev);
>>  
>> -	if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
>> +	if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
>>  		skb = NULL;    /* kfree_skb(skb) handled by nf code */
>>  
>>  	return skb;
>
>Indeed, this fixes the immediate problem with NF_STOLEN.
>
>Making NF_STOLEN fully functional is another story, we'd need to stack
>this all together properly:

Yes, this fix just make the vrf wouldn't cause panic which is caused by use-after-free skb.
It doesn't work as real NF_QUEUE when reinject the skb. 

>
>static int __ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>{
> ...
>}
>
>static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>{
>	return l3mdev_ip_rcv(skb, __ip_rcv_finish);
>}
>...
>static inline
>struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb,
>			      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
>{
>	return l3mdev_l3_rcv(skb, okfn, AF_INET);
>}
>
>etc. but that's going to really add a kink to the receive path,
>microbenchmark wise.

It is a solution to make NF_STOLEN fully function, but I haven't environment to test the benchmark.

Best Regards
Feng



^ permalink raw reply

* Re: [PATCH net-next] bnxt: add dma mapping attributes
From: David Miller @ 2017-05-10  0:49 UTC (permalink / raw)
  To: shannon.nelson; +Cc: netdev, sparclinux
In-Reply-To: <1494362253-270990-1-git-send-email-shannon.nelson@oracle.com>

From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Tue,  9 May 2017 13:37:33 -0700

> @@ -66,6 +66,12 @@
>  MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
>  MODULE_VERSION(DRV_MODULE_VERSION);
>  
> +#ifdef CONFIG_SPARC
> +#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
> +#else
> +#define BNXT_DMA_ATTRS	0
> +#endif
> +

I do not what this ifdef crap showing up in every driver just
to improve performance on one platform.

This needs to be generically done somehow in a way that
drivers get the correct setting automatically and without
ifdef checks.

^ permalink raw reply

* Re: [PATCH 0/2] net: Set maximum receive packet size on veth interfaces
From: Fredrik Markström @ 2017-05-10  0:42 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Stephen Hemminger, Alexei Starovoitov,
	Daniel Borkmann, netdev, linux-kernel, bridge
In-Reply-To: <20170509.134839.2008658486655854998.davem@davemloft.net>

On Tue, May 9, 2017 at 7:48 PM, David Miller <davem@davemloft.net> wrote:
>
> From: Fredrik Markstrom <fredrik.markstrom@gmail.com>
> Date: Tue,  9 May 2017 14:44:36 +0200
>
> > Currently veth drops all packets larger then the mtu set on the
> > receiving end of the pair. This is inconsistent with most hardware
> > ethernet drivers.
>
> False.
>
> In fact, many pieces of ethernet hardware a not physically capable of
> sending even VLAN packets that are above the normal base ethernet MTU.
>

Maybe I was unclear, the veth implementation drops all packers larger then the
configured MTU (on the receiving interface).
Most ethernet drivers accepts packets up to the ethernet MTU no matter the
configured MTU. As far as I can tell from the RFC:s that is ok.

A simpler solution would be to only drop packets larger then ethernet MTU (like
most network drivers), but I guess that will break existing stuff
already out there.

The example below demonstrates what behaviour I'm referring to.

Example:
---- 8< ------
# veth0 and veth1 is a veth pair and veth1 has ben moved to a separate
network namespace.
% # NS A
% ip address add 1.2.3.4/24 dev veth0
% ip link set veth0 mtu 300 up

% # NS B
% ip address add 1.2.3.5/24 dev veth1
% ip link set veth1 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 0 received, 100% packet loss, time 0ms
---- 8< ------

While it works fine in most setup:s with hw-interfaces:

---- 8< ------
# Host A eth2 and Host B eth0 is on the same network.

# On HOST A
% ip address add 1.2.3.4/24 dev eth2
% ip link set eth2 mtu 300 up

% # HOST B
% ip address add 1.2.3.5/24 dev eth0
% ip link set eth0 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.
408 bytes from 1.2.3.4: icmp_seq=1 ttl=64 time=1.57 ms

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 1.573/1.573/1.573/0.000 ms
---- 8< ------

>
> It is therefore untenable to remove this restriction univerally like
> this.

With the explaination above, do you still think it's untenable ?

/Fredrik

^ permalink raw reply

* Re: [PATCH 0/2] net: Set maximum receive packet size on veth interfaces
From: Fredrik Markström @ 2017-05-10  0:38 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Daniel Borkmann, netdev, bridge, Alexei Starovoitov,
	linux-kernel
In-Reply-To: <20170509.134839.2008658486655854998.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 2189 bytes --]

On Tue, May 9, 2017 at 7:48 PM, David Miller <davem@davemloft.net> wrote:
>
> From: Fredrik Markstrom <fredrik.markstrom@gmail.com>
> Date: Tue,  9 May 2017 14:44:36 +0200
>
> > Currently veth drops all packets larger then the mtu set on the
> > receiving end of the pair. This is inconsistent with most hardware
> > ethernet drivers.
>
> False.
>
> In fact, many pieces of ethernet hardware a not physically capable of
> sending even VLAN packets that are above the normal base ethernet MTU.
>

Maybe I was unclear, the veth implementation drops all packers larger then
the
configured MTU (on the receiving interface).
Most ethernet drivers accepts packets up to the ethernet MTU no matter the
configured MTU. As far as I can tell from the RFC:s that is ok.

A simpler solution would be to only drop packets larger then ethernet MTU
(like
most network drivers), but I guess that will break existing stuff already
out there.

The example below demonstrates what behaviour I'm referring to.

Example:
---- 8< ------
# veth0 and veth1 is a veth pair and veth1 has ben moved to a separate
network namespace.
% # NS A
% ip address add 1.2.3.4/24 dev veth0
% ip link set veth0 mtu 300 up

% # NS B
% ip address add 1.2.3.5/24 dev veth1
% ip link set veth1 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 0 received, 100% packet loss, time 0ms
---- 8< ------

While it works fine in most setup:s with hw-interfaces:

---- 8< ------
# Host A eth2 and Host B eth0 is on the same network.

# On HOST A
% ip address add 1.2.3.4/24 dev eth2
% ip link set eth2 mtu 300 up

% # HOST B
% ip address add 1.2.3.5/24 dev eth0
% ip link set eth0 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.
408 bytes from 1.2.3.4: icmp_seq=1 ttl=64 time=1.57 ms

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 1.573/1.573/1.573/0.000 ms
---- 8< ------

>
> It is therefore untenable to remove this restriction univerally like
> this.

With the explaination above, do you still think it's untenable ?

/Fredrik

[-- Attachment #2: Type: text/html, Size: 2759 bytes --]

^ permalink raw reply

* Re: [Patch net] ipv6/dccp: do not inherit ipv6_mc_list from parent
From: Eric Dumazet @ 2017-05-10  0:21 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, edumazet
In-Reply-To: <1494374394-24373-1-git-send-email-xiyou.wangcong@gmail.com>

On Tue, 2017-05-09 at 16:59 -0700, Cong Wang wrote:
> Like commit 657831ffc38e ("dccp/tcp: do not inherit mc_list from parent")
> we should clear ipv6_mc_list etc. for IPv6 sockets too.
> 
> Cc: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---

Thanks !

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* [PATCH] arp: honour gratuitous ARP _replies_
From: Ihar Hrachyshka @ 2017-05-10  0:16 UTC (permalink / raw)
  To: David S. Miller, James Morris, Hideaki YOSHIFUJI, Patrick McHardy,
	netdev
  Cc: Ihar Hrachyshka

When arp_accept is 1, gratuitous ARPs are supposed to override matching
entries irrespective of whether they arrive during locktime. This was
implemented in commit 56022a8fdd87 ("ipv4: arp: update neighbour address
when a gratuitous arp is received and arp_accept is set")

There is a glitch in the patch though. RFC 2002, section 4.6, "ARP,
Proxy ARP, and Gratuitous ARP", defines gratuitous ARPs so that they can
be either of Request or Reply type. Those Reply gratuitous ARPs can be
triggered with standard tooling, for example, arping -A option does just
that.

This patch fixes the glitch, making both Request and Reply flavours of
gratuitous ARPs to behave identically.

As per RFC, if gratuitous ARPs are of Reply type, their Target Hardware
Address field should also be set to the link-layer address to which this
cache entry should be updated. The field is present in ARP over Ethernet
but not in IEEE 1394. In this patch, I don't consider any broadcasted
ARP replies as gratuitous if the field is not present, to conform the
standard. It's not clear whether there is such a thing for IEEE 1394 as
a gratuitous ARP reply; until it's cleared up, we will ignore such
broadcasts. Note that they will still update existing ARP cache entries,
assuming they arrive out of locktime time interval.

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 net/ipv4/arp.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0937b34..fb97b9c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -653,6 +653,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	unsigned char *arp_ptr;
 	struct rtable *rt;
 	unsigned char *sha;
+	unsigned char *tha = NULL;
 	__be32 sip, tip;
 	u16 dev_type = dev->type;
 	int addr_type;
@@ -724,6 +725,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 		break;
 #endif
 	default:
+		tha = arp_ptr;
 		arp_ptr += dev->addr_len;
 	}
 	memcpy(&tip, arp_ptr, 4);
@@ -842,8 +844,20 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 		   It is possible, that this option should be enabled for some
 		   devices (strip is candidate)
 		 */
-		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
-			  addr_type == RTN_UNICAST;
+		is_garp = tip == sip && addr_type == RTN_UNICAST;
+
+		/* Unsolicited ARP _replies_ also require target hwaddr to be
+		 * the same as source.
+		 */
+		if (is_garp && arp->ar_op == htons(ARPOP_REPLY))
+			is_garp =
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+				/* IPv4 over IEEE 1394 doesn't provide target
+				 * hardware address field in its ARP payload.
+				 */
+				tha &&
+#endif
+				!memcmp(tha, sha, dev->addr_len);

 		if (!n &&
 		    ((arp->ar_op == htons(ARPOP_REPLY)  &&
-- 
2.9.3

^ permalink raw reply related

* [PATCH] neighbour: update neigh timestamps iff update is effective
From: Ihar Hrachyshka @ 2017-05-10  0:06 UTC (permalink / raw)
  To: David S. Miller, Ihar Hrachyshka, Julian Anastasov, He Chunhui,
	netdev

It's a common practice to send gratuitous ARPs after moving an
IP address to another device to speed up healing of a service. To
fulfill service availability constraints, the timing of network peers
updating their caches to point to a new location of an IP address can be
particularly important.

Sometimes neigh_update calls won't touch neither lladdr nor state, for
example if an update arrives in locktime interval. Then we effectively
ignore the update request, bailing out of touching the neigh entry,
except that we still bump its timestamps.

This may be a problem for updates arriving in quick succession. For
example, consider the following scenario:

A service is moved to another device with its IP address. The new device
sends three gratuitous ARP requests into the network with ~1 seconds
interval between them. Just before the first request arrives to one of
network peer nodes, its neigh entry for the IP address transitions from
STALE to DELAY.  This transition, among other things, updates
neigh->updated. Once the kernel receives the first gratuitous ARP, it
ignores it because its arrival time is inside the locktime interval. The
kernel still bumps neigh->updated. Then the second gratuitous ARP
request arrives, and it's also ignored because it's still in the (new)
locktime interval. Same happens for the third request. The node
eventually heals itself (after delay_first_probe_time seconds since the
initial transition to DELAY state), but it just wasted some time and
require a new ARP request/reply round trip. This unfortunate behaviour
both puts more load on the network, as well as reduces service
availability.

This patch changes neigh_update so that it bumps neigh->updated (as well
as neigh->confirmed) only once we are sure that either lladdr or entry
state will change). In the scenario described above, it means that the
second gratuitous ARP request will actually update the entry lladdr.

Ideally, we would update the neigh entry on the very first gratuitous
ARP request. The locktime mechanism is designed to ignore ARP updates in
a short timeframe after a previous ARP update was honoured by the kernel
layer. This would require tracking timestamps for state transitions
separately from timestamps when actual updates are received. This would
probably involve changes in neighbour struct. Therefore, the patch
doesn't tackle the issue of the first gratuitous APR ignored, leaving
it for a follow-up.

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 net/core/neighbour.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 58b0bcc..d274f81 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 		lladdr = neigh->ha;
 	}

-	if (new & NUD_CONNECTED)
-		neigh->confirmed = jiffies;
-	neigh->updated = jiffies;
-
 	/* If entry was valid and address is not changed,
 	   do not change entry state, if new one is STALE.
 	 */
@@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 		}
 	}

+	/* Update timestamps only once we know we will make a change to the
+	 * neighbour entry. Otherwise we risk to move the locktime window with
+	 * noop updates and ignore relevant ARP updates.
+	 */
+	if (new != old || lladdr != neigh->ha) {
+		if (new & NUD_CONNECTED)
+			neigh->confirmed = jiffies;
+		neigh->updated = jiffies;
+	}
+
 	if (new != old) {
 		neigh_del_timer(neigh);
 		if (new & NUD_PROBE)
-- 
2.9.3

^ permalink raw reply related

* [Patch net] ipv6/dccp: do not inherit ipv6_mc_list from parent
From: Cong Wang @ 2017-05-09 23:59 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, Cong Wang

Like commit 657831ffc38e ("dccp/tcp: do not inherit mc_list from parent")
we should clear ipv6_mc_list etc. for IPv6 sockets too.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/dccp/ipv6.c     | 6 ++++++
 net/ipv6/tcp_ipv6.c | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e..b6bbb71 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
+		newnp->ipv6_mc_list = NULL;
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
 		newnp->mcast_oif   = inet6_iif(skb);
 		newnp->mcast_hops  = ipv6_hdr(skb)->hop_limit;
 
@@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 	/* Clone RX bits */
 	newnp->rxopt.all = np->rxopt.all;
 
+	newnp->ipv6_mc_list = NULL;
+	newnp->ipv6_ac_list = NULL;
+	newnp->ipv6_fl_list = NULL;
 	newnp->pktoptions = NULL;
 	newnp->opt	  = NULL;
 	newnp->mcast_oif  = inet6_iif(skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497..df5a9ff 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
+		newnp->ipv6_mc_list = NULL;
 		newnp->ipv6_ac_list = NULL;
 		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions  = NULL;
@@ -1131,6 +1132,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	   First: no IPv4 options.
 	 */
 	newinet->inet_opt = NULL;
+	newnp->ipv6_mc_list = NULL;
 	newnp->ipv6_ac_list = NULL;
 	newnp->ipv6_fl_list = NULL;
 
-- 
2.5.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox