Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next V2 1/2] tun/tap: use ptr_ring instead of skb_array
From: Jason Wang @ 2018-01-04  3:14 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, jbrouer, Jason Wang
In-Reply-To: <1515035668-6241-1-git-send-email-jasowang@redhat.com>

This patch switches to use ptr_ring instead of skb_array. This will be
used to enqueue different types of pointers by encoding type into
lower bits.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tap.c      | 41 +++++++++++++++++++++--------------------
 drivers/net/tun.c      | 42 ++++++++++++++++++++++--------------------
 drivers/vhost/net.c    | 39 ++++++++++++++++++++-------------------
 include/linux/if_tap.h |  6 +++---
 include/linux/if_tun.h |  4 ++--
 5 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 0a886fda..7c38659 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -330,7 +330,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	if (!q)
 		return RX_HANDLER_PASS;
 
-	if (__skb_array_full(&q->skb_array))
+	if (__ptr_ring_full(&q->ring))
 		goto drop;
 
 	skb_push(skb, ETH_HLEN);
@@ -348,7 +348,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 			goto drop;
 
 		if (!segs) {
-			if (skb_array_produce(&q->skb_array, skb))
+			if (ptr_ring_produce(&q->ring, skb))
 				goto drop;
 			goto wake_up;
 		}
@@ -358,7 +358,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 			struct sk_buff *nskb = segs->next;
 
 			segs->next = NULL;
-			if (skb_array_produce(&q->skb_array, segs)) {
+			if (ptr_ring_produce(&q->ring, segs)) {
 				kfree_skb(segs);
 				kfree_skb_list(nskb);
 				break;
@@ -375,7 +375,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 		    !(features & NETIF_F_CSUM_MASK) &&
 		    skb_checksum_help(skb))
 			goto drop;
-		if (skb_array_produce(&q->skb_array, skb))
+		if (ptr_ring_produce(&q->ring, skb))
 			goto drop;
 	}
 
@@ -497,7 +497,7 @@ static void tap_sock_destruct(struct sock *sk)
 {
 	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
 
-	skb_array_cleanup(&q->skb_array);
+	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
 }
 
 static int tap_open(struct inode *inode, struct file *file)
@@ -517,7 +517,7 @@ static int tap_open(struct inode *inode, struct file *file)
 					     &tap_proto, 0);
 	if (!q)
 		goto err;
-	if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) {
+	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
 		sk_free(&q->sk);
 		goto err;
 	}
@@ -546,7 +546,7 @@ static int tap_open(struct inode *inode, struct file *file)
 
 	err = tap_set_queue(tap, file, q);
 	if (err) {
-		/* tap_sock_destruct() will take care of freeing skb_array */
+		/* tap_sock_destruct() will take care of freeing ptr_ring */
 		goto err_put;
 	}
 
@@ -583,7 +583,7 @@ static unsigned int tap_poll(struct file *file, poll_table *wait)
 	mask = 0;
 	poll_wait(file, &q->wq.wait, wait);
 
-	if (!skb_array_empty(&q->skb_array))
+	if (!ptr_ring_empty(&q->ring))
 		mask |= POLLIN | POLLRDNORM;
 
 	if (sock_writeable(&q->sk) ||
@@ -844,7 +844,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
 					TASK_INTERRUPTIBLE);
 
 		/* Read frames from the queue */
-		skb = skb_array_consume(&q->skb_array);
+		skb = ptr_ring_consume(&q->ring);
 		if (skb)
 			break;
 		if (noblock) {
@@ -1176,7 +1176,7 @@ static int tap_peek_len(struct socket *sock)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue,
 					       sock);
-	return skb_array_peek_len(&q->skb_array);
+	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
 }
 
 /* Ops structure to mimic raw sockets with tun */
@@ -1202,7 +1202,7 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
-struct skb_array *tap_get_skb_array(struct file *file)
+struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 	struct tap_queue *q;
 
@@ -1211,29 +1211,30 @@ struct skb_array *tap_get_skb_array(struct file *file)
 	q = file->private_data;
 	if (!q)
 		return ERR_PTR(-EBADFD);
-	return &q->skb_array;
+	return &q->ring;
 }
-EXPORT_SYMBOL_GPL(tap_get_skb_array);
+EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
 
 int tap_queue_resize(struct tap_dev *tap)
 {
 	struct net_device *dev = tap->dev;
 	struct tap_queue *q;
-	struct skb_array **arrays;
+	struct ptr_ring **rings;
 	int n = tap->numqueues;
 	int ret, i = 0;
 
-	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
-	if (!arrays)
+	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
+	if (!rings)
 		return -ENOMEM;
 
 	list_for_each_entry(q, &tap->queue_list, next)
-		arrays[i++] = &q->skb_array;
+		rings[i++] = &q->ring;
 
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
+	ret = ptr_ring_resize_multiple(rings, n,
+				       dev->tx_queue_len, GFP_KERNEL,
+				       __skb_array_destroy_skb);
 
-	kfree(arrays);
+	kfree(rings);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(tap_queue_resize);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e367d631..2c89efe 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -179,7 +179,7 @@ struct tun_file {
 	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
-	struct skb_array tx_array;
+	struct ptr_ring tx_ring;
 };
 
 struct tun_flow_entry {
@@ -634,7 +634,7 @@ static void tun_queue_purge(struct tun_file *tfile)
 {
 	struct sk_buff *skb;
 
-	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+	while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL)
 		kfree_skb(skb);
 
 	skb_queue_purge(&tfile->sk.sk_write_queue);
@@ -688,7 +688,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 				unregister_netdevice(tun->dev);
 		}
 		if (tun)
-			skb_array_cleanup(&tfile->tx_array);
+			ptr_ring_cleanup(&tfile->tx_ring,
+					 __skb_array_destroy_skb);
 		sock_put(&tfile->sk);
 	}
 }
@@ -777,7 +778,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	}
 
 	if (!tfile->detached &&
-	    skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -1027,7 +1028,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	nf_reset(skb);
 
-	if (skb_array_produce(&tfile->tx_array, skb))
+	if (ptr_ring_produce(&tfile->tx_ring, skb))
 		goto drop;
 
 	/* Notify and wake up reader process */
@@ -1295,7 +1296,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, sk_sleep(sk), wait);
 
-	if (!skb_array_empty(&tfile->tx_array))
+	if (!ptr_ring_empty(&tfile->tx_ring))
 		mask |= POLLIN | POLLRDNORM;
 
 	if (tun->dev->flags & IFF_UP &&
@@ -1944,7 +1945,7 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 	struct sk_buff *skb = NULL;
 	int error = 0;
 
-	skb = skb_array_consume(&tfile->tx_array);
+	skb = ptr_ring_consume(&tfile->tx_ring);
 	if (skb)
 		goto out;
 	if (noblock) {
@@ -1956,7 +1957,7 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 	current->state = TASK_INTERRUPTIBLE;
 
 	while (1) {
-		skb = skb_array_consume(&tfile->tx_array);
+		skb = ptr_ring_consume(&tfile->tx_ring);
 		if (skb)
 			break;
 		if (signal_pending(current)) {
@@ -2186,7 +2187,7 @@ static int tun_peek_len(struct socket *sock)
 	if (!tun)
 		return 0;
 
-	ret = skb_array_peek_len(&tfile->tx_array);
+	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, __skb_array_len_with_tag);
 	tun_put(tun);
 
 	return ret;
@@ -3092,25 +3093,26 @@ static int tun_queue_resize(struct tun_struct *tun)
 {
 	struct net_device *dev = tun->dev;
 	struct tun_file *tfile;
-	struct skb_array **arrays;
+	struct ptr_ring **rings;
 	int n = tun->numqueues + tun->numdisabled;
 	int ret, i;
 
-	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
-	if (!arrays)
+	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
+	if (!rings)
 		return -ENOMEM;
 
 	for (i = 0; i < tun->numqueues; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		arrays[i] = &tfile->tx_array;
+		rings[i] = &tfile->tx_ring;
 	}
 	list_for_each_entry(tfile, &tun->disabled, next)
-		arrays[i++] = &tfile->tx_array;
+		rings[i++] = &tfile->tx_ring;
 
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
+	ret = ptr_ring_resize_multiple(rings, n,
+				       dev->tx_queue_len, GFP_KERNEL,
+				       __skb_array_destroy_skb);
 
-	kfree(arrays);
+	kfree(rings);
 	return ret;
 }
 
@@ -3196,7 +3198,7 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
-struct skb_array *tun_get_skb_array(struct file *file)
+struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
 
@@ -3205,9 +3207,9 @@ struct skb_array *tun_get_skb_array(struct file *file)
 	tfile = file->private_data;
 	if (!tfile)
 		return ERR_PTR(-EBADFD);
-	return &tfile->tx_array;
+	return &tfile->tx_ring;
 }
-EXPORT_SYMBOL_GPL(tun_get_skb_array);
+EXPORT_SYMBOL_GPL(tun_get_tx_ring);
 
 module_init(tun_init);
 module_exit(tun_cleanup);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c7bdeb6..c316555 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -89,7 +89,7 @@ struct vhost_net_ubuf_ref {
 
 #define VHOST_RX_BATCH 64
 struct vhost_net_buf {
-	struct sk_buff **queue;
+	void **queue;
 	int tail;
 	int head;
 };
@@ -108,7 +108,7 @@ struct vhost_net_virtqueue {
 	/* Reference counting for outstanding ubufs.
 	 * Protected by vq mutex. Writers must also take device mutex. */
 	struct vhost_net_ubuf_ref *ubufs;
-	struct skb_array *rx_array;
+	struct ptr_ring *rx_ring;
 	struct vhost_net_buf rxq;
 };
 
@@ -158,7 +158,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	rxq->head = 0;
-	rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 					      VHOST_RX_BATCH);
 	return rxq->tail;
 }
@@ -167,9 +167,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
-	if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
-		skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
-				    vhost_net_buf_get_size(rxq));
+	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
+		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
+				   vhost_net_buf_get_size(rxq),
+				   __skb_array_destroy_skb);
 		rxq->head = rxq->tail = 0;
 	}
 }
@@ -583,7 +584,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	int len = 0;
 	unsigned long flags;
 
-	if (rvq->rx_array)
+	if (rvq->rx_ring)
 		return vhost_net_buf_peek(rvq);
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
@@ -790,7 +791,7 @@ static void handle_rx(struct vhost_net *net)
 			 * they refilled. */
 			goto out;
 		}
-		if (nvq->rx_array)
+		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
@@ -896,7 +897,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	struct vhost_net *n;
 	struct vhost_dev *dev;
 	struct vhost_virtqueue **vqs;
-	struct sk_buff **queue;
+	void **queue;
 	int i;
 
 	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -908,7 +909,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 	}
 
-	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
+	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
 			      GFP_KERNEL);
 	if (!queue) {
 		kfree(vqs);
@@ -1046,23 +1047,23 @@ static struct socket *get_raw_socket(int fd)
 	return ERR_PTR(r);
 }
 
-static struct skb_array *get_tap_skb_array(int fd)
+static struct ptr_ring *get_tap_ptr_ring(int fd)
 {
-	struct skb_array *array;
+	struct ptr_ring *ring;
 	struct file *file = fget(fd);
 
 	if (!file)
 		return NULL;
-	array = tun_get_skb_array(file);
-	if (!IS_ERR(array))
+	ring = tun_get_tx_ring(file);
+	if (!IS_ERR(ring))
 		goto out;
-	array = tap_get_skb_array(file);
-	if (!IS_ERR(array))
+	ring = tap_get_ptr_ring(file);
+	if (!IS_ERR(ring))
 		goto out;
-	array = NULL;
+	ring = NULL;
 out:
 	fput(file);
-	return array;
+	return ring;
 }
 
 static struct socket *get_tap_socket(int fd)
@@ -1143,7 +1144,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		vq->private_data = sock;
 		vhost_net_buf_unproduce(nvq);
 		if (index == VHOST_NET_VQ_RX)
-			nvq->rx_array = get_tap_skb_array(fd);
+			nvq->rx_ring = get_tap_ptr_ring(fd);
 		r = vhost_vq_init_access(vq);
 		if (r)
 			goto err_used;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 3ecef57..8e66866 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -4,7 +4,7 @@
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
-struct skb_array *tap_get_skb_array(struct file *file);
+struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -14,7 +14,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline struct skb_array *tap_get_skb_array(struct file *f)
+static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
@@ -70,7 +70,7 @@ struct tap_queue {
 	u16 queue_index;
 	bool enabled;
 	struct list_head next;
-	struct skb_array skb_array;
+	struct ptr_ring ring;
 };
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index bf9bdf4..bdee9b8 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,7 +19,7 @@
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
-struct skb_array *tun_get_skb_array(struct file *file);
+struct ptr_ring *tun_get_tx_ring(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -29,7 +29,7 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline struct skb_array *tun_get_skb_array(struct file *f)
+static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V2 0/2] XDP transmission for tuntap
From: Jason Wang @ 2018-01-04  3:14 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, jbrouer, Jason Wang

Hi all:

This series tries to implement XDP transmission (ndo_xdp_xmit) for
tuntap. A pointer ring is used for queuing both XDP buffers and
sk_buffs; this is done by encoding the type into the lowest bit of the
pointer and storing XDP metadata in the headroom of the XDP buff.

Tests get 3.05 Mpps when doing xdp_redirect_map from ixgbe to VM
(testpmd + virtio-net in guest). This gives us a ~20% improvement
compared to using skb during redirect.

Please review.

Changes from V1:

- silence warnings
- fix typos
- add skb mode number in the commit log

Jason Wang (2):
  tun/tap: use ptr_ring instead of skb_array
  tuntap: XDP transmission

 drivers/net/tap.c      |  41 ++++-----
 drivers/net/tun.c      | 239 +++++++++++++++++++++++++++++++++++++++----------
 drivers/vhost/net.c    |  52 ++++++-----
 include/linux/if_tap.h |   6 +-
 include/linux/if_tun.h |  21 ++++-
 5 files changed, 269 insertions(+), 90 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [RFC PATCH] asm/generic: introduce if_nospec and nospec_barrier
From: Alexei Starovoitov @ 2018-01-04  3:12 UTC (permalink / raw)
  To: Alan Cox
  Cc: Jiri Kosina, Dan Williams, Linus Torvalds,
	Linux Kernel Mailing List, Mark Rutland, linux-arch,
	Peter Zijlstra, Greg KH, Thomas Gleixner, Elena Reshetova, netdev,
	Daniel Borkmann, David S. Miller
In-Reply-To: <20180104021553.32084de3@alans-desktop>

On Thu, Jan 04, 2018 at 02:15:53AM +0000, Alan Cox wrote:
> 
> > > Elena has done the work of auditing static analysis reports to a dozen
> > > or so locations that need some 'nospec' handling.  
> > 
> > How exactly is that related (especially in longer-term support terms) to 
> > BPF anyway?
> 
> If you read the papers you need a very specific construct in order to not
> only cause a speculative load of an address you choose but also to then
> manage to cause a second operation that in some way reveals bits of data
> or allows you to ask questions.
> 
> BPF allows you to construct those sequences relatively easily and it's
> the one case where a user space application can fairly easily place code
> it wants to execute in the kernel. Without BPF you have to find the right
> construct in the kernel, prime all the right predictions and measure the
> result without getting killed off. There are places you can do that but
> they are not so easy and we don't (at this point) think there are that
> many.

for BPF in particular we're thinking to do a different fix.
Instead of killing speculation we can let cpu speculate.
The fix will include rounding up bpf maps to the nearest power of 2 and
inserting a bpf_and operation on the index after the bounds check,
so the cpu can continue to speculate beyond the bounds check, but it will
load from zero-ed memory.
So this nospec arch dependent hack won't be necessary for bpf side,
but may still be needed in other parts of the kernel.

Also note that variant 1 is talking about exploiting prog_array
bpf feature which had 64-bit access prior to
commit 90caccdd8cc0 ("bpf: fix bpf_tail_call() x64 JIT")
That was a fix for JIT and not related to this cpu issue, but
I believe it breaks the existing exploit.

Since it's not clear whether it's still possible to use bpf
with 32-bit speculation only, we're going to do this rounding fix
for unpriv just in case.

^ permalink raw reply

* Re: "lockless" qdisc breaks tx_queue_len change too?
From: John Fastabend @ 2018-01-04  3:03 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAM_iQpV2re1q1B+NQJtOPJRE28edpzQMEDot04fxLHdqFe5s1Q@mail.gmail.com>

On 01/03/2018 03:41 PM, Cong Wang wrote:
> On Wed, Jan 3, 2018 at 10:09 AM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> On 01/02/2018 08:41 PM, Cong Wang wrote:
>>> Hi, John
>>>
>>> While reviewing your ptr_ring fix again today, it looks like your
>>> "lockless" qdisc patchset breaks dev->tx_queue_len behavior.
>>>
>>> Before your patchset, dev->tx_queue_len is merely an integer to read,
>>> after your patchset, the skb array has to be resized when
>>> dev->tx_queue_len changes, but I don't see any qdisc code handles
>>> this...
>>>
>>> Also, because of that, I doubt __skb_array_empty() in
>>> pfifo_fast_dequeue() can be safe any more even with your ptr_ring fix.
>>>
>>> What am I missing?
>>>
>>
>> I dropped support for tx_queue_len changes after qdisc has been
>> created. The only check is at init time when building the qdisc.
> 
> This is where it breaks.
> 
> 
>>
>> Before this series teql and pfifo_fast were the only qdiscs that
>> used tx_queue_len other qdiscs used other mechanisms or copied
>> tx_queue_len at init time. So the API is inconsistent.
> 
> Yeah, pfifo_fast was able to drop based on latest value of tx_queue_len
> before your patchset, this is why I am complaining.
> 

Yep good complaint.

> 
>>
>> OK, but arguably its kAPI now and needs to be supported on live
>> qdiscs. So couple options drop the __skb_array_empty() check,
>> stop supporting changes on running qdiscs, or do a qdisc swap
>> with the new array.
> 
> I don't think we can break the old behavior of tx_queue_len change
> for pfifo_fast, people may already rely on it.
> 

Agreed needed for legacy support.

> Doing a swap seems reasonable.
> 
>>
>> I'm tempted to make the qdisc swap work, still need benchmarks
>> I guess without the empty check. Either way to get it working
>> we need a callback from tx_queue_len code paths.
> 
> Right, probably need a new ops in Qdisc_ops.
> 

Maybe instead of a Qdisc op just do a direct call to avoid
encouraging users to use this code path. Either way is
probably fine we can just watch any future patches and
have users add a specific attribute for it like codel.

> 
>>
>> Unfortunately, I guess someone somewhere probably uses pfifo_fast
>> and changes their queue length with a script after creating the
>> qdisc and expects it to work.
>>
> 
> This is my concern as well. I will work on some patches, this doesn't
> look trivial to solve at all.


How about a dev_deactivate_many() that instead of replacing
with noop qdisc replaces with new updated qdisc. Seems like
it might work.

Thanks,
John

> 
> Thanks.
> 

^ permalink raw reply

* [PATCH net,stable 1/1] net: fec: free/restore resource in related probe error pathes
From: Fugang Duan @ 2018-01-04  2:47 UTC (permalink / raw)
  To: troy.kisky, davem; +Cc: netdev, festevam, fugang.duan

Fixes in probe error path:
- Restore dev_id before failed_ioremap path.
  Fixes: ("net: fec: restore dev_id in the cases of probe error")
- Call of_node_put(phy_node) before failed_phy path.
  Fixes: ("net: fec: Support phys probed from devicetree and fixed-link")

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index feed383..90aa69a 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3576,11 +3576,11 @@ static int fec_enet_get_irq_cnt(struct platform_device *pdev)
 failed_clk:
 	if (of_phy_is_fixed_link(np))
 		of_phy_deregister_fixed_link(np);
-failed_phy:
 	of_node_put(phy_node);
+failed_phy:
+	dev_id--;
 failed_ioremap:
 	free_netdev(ndev);
-	dev_id--;
 
 	return ret;
 }
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 2/2] net: revert "Update RFS target at poll for tcp/udp"
From: Soheil Hassas Yeganeh @ 2018-01-04  2:47 UTC (permalink / raw)
  To: davem, netdev
  Cc: pjt, ycheng, Soheil Hassas Yeganeh, Willem de Bruijn,
	Eric Dumazet, Neal Cardwell
In-Reply-To: <20180104024711.257600-1-soheil.kdev@gmail.com>

From: Soheil Hassas Yeganeh <soheil@google.com>

On multi-threaded processes, one common architecture is to have
one (or a small number of) threads polling sockets, and a
considerably larger pool of threads reading from and writing to the
sockets. When we set RPS core on tcp_poll() or udp_poll() we essentially
steer all packets of all the polled FDs to one (or a small number of)
cores, creating a bottleneck and/or RPS misprediction.

Another common architecture is to shard FDs among threads pinned
to cores. In such a setting, setting RPS core in tcp_poll() and
udp_poll() is redundant because the RFS core is correctly
set in recvmsg and sendmsg.

Thus, revert the following commit:
c3f1dbaf6e28 ("net: Update RFS target at poll for tcp/udp").

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 net/ipv4/tcp.c | 2 --
 net/ipv4/udp.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7ac583a2b9fe..f68cb33d50d1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -498,8 +498,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_rps_record_flow(sk);
-
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	state = inet_sk_state_load(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9c0d1e1772e..db72619e07e4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2490,8 +2490,6 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
 		mask |= POLLIN | POLLRDNORM;
 
-	sock_rps_record_flow(sk);
-
 	/* Check for false positives due to checksum errors */
 	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
-- 
2.16.0.rc0.223.g4a4ac83678-goog

^ permalink raw reply related

* [PATCH net-next 1/2] ip: do not set RFS core on error queue reads
From: Soheil Hassas Yeganeh @ 2018-01-04  2:47 UTC (permalink / raw)
  To: davem, netdev
  Cc: pjt, ycheng, Soheil Hassas Yeganeh, Willem de Bruijn,
	Eric Dumazet, Neal Cardwell

From: Soheil Hassas Yeganeh <soheil@google.com>

We should only record RPS on normal reads and writes.
In single threaded processes, all calls record the same state. In
multi-threaded processes where a separate thread processes
errors, the RFS table mispredicts.

Note that, when CONFIG_RPS is disabled, sock_rps_record_flow
is a noop and no branch is added as a result of this patch.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 net/ipv4/af_inet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index bab98a4fedad..54cccdd8b1e3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -790,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 	int addr_len = 0;
 	int err;
 
-	sock_rps_record_flow(sk);
+	if (likely(!(flags & MSG_ERRQUEUE)))
+		sock_rps_record_flow(sk);
 
 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
 				   flags & ~MSG_DONTWAIT, &addr_len);
-- 
2.16.0.rc0.223.g4a4ac83678-goog

^ permalink raw reply related

* Re: [patch net-next v2 00/10] Add support for resource abstraction
From: David Ahern @ 2018-01-04  2:28 UTC (permalink / raw)
  To: Arkadi Sharshevsky, Jiri Pirko, netdev, roopa
  Cc: davem, mlxsw, andrew, vivien.didelot, f.fainelli, michael.chan,
	ganeshgr, saeedm, matanb, leonro, idosch, jakub.kicinski, ast,
	daniel, simon.horman, pieter.jansenvanvuuren, john.hurley,
	alexander.h.duyck, linville, gospo, steven.lin1, yuvalm, ogerlitz
In-Reply-To: <0f861e90-63d3-2666-ef2d-0fc91beae957@mellanox.com>

On 1/3/18 11:05 AM, Arkadi Sharshevsky wrote:
> 
> 
> On 01/02/2018 08:05 PM, David Ahern wrote:
>> On 1/1/18 7:58 AM, Arkadi Sharshevsky wrote:
>>>
>>> Just to summarize the current fixes required:
>>>
>>> 1. ERIF dpipe table size is reporting wrong size. More precisely the
>>>    ERIF table does not take rifs, so it should not be linked to the rif
>>>    bank resource (is not part of this patchset, future extension).
>>> 2. Extended ACK user-space bug.
>>> 3. ABI documentation- Not sure we agreed upon it, Jiri?
>>>
>>> If I missed something please respond. Nothing of the fixes mentioned
>>> above is relevant for this patchset actually.
>>>
>>
>> Can you fix the userspace command and then we come back to what else is
>> needed? Right now, it is hard to tell what is a user space bug and what
>> is a kernel space bug.
>>
>> For example:
>> $ devlink resource set pci/0000:03:00.0 path /kvd/linear size 10000
>> $ devlink resource show pci/0000:03:00.0
>> pci/0000:03:00.0:
>>   name kvd size 245760 size_valid true
>>   resources:
>>     name linear size 98304 occ 0
>>     name hash_double size 60416
>>     name hash_single size 87040
>>
>> The set command did not fail, yet there is no size_new arg in the output
>> like there is for this change:
>>
>> $ devlink resource set pci/0000:03:00.0 path /kvd/linear size 0
>> $ devlink resource show pci/0000:03:00.0
>> pci/0000:03:00.0:
>>   name kvd size 245760 size_valid true
>>   resources:
>>     name linear size 98304 size_new 0 occ 0
>>     name hash_double size 60416
>>     name hash_single size 87040
>>
> 
> As I stated this is a user-space bug which I fixed, and updated my repo
> so please pull. Devlink uses mnl,and currently mnl does not support
> extended ack. I added support for this in my local ver of libmnl:
> 
> https://github.com/arkadis/libmnl.git
> 
> On branch master, so you can check it out. Besides these bugs, which were
> userspace, can you please specify what the pending problems are from your
> point of view? Thanks!
> 

Again, my comments all stem from user experience.

Can you explain what "double_word" means for a unit? I would expect
units to be kB or count (or items or entries).

$ devlink resource show pci/0000:03:00.0
pci/0000:03:00.0:
  name kvd size 245760 unit double_word size_valid true
  resources:
    name linear size 98304 occ 0 unit double_word
    name hash_double size 60416 unit double_word
    name hash_single size 87040 unit double_word

While that is confusing here from the userspace command it goes hand in
hand with patch 2 and:

+enum devlink_resource_unit {
+	DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+};


Also, it seems like the occ of 0 is wrong since we know from past
responses that if I set linear to 0 all of networking breaks.



How does a user learn the granularity of a resource:

$ devlink resource set pci/0000:03:00.0 path /kvd/hash_double size 50000
Error: mlxsw_spectrum: resource set with wrong granularity.

Try again with 51000 and then 52000 and ... Why not export the
granularity read-only? I don't see it in the proposed UAPI.


And then on the reload:

$ devlink reload pci/0000:03:00.0
Error: devlink: resources size validation failed.

Since the reload is not doing any resource sizing that error message is
confusing. Maybe something like "Sum of the resource components exceeds
total size."


Finally, I still contend a 1-line description of each of the resources
goes a long way to improving the user friendliness of this set.

^ permalink raw reply

* RE: [PATCH v3 net,stable 0/2] net: fec: clean up in the cases of probe error
From: Andy Duan @ 2018-01-04  2:25 UTC (permalink / raw)
  To: David Miller
  Cc: festevam@gmail.com, netdev@vger.kernel.org,
	troy.kisky@boundarydevices.com, andrew@lunn.ch
In-Reply-To: <20180103.212241.1465013589762600791.davem@davemloft.net>

From: David Miller <davem@davemloft.net> Sent: Thursday, January 04, 2018 10:23 AM
>> The simple patches just clean up in the cases of probe error like
>> restore dev_id and handle the defer probe when regulator is still not ready.
>
>As I stated, v2 of these patches are already in my tree, so this patch series will
>not apply.
>
>You need to send me fixes relative to v2.

Okay, got it.  I will submit another patch to fix it. Sorry for the inconvenience.
Thanks very much.

^ permalink raw reply

* Re: [PATCH v3 net,stable 0/2] net: fec: clean up in the cases of probe error
From: David Miller @ 2018-01-04  2:22 UTC (permalink / raw)
  To: fugang.duan; +Cc: festevam, netdev, troy.kisky, andrew
In-Reply-To: <1515032129-7899-1-git-send-email-fugang.duan@nxp.com>

From: Fugang Duan <fugang.duan@nxp.com>
Date: Thu, 4 Jan 2018 10:15:27 +0800

> The simple patches just clean up in the cases of probe error like restore dev_id and
> handle the defer probe when regulator is still not ready.

As I stated, v2 of these patches are already in my tree, so
this patch series will not apply.

You need to send me fixes relative to v2.

^ permalink raw reply

* Re: [PATCH v2 net,stable 0/2] net: fec: clean up in the cases of probe error
From: David Miller @ 2018-01-04  2:20 UTC (permalink / raw)
  To: fugang.duan; +Cc: festevam, netdev, troy.kisky, andrew
In-Reply-To: <AM4PR0401MB2260428F5AAE1C6E3A6EE175FF1F0@AM4PR0401MB2260.eurprd04.prod.outlook.com>

From: Andy Duan <fugang.duan@nxp.com>
Date: Thu, 4 Jan 2018 02:07:40 +0000

> Sorry, pls hold on apply these series, there have one comment from
> Troy kisky, sorry for the inconvenience due to my mistake.

Once I have applied patches, they are there in my GIT tree and
cannot be "undone".

You have to send me relative fixes on top of your changes.

^ permalink raw reply

* [PATCH v3 net,stable 2/2] net: fec: defer probe if regulator is not ready
From: Fugang Duan @ 2018-01-04  2:15 UTC (permalink / raw)
  To: festevam, davem; +Cc: netdev, troy.kisky, andrew, fugang.duan
In-Reply-To: <1515032129-7899-1-git-send-email-fugang.duan@nxp.com>

Defer probe if regulator is not ready. E.g. some regulator is fixed
regulator controlled by i2c expander gpio, the i2c device may be probed
after the driver, then it should handle the case of defer probe error.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 732a8e3..a32fbf5 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3489,6 +3489,10 @@ static int fec_enet_get_irq_cnt(struct platform_device *pdev)
 			goto failed_regulator;
 		}
 	} else {
+		if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
+			ret = -EPROBE_DEFER;
+			goto failed_regulator;
+		}
 		fep->reg_phy = NULL;
 	}
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 net,stable 1/2] net: fec: restore dev_id in the cases of probe error
From: Fugang Duan @ 2018-01-04  2:15 UTC (permalink / raw)
  To: festevam, davem; +Cc: netdev, troy.kisky, andrew, fugang.duan
In-Reply-To: <1515032129-7899-1-git-send-email-fugang.duan@nxp.com>

The static variable dev_id is always incremented by one before the netdev
is registered. The dev_id value should be restored in the cases of probe error.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
---
 drivers/net/ethernet/freescale/fec_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index e17d10b..732a8e3 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3574,6 +3574,7 @@ static int fec_enet_get_irq_cnt(struct platform_device *pdev)
 		of_phy_deregister_fixed_link(np);
 failed_phy:
 	of_node_put(phy_node);
+	dev_id--;
 failed_ioremap:
 	free_netdev(ndev);
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 net,stable 0/2] net: fec: clean up in the cases of probe error
From: Fugang Duan @ 2018-01-04  2:15 UTC (permalink / raw)
  To: festevam, davem; +Cc: netdev, troy.kisky, andrew, fugang.duan

The simple patches just clean up in the cases of probe error like restore dev_id and
handle the defer probe when regulator is still not ready.

v2:
* Fabio Estevam's comment to suggest split v1 to separate patches.
v3:
* Restore dev_id before failed_ioremap path from Troy Kisky's comment.

Fugang Duan (2):
  net: fec: restore dev_id in the cases of probe error
  net: fec: defer probe if regulator is not ready

 drivers/net/ethernet/freescale/fec_main.c | 5 +++++
 1 file changed, 5 insertions(+)

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH net-next] net: sched: fix tcf_block_get_ext() in case CONFIG_NET_CLS is not set
From: Cong Wang @ 2018-01-04  2:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Linux Kernel Network Developers, oss-drivers, Jiri Pirko,
	Alexander Aring, Quentin Monnet
In-Reply-To: <20180104013045.29510-1-jakub.kicinski@netronome.com>

On Wed, Jan 3, 2018 at 5:30 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> From: Quentin Monnet <quentin.monnet@netronome.com>
>
> The definition of functions tcf_block_get() and tcf_block_get_ext()
> depends of CONFIG_NET_CLS being set. When those functions gained extack
> support, only one version of the declaration of those functions was
> updated. Function tcf_block_get() was later fixed with commit
> 3c1490913f3b ("net: sch: api: fix tcf_block_get").
>
> Change arguments of tcf_block_get_ext() for the case when CONFIG_NET_CLS
> is not set.

There is one already:
https://patchwork.kernel.org/patch/10130849/

^ permalink raw reply

* RE: [PATCH v2 net,stable 0/2] net: fec: clean up in the cases of probe error
From: Andy Duan @ 2018-01-04  2:07 UTC (permalink / raw)
  To: David Miller
  Cc: festevam@gmail.com, netdev@vger.kernel.org,
	troy.kisky@boundarydevices.com, andrew@lunn.ch
In-Reply-To: <20180103.114057.1176360024955452896.davem@davemloft.net>

From: David Miller <davem@davemloft.net> Sent: Thursday, January 04, 2018 12:41 AM
>> The simple patches just clean up in the cases of probe error like
>> restore dev_id and handle the defer probe when regulator is still not ready.
>>
>> v2:
>> * Fabio Estevam's comment to suggest split v1 to separate patches.
>
>Series applied and queued up for -stable, thanks.

Sorry, please hold off on applying this series; there is one comment from Troy Kisky. Sorry for the inconvenience due to my mistake.

^ permalink raw reply

* [net-next PATCH 2/2] bpf: only build sockmap with CONFIG_INET
From: John Fastabend @ 2018-01-04  1:57 UTC (permalink / raw)
  To: borkmann, alexei.starovoitov; +Cc: netdev
In-Reply-To: <20180104015739.14160.96127.stgit@john-Precision-Tower-5810>

The sockmap infrastructure is only aware of TCP sockets at the
moment. In the future we plan to add UDP. In both cases CONFIG_INET
should be built-in.

So let's only build sockmap if CONFIG_INET is enabled.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/bpf.h       |    2 +-
 include/linux/bpf_types.h |    2 +-
 kernel/bpf/Makefile       |    2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7810ae5..9e03046 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -554,7 +554,7 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
+#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 978c1d9..19b8349 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -42,7 +42,7 @@
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#ifdef CONFIG_STREAM_PARSER
+#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0..a713fd2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
+ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
 endif
+endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif

^ permalink raw reply related

* [net-next PATCH 1/2] bpf: sockmap remove unused function
From: John Fastabend @ 2018-01-04  1:57 UTC (permalink / raw)
  To: borkmann, alexei.starovoitov; +Cc: netdev

This was added for some work that was eventually factored out but the
helper call was missed. Remove it now and add it back later if needed.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/sockmap.c |    8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41..3f662ee 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
-/* compute the linear packet data range [data, data_end) for skb when
- * sk_skb type programs are in use.
- */
-static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
-{
-	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
-}
-
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,

^ permalink raw reply related

* [PATCH net-next] net: sched: fix tcf_block_get_ext() in case CONFIG_NET_CLS is not set
From: Jakub Kicinski @ 2018-01-04  1:30 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, jiri, aring, Quentin Monnet

From: Quentin Monnet <quentin.monnet@netronome.com>

The definition of functions tcf_block_get() and tcf_block_get_ext()
depends on CONFIG_NET_CLS being set. When those functions gained extack
support, only one version of the declaration of those functions was
updated. Function tcf_block_get() was later fixed with commit
3c1490913f3b ("net: sch: api: fix tcf_block_get").

Change arguments of tcf_block_get_ext() for the case when CONFIG_NET_CLS
is not set.

Fixes: 8d1a77f974ca ("net: sch: api: add extack support in tcf_block_get")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 include/net/pkt_cls.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 5cd3cf51cb35..c4f4e46ea8d6 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -87,7 +87,8 @@ int tcf_block_get(struct tcf_block **p_block,
 
 static inline
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
-		      struct tcf_block_ext_info *ei)
+		      struct tcf_block_ext_info *ei,
+		      struct netlink_ext_ack *extack)
 {
 	return 0;
 }
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH] net/mlx5e: hide an unused variable
From: Saeed Mahameed @ 2018-01-04  1:04 UTC (permalink / raw)
  To: Arnd Bergmann, Matan Barak, Leon Romanovsky
  Cc: Or Gerlitz, Hadar Hen Zion, David S. Miller, Paul Blakey,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180103224022.3737385-1-arnd-r2nGTMty4D4@public.gmane.org>



On 1/3/2018 2:40 PM, Arnd Bergmann wrote:
> The uplink_rpriv variable was added at the start of the function but
> only used inside of an #ifdef:
> 
> drivers/net/ethernet/mellanox/mlx5/core/en_tc.c: In function 'mlx5e_route_lookup_ipv6':
> drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:1549:25: error: unused variable 'uplink_rpriv' [-Werror=unused-variable]
> 
> This moves the declaration into that #ifdef as well.
> 
> Fixes: 5ed99fb421d4 ("net/mlx5e: Move ethernet representors data into separate struct")
> Signed-off-by: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>

Acked-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Thank you Arnd.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [wireless-testsing2:master 1/4] drivers/net/netdevsim/bpf.c:130:14: sparse: incompatible types for 'case' statement
From: Jakub Kicinski @ 2018-01-04  1:02 UTC (permalink / raw)
  To: kbuild test robot; +Cc: David S. Miller, kbuild-all, netdev, Bob Copeland
In-Reply-To: <201801040317.tEFQay7W%fengguang.wu@intel.com>

On Thu, 4 Jan 2018 03:53:20 +0800, kbuild test robot wrote:
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-testing.git master
> head:   6b3b30d0c31ddb2f4d8208c90bc2b4adef47204d
> commit: af2cae39f6ab9dc596616d6a28c7772e1dd55e91 [1/4] Merge remote-tracking branch 'wireless-drivers-next/master'
> reproduce:
>         # apt-get install sparse
>         git checkout af2cae39f6ab9dc596616d6a28c7772e1dd55e91
>         make ARCH=x86_64 allmodconfig
>         make C=1 CF=-D__CHECK_ENDIAN__

>    drivers/net/netdevsim/bpf.c: In function 'nsim_bpf_setup_tc_block_cb':
> >> drivers/net/netdevsim/bpf.c:130:7: error: 'TC_CLSBPF_REPLACE' undeclared (first use in this function); did you mean 'TC_RED_REPLACE'?  
>      case TC_CLSBPF_REPLACE:
>           ^~~~~~~~~~~~~~~~~
>           TC_RED_REPLACE

FWIW looks like the tree contains old net-next code and latest net
(linux/master) code.  Pulling from net-next will solve this.

> :::::: TO: Jakub Kicinski <jakub.kicinski@netronome.com>
> :::::: CC: Daniel Borkmann <daniel@iogearbox.net>

Interestingly Daniel and I were not CCed on the report, is this
intentional?

^ permalink raw reply

* Re: [PATCH net-next 0/2] Enable virtio to act as a master for a passthru device
From: Samudrala, Sridhar @ 2018-01-04  0:22 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Jakub Kicinski, Brandeburg, Jesse, Michael S. Tsirkin,
	Stephen Hemminger, Netdev, virtualization, virtio-dev,
	Alexander Duyck
In-Reply-To: <CAKgT0UdtHVq3WWpjHggNabw_+2piaRQAKguivV690HdzKrg62w@mail.gmail.com>

On 1/3/2018 10:28 AM, Alexander Duyck wrote:
> On Wed, Jan 3, 2018 at 10:14 AM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>>
>> On 1/3/2018 8:59 AM, Alexander Duyck wrote:
>>> On Tue, Jan 2, 2018 at 6:16 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>> On Tue,  2 Jan 2018 16:35:36 -0800, Sridhar Samudrala wrote:
>>>>> This patch series enables virtio to switch over to a VF datapath when a
>>>>> VF
>>>>> netdev is present with the same MAC address. It allows live migration of
>>>>> a VM
>>>>> with a direct attached VF without the need to setup a bond/team between
>>>>> a
>>>>> VF and virtio net device in the guest.
>>>>>
>>>>> The hypervisor needs to unplug the VF device from the guest on the
>>>>> source
>>>>> host and reset the MAC filter of the VF to initiate failover of datapath
>>>>> to
>>>>> virtio before starting the migration. After the migration is completed,
>>>>> the
>>>>> destination hypervisor sets the MAC filter on the VF and plugs it back
>>>>> to
>>>>> the guest to switch over to VF datapath.
>>>>>
>>>>> It is based on netvsc implementation and it may be possible to make this
>>>>> code
>>>>> generic and move it to a common location that can be shared by netvsc
>>>>> and virtio.
>>>>>
>>>>> This patch series is based on the discussion initiated by Jesse on this
>>>>> thread.
>>>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>>> How does the notion of a device which is both a bond and a leg of a
>>>> bond fit with Alex's recent discussions about feature propagation?
>>>> Which propagation rules will apply to VirtIO master?  Meaning of the
>>>> flags on a software upper device may be different.  Why muddy the
>>>> architecture like this and not introduce a synthetic bond device?
>>> It doesn't really fit with the notion I had. I think there may have
>>> been a bit of a disconnect as I have been out for the last week or so
>>> for the holidays.
>>>
>>> My thought on this was that the feature bit should be spawning a new
>>> para-virtual bond device and that bond should have the virto and the
>>> VF as slaves. Also I thought there was some discussion about trying to
>>> reuse as much of the netvsc code as possible for this so that we could
>>> avoid duplication of effort and have the two drivers use the same
>>> approach. It seems like it should be pretty straight forward since you
>>> would have the feature bit in the case of virto, and netvsc just does
>>> this sort of thing by default if I am not mistaken.
>> This patch is mostly based on netvsc implementation. The only change is
>> avoiding the
>> explicit dev_open() call of the VF netdev after a delay. I am assuming that
>> the guest userspace
>> will bring up the VF netdev and the hypervisor will update the MAC filters
>> to switch to
>> the right data path.
>> We could commonize the code and make it shared between netvsc and virtio. Do
>> we want
>> to do this right away or later? If so, what would be a good location for
>> these shared functions?
>> Is it net/core/dev.c?
> No, I would think about starting a new driver file in "/drivers/net/".
> The idea is this driver would be utilized to create a bond
> automatically and set the appropriate registration hooks. If nothing
> else you could probably just call it something generic like virt-bond
> or vbond or whatever.

We are trying to avoid creating another driver or a device.  Can we look into
consolidation of the 2 implementations (virtio & netvsc) as a later patch?
>
>> Also, if we want to go with a solution that creates a bond device, do we
>> want virtio_net/netvsc
>> drivers to create a upper device?  Such a solution is already possible via
>> config scripts that can
>> create a bond with virtio and a VF net device as slaves.  netvsc and this
>> patch series is trying to
>> make it as simple as possible for the VM to use directly attached devices
>> and support live migration
>> by switching to virtio datapath as a backup during the migration process
>> when the VF device
>> is unplugged.
> We all understand that. But you are making the solution very virtio
> specific. We want to see this be usable for other interfaces such as
> netsc and whatever other virtual interfaces are floating around out
> there.
>
> Also I haven't seen us address what happens as far as how we will
> handle this on the host. My thought was we should have a paired
> interface. Something like veth, but made up of a bond on each end. So
> in the host we should have one bond that has a tap/vhost interface and
> a VF port representor, and on the other we would be looking at the
> virtio interface and the VF. Attaching the tap/vhost to the bond could
> be a way of triggering the feature bit to be set in the virtio. That
> way communication between the guest and the host won't get too
> confusing as you will see all traffic from the bonded MAC address
> always show up on the host side bond instead of potentially showing up
> on two unrelated interfaces. It would also make for a good way to
> resolve the east/west traffic problem on hosts since you could just
> send the broadcast/multicast traffic via the tap/vhost/virtio channel
> instead of having to send it back through the port representor and eat
> up all that PCIe bus traffic.
From the host point of view, here is a simple script that needs to be run to
do the live migration. We don't need any bond configuration on the host.

virsh detach-interface $DOMAIN hostdev --mac $MAC
ip link set $PF vf $VF_NUM mac $ZERO_MAC

virsh migrate --live $DOMAIN qemu+ssh://$REMOTE_HOST/system

ssh $REMOTE_HOST ip link set $PF vf $VF_NUM mac $MAC
ssh $REMOTE_HOST virsh attach-interface $DOMAIN hostdev $REMOTE_HOSTDEV 
--mac $MAC

^ permalink raw reply

* Re: [patch net-next v2 00/10] Add support for resource abstraction
From: Arkadi Sharshevsky @ 2018-01-04  0:07 UTC (permalink / raw)
  To: David Ahern, Jiri Pirko
  Cc: netdev, roopa, davem, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz
In-Reply-To: <49c72225-6437-54d5-a046-96fff5b65ce9@cumulusnetworks.com>



On 01/03/2018 08:29 PM, David Ahern wrote:
> On 1/3/18 11:17 AM, Jiri Pirko wrote:
>> Wed, Jan 03, 2018 at 07:14:16PM CET, dsa@cumulusnetworks.com wrote:
>>> On 1/3/18 11:05 AM, Arkadi Sharshevsky wrote:
>>>> As I stated this is a user-space bug which I fixed, and updated my repo
>>>> so please pull. Devlink uses mnl,and currently mnl does not support
>>>> extended ack. I added support for this in my local ver of libmnl:
>>>>
>>>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Farkadis%2Flibmnl.git&data=02%7C01%7Carkadis%40mellanox.com%7C5c86b6240eb84459c6ae08d552d7f9a4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636506009929977440&sdata=sgrNzMhPwe63BIVxexZTjl%2FXqW51kpuRiHVhTDNaa70%3D&reserved=0
>>>>
>>>> On branch master, so you can check it out. Besides this bugs, which were
>>>> userspace, can please specify what are the pending problems from your
>>>> point of view? Thanks!
>>>
>>> devlink is in iproute2 package and it has extack support. See 'git log
>>> lib/libnetlink.c'
>>
>> Dave, devlink uses libmnl.
>>
> 
> Now I remember. You wrote it independently and but needed iproute2 be a
> delivery vehicle. It uses none of the common infrastructure from
> iproute2. Could we make this more difficult ....
> 
> Sometime in the next day I will jump through the hoops to get a proper
> devlink command.
> 

This actually was very confusing. I think the extack should be
handled by libmnl, and iproute should use the mnl_cb_run() routines
rather than implement its own. That way we could both benefit
from that.

You actually do use libmnl in libnetlink.c only for parsing
the headers, and it's a dependency for extack handling.

I see this as a completely independent user space issue, which
doesn't have to do anything with this patchset. Not to mention
that everything is working right now.

^ permalink raw reply

* Re: [PATCH bpf-next v4 1/3] libbpf: add function to setup XDP
From: Eric Leblond @ 2018-01-03 23:59 UTC (permalink / raw)
  To: daniel, Toshiaki Makita, Philippe Ombredanne
  Cc: Alexei Starovoitov, netdev, linux-kernel
In-Reply-To: <20171230204116.30871-2-eric@regit.org>

Hello,

On Sat, 2017-12-30 at 21:41 +0100, Eric Leblond wrote:
> Most of the code is taken from set_link_xdp_fd() in bpf_load.c and
> slightly modified to be library compliant.

I've just discovered this patch is breaking the build of samples/bpf/
(nlattr not included at least and some int type problem). I'm going to
resubmit a patchset fixing this.

Sorry for the noise.

Best regards,
-- 
Eric Leblond <eric@regit.org>
Blog: https://home.regit.org/

^ permalink raw reply

* Re: [patch net-next v4 00/10] net: sched: allow qdiscs to share filter block instances
From: Jakub Kicinski @ 2018-01-03 23:51 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Ahern, netdev, davem, jhs, xiyou.wangcong, mlxsw, andrew,
	vivien.didelot, f.fainelli, michael.chan, ganeshgr, saeedm,
	matanb, leonro, idosch, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel
In-Reply-To: <20180103172209.GD2067@nanopsycho.orion>

On Wed, 3 Jan 2018 18:22:09 +0100, Jiri Pirko wrote:
> However I don't agree about breaking the existing filter add and show
> and also imposibility to make not-shared block shared in the runtime
> before defining it first.

FWIW I would agree with David that allowing add on a shared block
to modify filters on another interface can break existing users.  (No
opinion on dump and lifetime).

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox