Netdev List
 help / color / mirror / Atom feed
* [PATCH] vif queue counters from int to long
From: Mart van Santen @ 2016-12-23 15:09 UTC (permalink / raw)
  To: netdev; +Cc: ian.campbell, wei.liu2


[-- Attachment #1.1: Type: text/plain, Size: 1388 bytes --]


Hello,

This patch fixes an issue where counters in the queue have type int,
while the counters of the vif itself are specified as long. This can
cause incorrect reporting of tx/rx values of the vif interface.
More extensively reported on xen-devel mailinglist.



Signed-off-by: Mart van Santen <mart@greenhost.nl>
--- a/drivers/net/xen-netback/common.h  2016-12-22 15:41:07.785535748 +0000
+++ b/drivers/net/xen-netback/common.h  2016-12-23 13:08:18.123080064 +0000
@@ -113,10 +113,10 @@ struct xenvif_stats {
         * A subset of struct net_device_stats that contains only the
         * fields that are updated in netback.c for each queue.
         */
-       unsigned int rx_bytes;
-       unsigned int rx_packets;
-       unsigned int tx_bytes;
-       unsigned int tx_packets;
+       unsigned long rx_bytes;
+       unsigned long rx_packets;
+       unsigned long tx_bytes;
+       unsigned long tx_packets;

        /* Additional stats used by xenvif */
        unsigned long rx_gso_checksum_fixup;

-- 
Mart van Santen
Greenhost
E: mart@greenhost.nl
T: +31 20 4890444
W: https://greenhost.nl

A PGP signature can be attached to this e-mail,
you need PGP software to verify it. 
My public key is available in keyserver(s)
see: http://tinyurl.com/openpgp-manual

PGP Fingerprint: CA85 EB11 2B70 042D AF66  B29A 6437 01A1 10A3 D3A5



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply

* [PATCH net 9/9] virtio-net: XDP support for small buffers
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

Commit f600b6905015 ("virtio_net: Add XDP support") leaves the case of
small receive buffer untouched. This will confuse the user who want to
set XDP but use small buffers. Other than forbid XDP in small buffer
mode, let's make it work. XDP then can only work at skb->data since
virtio-net create skbs during refill, this is sub optimal which could
be optimized in the future.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 112 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53365a8..5deeda6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -333,9 +333,9 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 static void virtnet_xdp_xmit(struct virtnet_info *vi,
 			     struct receive_queue *rq,
 			     struct send_queue *sq,
-			     struct xdp_buff *xdp)
+			     struct xdp_buff *xdp,
+			     void *data)
 {
-	struct page *page = virt_to_head_page(xdp->data);
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int num_sg, len;
 	void *xdp_sent;
@@ -343,20 +343,45 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
-		put_page(sent_page);
+		if (vi->mergeable_rx_bufs) {
+			struct page *sent_page = virt_to_head_page(xdp_sent);
+
+			put_page(sent_page);
+		} else { /* small buffer */
+			struct sk_buff *skb = xdp_sent;
+
+			kfree_skb(skb);
+		}
 	}
 
-	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
-	memset(hdr, 0, vi->hdr_len);
+	if (vi->mergeable_rx_bufs) {
+		/* Zero header and leave csum up to XDP layers */
+		hdr = xdp->data;
+		memset(hdr, 0, vi->hdr_len);
+
+		num_sg = 1;
+		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	} else { /* small buffer */
+		struct sk_buff *skb = data;
 
-	num_sg = 1;
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+		/* Zero header and leave csum up to XDP layers */
+		hdr = skb_vnet_hdr(skb);
+		memset(hdr, 0, vi->hdr_len);
+
+		num_sg = 2;
+		sg_init_table(sq->sg, 2);
+		sg_set_buf(sq->sg, hdr, vi->hdr_len);
+		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+	}
 	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
-				   xdp->data, GFP_ATOMIC);
+				   data, GFP_ATOMIC);
 	if (unlikely(err)) {
-		put_page(page);
+		if (vi->mergeable_rx_bufs) {
+			struct page *page = virt_to_head_page(xdp->data);
+
+			put_page(page);
+		} else /* small buffer */
+			kfree_skb(data);
 		return; // On error abort to avoid unnecessary kick
 	}
 
@@ -366,23 +391,26 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 static u32 do_xdp_prog(struct virtnet_info *vi,
 		       struct receive_queue *rq,
 		       struct bpf_prog *xdp_prog,
-		       struct page *page, int offset, int len)
+		       void *data, int len)
 {
 	int hdr_padded_len;
 	struct xdp_buff xdp;
+	void *buf;
 	unsigned int qp;
 	u32 act;
-	u8 *buf;
-
-	buf = page_address(page) + offset;
 
-	if (vi->mergeable_rx_bufs)
+	if (vi->mergeable_rx_bufs) {
 		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-	else
-		hdr_padded_len = sizeof(struct padded_vnet_hdr);
+		xdp.data = data + hdr_padded_len;
+		xdp.data_end = xdp.data + (len - vi->hdr_len);
+		buf = data;
+	} else { /* small buffers */
+		struct sk_buff *skb = data;
 
-	xdp.data = buf + hdr_padded_len;
-	xdp.data_end = xdp.data + (len - vi->hdr_len);
+		xdp.data = skb->data;
+		xdp.data_end = xdp.data + len;
+		buf = skb->data;
+	}
 
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 	switch (act) {
@@ -392,8 +420,8 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 		qp = vi->curr_queue_pairs -
 			vi->xdp_queue_pairs +
 			smp_processor_id();
-		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
-		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
+		xdp.data = buf;
+		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data);
 		return XDP_TX;
 	default:
 		bpf_warn_invalid_xdp_action(act);
@@ -403,14 +431,47 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 	}
 }
 
-static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
+static struct sk_buff *receive_small(struct net_device *dev,
+				     struct virtnet_info *vi,
+				     struct receive_queue *rq,
+				     void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
+	struct bpf_prog *xdp_prog;
 
 	len -= vi->hdr_len;
 	skb_trim(skb, len);
 
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+		u32 act;
+
+		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+			goto err_xdp;
+		act = do_xdp_prog(vi, rq, xdp_prog, skb, len);
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_TX:
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
+			goto err_xdp;
+		}
+	}
+	rcu_read_unlock();
+
 	return skb;
+
+err_xdp:
+	rcu_read_unlock();
+	dev->stats.rx_dropped++;
+	kfree_skb(skb);
+xdp_xmit:
+	return NULL;
 }
 
 static struct sk_buff *receive_big(struct net_device *dev,
@@ -537,7 +598,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
-		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
+		act = do_xdp_prog(vi, rq, xdp_prog,
+				  page_address(xdp_page) + offset, len);
 		switch (act) {
 		case XDP_PASS:
 			/* We can only create skb based on xdp_page. */
@@ -672,7 +734,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	else if (vi->big_packets)
 		skb = receive_big(dev, vi, rq, buf, len);
 	else
-		skb = receive_small(vi, buf, len);
+		skb = receive_small(dev, vi, rq, buf, len);
 
 	if (unlikely(!skb))
 		return;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 8/9] virtio-net: remove big packet XDP codes
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

Now we in fact don't allow XDP for big packets, remove its codes.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 44 +++-----------------------------------------
 1 file changed, 3 insertions(+), 41 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c1f66d8..e53365a8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -344,11 +344,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		struct page *sent_page = virt_to_head_page(xdp_sent);
-
-		if (vi->mergeable_rx_bufs)
-			put_page(sent_page);
-		else
-			give_pages(rq, sent_page);
+		put_page(sent_page);
 	}
 
 	/* Zero header and leave csum up to XDP layers */
@@ -360,15 +356,8 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
 				   xdp->data, GFP_ATOMIC);
 	if (unlikely(err)) {
-		if (vi->mergeable_rx_bufs)
-			put_page(page);
-		else
-			give_pages(rq, page);
+		put_page(page);
 		return; // On error abort to avoid unnecessary kick
-	} else if (!vi->mergeable_rx_bufs) {
-		/* If not mergeable bufs must be big packets so cleanup pages */
-		give_pages(rq, (struct page *)page->private);
-		page->private = 0;
 	}
 
 	virtqueue_kick(sq->vq);
@@ -430,44 +419,17 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   void *buf,
 				   unsigned int len)
 {
-	struct bpf_prog *xdp_prog;
 	struct page *page = buf;
-	struct sk_buff *skb;
-
-	rcu_read_lock();
-	xdp_prog = rcu_dereference(rq->xdp_prog);
-	if (xdp_prog) {
-		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
-		u32 act;
-
-		if (unlikely(hdr->hdr.gso_type))
-			goto err_xdp;
-		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
-		switch (act) {
-		case XDP_PASS:
-			break;
-		case XDP_TX:
-			rcu_read_unlock();
-			goto xdp_xmit;
-		case XDP_DROP:
-		default:
-			goto err_xdp;
-		}
-	}
-	rcu_read_unlock();
+	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 
-	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
 	return skb;
 
-err_xdp:
-	rcu_read_unlock();
 err:
 	dev->stats.rx_dropped++;
 	give_pages(rq, page);
-xdp_xmit:
 	return NULL;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 7/9] virtio-net: forbid XDP when VIRTIO_NET_F_GUEST_UFO is support
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

When VIRTIO_NET_F_GUEST_UFO is negotiated, host could still send UFO
packet that exceeds a single page which could not be handled
correctly by XDP. So this patch forbids setting XDP when GUEST_UFO is
supported. While at it, forbid XDP for ECN (which comes only from GRO)
too to prevent user from misconfiguration.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 77ae358..c1f66d8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1684,7 +1684,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	int i, err;
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
-	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
 		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
 		return -EOPNOTSUPP;
 	}
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 6/9] virtio-net: make rx buf size estimation works for XDP
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

We don't update ewma rx buf size in the case of XDP. This will lead
underestimation of rx buf size which causes host to produce more than
one buffers. This will greatly increase the possibility of XDP page
linearization.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0778dc8..77ae358 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -584,10 +584,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page,
 						       0, len, PAGE_SIZE);
+				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 				return head_skb;
 			}
 			break;
 		case XDP_TX:
+			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			if (unlikely(xdp_page != page))
 				goto err_xdp;
 			rcu_read_unlock();
@@ -596,6 +598,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		default:
 			if (unlikely(xdp_page != page))
 				__free_pages(xdp_page, 0);
+			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			goto err_xdp;
 		}
 	}
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 5/9] virtio-net: unbreak csumed packets for XDP_PASS
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

We drop csumed packet when do XDP for packets. This breaks
XDP_PASS when GUEST_CSUM is supported. Fix this by allowing csum flag
to be set. With this patch, simple TCP works for XDP_PASS.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 470293e..0778dc8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -440,7 +440,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 		u32 act;
 
-		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
 		switch (act) {
@@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		 * the receive path after XDP is loaded. In practice I
 		 * was not able to create this condition.
 		 */
-		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
 		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 4/9] virtio-net: correctly handle XDP_PASS for linearized packets
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

When XDP_PASS were determined for linearized packets, we try to get
new buffers in the virtqueue and build skbs from them. This is wrong,
we should create skbs based on existed buffers instead. Fixing them by
creating skb based on xdp_page.

With this patch "ping 192.168.100.4 -s 3900 -M do" works for XDP_PASS.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 58ad40e..470293e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -578,8 +578,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
 		switch (act) {
 		case XDP_PASS:
-			if (unlikely(xdp_page != page))
-				__free_pages(xdp_page, 0);
+			/* We can only create skb based on xdp_page. */
+			if (unlikely(xdp_page != page)) {
+				rcu_read_unlock();
+				put_page(page);
+				head_skb = page_to_skb(vi, rq, xdp_page,
+						       0, len, PAGE_SIZE);
+				return head_skb;
+			}
 			break;
 		case XDP_TX:
 			if (unlikely(xdp_page != page))
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 3/9] virtio-net: fix page miscount during XDP linearizing
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

We don't put page during linearizing, the would cause leaking when
xmit through XDP_TX or the packet exceeds PAGE_SIZE. Fix them by
put page accordingly. Also decrease the number of buffers during
linearizing to make sure caller can free buffers correctly when packet
exceeds PAGE_SIZE. With this patch, we won't get OOM after linearize
huge number of packets.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fe4562d..58ad40e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -483,7 +483,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
  * anymore.
  */
 static struct page *xdp_linearize_page(struct receive_queue *rq,
-				       u16 num_buf,
+				       u16 *num_buf,
 				       struct page *p,
 				       int offset,
 				       unsigned int *len)
@@ -497,7 +497,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 	page_off += *len;
 
-	while (--num_buf) {
+	while (--*num_buf) {
 		unsigned int buflen;
 		unsigned long ctx;
 		void *buf;
@@ -507,19 +507,22 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 		if (unlikely(!ctx))
 			goto err_buf;
 
+		buf = mergeable_ctx_to_buf_address(ctx);
+		p = virt_to_head_page(buf);
+		off = buf - page_address(p);
+
 		/* guard against a misconfigured or uncooperative backend that
 		 * is sending packet larger than the MTU.
 		 */
-		if ((page_off + buflen) > PAGE_SIZE)
+		if ((page_off + buflen) > PAGE_SIZE) {
+			put_page(p);
 			goto err_buf;
-
-		buf = mergeable_ctx_to_buf_address(ctx);
-		p = virt_to_head_page(buf);
-		off = buf - page_address(p);
+		}
 
 		memcpy(page_address(page) + page_off,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
+		put_page(p);
 	}
 
 	*len = page_off;
@@ -555,7 +558,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		/* This happens when rx buffer size is underestimated */
 		if (unlikely(num_buf > 1)) {
 			/* linearize data for XDP */
-			xdp_page = xdp_linearize_page(rq, num_buf,
+			xdp_page = xdp_linearize_page(rq, &num_buf,
 						      page, offset, &len);
 			if (!xdp_page)
 				goto err_xdp;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 2/9] virtio-net: correctly xmit linearized page on XDP_TX
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

After we linearize page, we should xmit this page instead of the page
of first buffer which may lead unexpected result. With this patch, we
can see correct packet during XDP_TX.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1067253..fe4562d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
 			goto err_xdp;
 
-		act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
+		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
 		switch (act) {
 		case XDP_PASS:
 			if (unlikely(xdp_page != page))
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/9] virtio-net: remove the warning before XDP linearizing
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend
In-Reply-To: <1482503852-12438-1-git-send-email-jasowang@redhat.com>

Since we use EWMA to estimate the size of rx buffer. When rx buffer
size is underestimated, it's usual to have a packet with more than one
buffers. Consider this is not a bug, remove the warning and correct
the comment before XDP linearizing.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 08327e0..1067253 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -552,14 +552,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		struct page *xdp_page;
 		u32 act;
 
-		/* No known backend devices should send packets with
-		 * more than a single buffer when XDP conditions are
-		 * met. However it is not strictly illegal so the case
-		 * is handled as an exception and a warning is thrown.
-		 */
+		/* This happens when rx buffer size is underestimated */
 		if (unlikely(num_buf > 1)) {
-			bpf_warn_invalid_xdp_buffer();
-
 			/* linearize data for XDP */
 			xdp_page = xdp_linearize_page(rq, num_buf,
 						      page, offset, &len);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 0/9] several fixups for virtio-net XDP
From: Jason Wang @ 2016-12-23 14:37 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.r.fastabend

Merry Xmas and a Happy New year to all:

This series tries to fixes several issues for virtio-net XDP which
could be categorized into several parts:

- fix several issues during XDP linearizing
- allow csumed packet to work for XDP_PASS
- make EWMA rxbuf size estimation works for XDP
- forbid XDP when GUEST_UFO is support
- remove big packet XDP support
- add XDP support or small buffer

Please see individual patches for details.

Thanks

Jason Wang (9):
  virtio-net: remove the warning before XDP linearizing
  virtio-net: correctly xmit linearized page on XDP_TX
  virtio-net: fix page miscount during XDP linearizing
  virtio-net: correctly handle XDP_PASS for linearized packets
  virtio-net: unbreak csumed packets for XDP_PASS
  virtio-net: make rx buf size estimation works for XDP
  virtio-net: forbid XDP when VIRTIO_NET_F_GUEST_UFO is support
  virtio-net: remove big packet XDP codes
  virtio-net: XDP support for small buffers

 drivers/net/virtio_net.c | 172 ++++++++++++++++++++++++++++-------------------
 1 file changed, 102 insertions(+), 70 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCHv2 net] netfilter: check duplicate config when initializing in ipt_CLUSTERIP
From: Pablo Neira Ayuso @ 2016-12-23 14:18 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, netfilter-devel, davem, Marcelo Ricardo Leitner
In-Reply-To: <b96e06fc4b0bc50a3c39dffebbcd43685d2e2a0b.1482232474.git.lucien.xin@gmail.com>

On Tue, Dec 20, 2016 at 07:14:34PM +0800, Xin Long wrote:
> Now when adding an ipt_CLUSTERIP rule, it only checks duplicate config in
> clusterip_config_find_get(). But after that, there may be still another
> thread to insert a config with the same ip, then it leaves proc_create_data
> to do duplicate check.
> 
> It's more reasonable to check duplicate config by ipt_CLUSTERIP itself,
> instead of checking it by proc fs duplicate file check. Before, when proc
> fs allowed duplicate name files in a directory, It could even crash kernel
> because of use-after-free.
> 
> This patch is to check duplicate config under the protection of clusterip
> net lock when initializing a new config and correct the return err.
> 
> Note that it also moves proc file node creation after adding new config, as
> proc_create_data may sleep, it couldn't be called under the clusterip_net
> lock. clusterip_config_find_get returns NULL if c->pde is null to make sure
> it can't be used until the proc file node creation is done.

Applied, thanks.

^ permalink raw reply

* Re: George's crazy full state idea (Re: HalfSipHash Acceptable Usage)
From: Hannes Frederic Sowa @ 2016-12-23 12:05 UTC (permalink / raw)
  To: George Spelvin, luto
  Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <20161223000716.5768.qmail@ns.sciencehorizons.net>

On Thu, 2016-12-22 at 19:07 -0500, George Spelvin wrote:
> Hannes Frederic Sowa wrote:
> > A lockdep test should still be done. ;)
> 
> Adding might_lock() annotations will improve coverage a lot.

Might be hard to find the correct lock we take later down the code
path, but if that is possible, certainly.

> > Yes, that does look nice indeed. Accounting for bits instead of bytes
> > shouldn't be a huge problem either. Maybe it gets a bit more verbose in
> > case you can't satisfy a request with one batched entropy block and have
> > to consume randomness from two.
> 
> The bit granularity is also for the callers' convenience, so they don't
> have to mask again.  Whether get_random_bits rounds up to byte boundaries
> internally or not is something else.
> 
> When the current batch runs low, I was actually thinking of throwing
> away the remaining bits and computing a new batch of 512.  But it's
> whatever works best at implementation time.
> 
> > > > It could only mix the output back in every two calls, in which case
> > > > you can backtrack up to one call but you need to do 2^128 work to
> > > > backtrack farther.  But yes, this is getting excessively complicated.
> > > No, if you're willing to accept limited backtrack, this is a perfectly
> > > acceptable solution, and not too complicated.  You could do it phase-less
> > > if you like; store the previous output, then after generating the new
> > > one, mix in both.  Then overwrite the previous output.  (But doing two
> > > rounds of a crypto primtive to avoid one conditional jump is stupid,
> > > so forget that.)
> > Can you quickly explain why we lose the backtracking capability?
> 
> Sure.  An RNG is (state[i], output[i]) = f(state[i-1]).  The goal of
> backtracking is to compute output[i], or better yet state[i-1], given
> state[i].
> 
> For example, consider an OFB or CTR mode generator.  The state is a key
> and and IV, and you encrypt the IV with the key to produce output, then
> either replace the IV with the output, or increment it.  Either way,
> since you still have the key, you can invert the transformation and
> recover the previous IV.
> 
> The standard way around this is to use the Davies-Meyer construction:
> 
> IV[i] = IV[i-1] + E(IV[i-1], key)
> 
> This is the standard way to make a non-invertible random function
> out of an invertible random permutation.
> 
> From the sum, there's no easy way to find the ciphertext *or* the
> plaintext that was encrypted.  Assuming the encryption is secure,
> the only way to reverse it is brute force: guess IV[i-1] and run the
> operation forward to see if the resultant IV[i] matches.
> 
> There are a variety of ways to organize this computation, since the
> guess gives toy both IV[i-1] and E(IV[i-1], key) = IV[i] - IV[i-1], including
> running E forward, backward, or starting from both ends to see if you
> meet in the middle.
> 
> The way you add the encryption output to the IV is not very important.
> It can be addition, xor, or some more complex invertible transformation.
> In the case of SipHash, the "encryption" output is smaller than the
> input, so we have to get a bit more creative, but it's still basically
> the same thing.
> 
> The problem is that the output which is combined with the IV is too small.
> With only 64 bits, trying all possible values is practical.  (The world's
> Bitcoin miners are collectively computing SHA-256(SHA-256(input)) 1.7 * 2^64
> times per second.)
> 
> By basically doing two iterations at once and mixing in 128 bits of
> output, the guessing attack is rendered impractical.  The only downside
> is that you need to remember and store one result between when it's
> computed and last used.  This is part of the state, so an attack can
> find output[i-1], but not anything farther back.

Thanks a lot for the explanation!

> > ChaCha as a block cipher gives a "perfect" permutation from the output
> > of either the CRNG or the CPRNG, which actually itself has backtracking
> > protection.
> 
> I'm not quite understanding.  The /dev/random implementation uses some
> of the ChaCha output as a new ChaCha key (that's another way to mix output
> back into the state) to prevent backtracking.  But this slows it down, and
> again if you want to be efficient, you're generating and storing large batches
> of entropy and storing it in the RNG state.

I was actually referring to the anti-backtrack protection in
/dev/random and also /dev/urandom, from where we reseed every 300
seconds and if our batched entropy runs low with Ted's/Jason's current
patch for get_random_int.

As far as I can understand it, backtracking is not a problem in case of
a reseed event inside extract_crng.

When we hit the chacha20 without doing a reseed we only mutate the
state of chacha, but being an invertible function in its own, a
proposal would be to mix parts of the chacha20 output back into the
state, which, as a result, would cause slowdown because we couldn't
propagate the complete output of the cipher back to the caller (looking
at the function _extract_crng).

Or are you referring that the anti-backtrack protection should happen
in every call from get_random_int?

Thanks,
Hannes

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 11:59 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <1482490762.3353.2.camel@stressinduktion.org>

On 12/23/2016 11:59 AM, Hannes Frederic Sowa wrote:
> On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
>> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
>>> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
[...]
>>> The hashing is not a proper sha1 neither, unfortunately. I think that
>>> is why it will have a custom implementation in iproute2?
>>
>> Still trying to catch up on this admittedly bit confusing thread. I
>> did run automated tests over couple of days comparing the data I got
>> from fdinfo with the one from af_alg and found no mismatch on the test
>> cases varying from min to max possible program sizes. In the process
>> of testing, as you might have seen on netdev, I found couple of other
>> bugs in bpf code along the way and fixed them up as well. So my question,
>> do you or Andy or anyone participating in claiming this have any
>> concrete data or test cases that suggests something different? If yes,
>> I'm very curious to hear about it and willing fix it up, of course.
>> When I'm back from pto I'll prep and cook up my test suite to be
>> included into the selftests/bpf/, should have done this initially,
>> sorry about that. I'll also post something to expose the alg, that
>> sounds fine to me.
>
> Looking into your code closer, I noticed that you indeed seem to do the
> finalization of sha-1 by hand by aligning and padding the buffer
> accordingly and also patching in the necessary payload length.
>
> Apologies for my side for claiming that this is not correct sha1
> output, I was only looking at sha_transform and its implementation and
> couldn't see the padding and finalization round with embedding the data
> length in there and hadn't thought of it being done manually.
>
> Anyway, is it difficult to get the sha finalization into some common
> code library? It is not very bpf specific and crypto code reviewers
> won't find it there at all.

Yes, sure, I'll rework it that way (early next year when I'm back if
that's fine with you).

Thanks,
Daniel

^ permalink raw reply

* [ANNOUNCE] ipvsadm release v1.29
From: Jesper Dangaard Brouer @ 2016-12-23 11:03 UTC (permalink / raw)
  To: lvs-devel, lvs-users
  Cc: brouer, netdev, linux-kernel, Julian Anastasov, Ryan O'Hara,
	Quentin Armitage, Simon Horman, Wensong Zhang, Alex Gartrell


We are happy to announce the release of ipvsadm v1.29.

 ipvsadm is a utility to administer the kernels IPVS/LVS load-balancer service

It has been far too long since the last ipvsadm release. Even-though
only two changes to the ipvsadm tool happened since last release, a
release must be made as these feature relates to kernel side features.

Support for reading 64-bit stats is avail since kernel v4.1.  The new
attributes for sync daemon got introduced in kernel v4.3, but got
fixed in kernel v4.7.

Merry Xmas and a Happy New year to all :-)


This release is based on the kernel.org git tree:
  https://git.kernel.org/cgit/utils/kernel/ipvsadm/ipvsadm.git/

You can download the tarballs from:
 https://kernel.org/pub/linux/utils/kernel/ipvsadm/

Git tree:
 git://git.kernel.org/pub/scm/utils/kernel/ipvsadm/ipvsadm.git

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Shortlog:

Jesper Dangaard Brouer (1):
      Release: Version 1.29

Julian Anastasov (2):
      ipvsadm: support 64-bit stats and rates
      ipvsadm: new attributes for sync daemon

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Hannes Frederic Sowa @ 2016-12-23 10:59 UTC (permalink / raw)
  To: Daniel Borkmann, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <585CF6A3.1050107@iogearbox.net>

On Fri, 2016-12-23 at 11:04 +0100, Daniel Borkmann wrote:
> On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
> > On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
> > > On Thu, Dec 22, 2016 at 7:51 AM, Hannes Frederic Sowa
> > > <hannes@stressinduktion.org> wrote:
> > > > On Thu, 2016-12-22 at 16:41 +0100, Jason A. Donenfeld wrote:
> > > > > On Thu, Dec 22, 2016 at 4:33 PM, Hannes Frederic Sowa
> > > > > <hannes@stressinduktion.org> wrote:
> > > > > > IPv6 you cannot touch anymore. The hashing algorithm is part of uAPI.
> > > > > > You don't want to give people new IPv6 addresses with the same stable
> > > > > > secret (across reboots) after a kernel upgrade. Maybe they lose
> > > > > > connectivity then and it is extra work?
> > > > > 
> > > > > Ahh, too bad. So it goes.
> > > > 
> > > > If no other users survive we can put it into the ipv6 module.
> > > > 
> > > > > > The bpf hash stuff can be changed during this merge window, as it is
> > > > > > not yet in a released kernel. Albeit I would probably have preferred
> > > > > > something like sha256 here, which can be easily replicated by user
> > > > > > space tools (minus the problem of patching out references to not
> > > > > > hashable data, which must be zeroed).
> > > > > 
> > > > > Oh, interesting, so time is of the essence then. Do you want to handle
> > > > > changing the new eBPF code to something not-SHA1 before it's too late,
> > > > > as part of a ne
> > > 
> > > w patchset that can fast track itself to David? And
> > > > > then I can preserve my large series for the next merge window.
> > > > 
> > > > This algorithm should be a non-seeded algorithm, because the hashes
> > > > should be stable and verifiable by user space tooling. Thus this would
> > > > need a hashing algorithm that is hardened against pre-image
> > > > attacks/collision resistance, which siphash is not. I would prefer some
> > > > higher order SHA algorithm for that actually.
> > > > 
> > > 
> > > You mean:
> > > 
> > > commit 7bd509e311f408f7a5132fcdde2069af65fa05ae
> > > Author: Daniel Borkmann <daniel@iogearbox.net>
> > > Date:   Sun Dec 4 23:19:41 2016 +0100
> > > 
> > >      bpf: add prog_digest and expose it via fdinfo/netlink
> > > 
> > > 
> > > Yes, please!  This actually matters for security -- imagine a
> > > malicious program brute-forcing a collision so that it gets loaded
> > > wrong.  And this is IMO a use case for SHA-256 or SHA-512/256
> > > (preferably the latter).  Speed basically doesn't matter here and
> > > Blake2 is both less stable (didn't they slightly change it recently?)
> > > and much less well studied.
> > 
> > We don't prevent ebpf programs being loaded based on the digest but
> > just to uniquely identify loaded programs from user space and match up
> > with their source.
> > 
> > There have been talks about signing bpf programs, thus this would
> > probably need another digest algorithm additionally to that one,
> > wasting probably instructions. Probably going somewhere in direction of
> > PKCS#7 might be the thing to do (which leads to the problem to make
> > PKCS#7 attackable by ordinary unpriv users, hmpf).
> > 
> > > My inclination would have been to seed them with something that isn't
> > > exposed to userspace for the precise reason that it would prevent user
> > > code from making assumptions about what's in the hash.  But if there's
> > > a use case for why user code needs to be able to calculate the hash on
> > > its own, then that's fine.  But perhaps the actual fdinfo string
> > > should be "sha256:abcd1234..." to give some flexibility down the road.
> > > 
> > > Also:
> > > 
> > > +       result = (__force __be32 *)fp->digest;
> > > +       for (i = 0; i < SHA_DIGEST_WORDS; i++)
> > > +               result[i] = cpu_to_be32(fp->digest[i]);
> > > 
> > > Everyone, please, please, please don't open-code crypto primitives.
> > > Is this and the code above it even correct?  It might be but on a very
> > > brief glance it looks wrong to me.  If you're doing this to avoid
> > > depending on crypto, then fix crypto so you can pull in the algorithm
> > > without pulling in the whole crypto core.
> > 
> > The hashing is not a proper sha1 neither, unfortunately. I think that
> > is why it will have a custom implementation in iproute2?
> 
> Still trying to catch up on this admittedly bit confusing thread. I
> did run automated tests over couple of days comparing the data I got
> from fdinfo with the one from af_alg and found no mismatch on the test
> cases varying from min to max possible program sizes. In the process
> of testing, as you might have seen on netdev, I found couple of other
> bugs in bpf code along the way and fixed them up as well. So my question,
> do you or Andy or anyone participating in claiming this have any
> concrete data or test cases that suggests something different? If yes,
> I'm very curious to hear about it and willing fix it up, of course.
> When I'm back from pto I'll prep and cook up my test suite to be
> included into the selftests/bpf/, should have done this initially,
> sorry about that. I'll also post something to expose the alg, that
> sounds fine to me.

Looking into your code closer, I noticed that you indeed seem to do the
finalization of sha-1 by hand by aligning and padding the buffer
accordingly and also patching in the necessary payload length.

Apologies for my side for claiming that this is not correct sha1
output, I was only looking at sha_transform and its implementation and
couldn't see the padding and finalization round with embedding the data
length in there and hadn't thought of it being done manually.

Anyway, is it difficult to get the sha finalization into some common
code library? It is not very bpf specific and crypto code reviewers
won't find it there at all.

Thanks,
Hannes

^ permalink raw reply

* [PATCH iproute2] fix typo in ip-xfrm man page, rmd610 -> rmd160
From: Alexey Kodanev @ 2016-12-23 11:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Vasily Isaenko, Alexey Kodanev

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
---
 man/man8/ip-xfrm.8 |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/man/man8/ip-xfrm.8 b/man/man8/ip-xfrm.8
index 11f7104..a0bbef5 100644
--- a/man/man8/ip-xfrm.8
+++ b/man/man8/ip-xfrm.8
@@ -477,7 +477,7 @@ Encryption algorithms include
 
 Authentication algorithms include
 .BR digest_null ", " hmac(md5) ", " hmac(sha1) ", " hmac(sha256) ","
-.BR hmac(sha384) ", " hmac(sha512) ", " hmac(rmd610) ", and " xcbc(aes) "."
+.BR hmac(sha384) ", " hmac(sha512) ", " hmac(rmd160) ", and " xcbc(aes) "."
 
 Authenticated encryption with associated data (AEAD) algorithms include
 .BR rfc4106(gcm(aes)) ", " rfc4309(ccm(aes)) ", and " rfc4543(gcm(aes)) "."
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next 1/3] net:dsa:mv88e6xxx: use hashtable to store multicast entries
From: Andrew Lunn @ 2016-12-23 10:17 UTC (permalink / raw)
  To: Volodymyr Bendiuga
  Cc: Vivien Didelot, Florian Fainelli, Volodymyr Bendiuga, netdev,
	Volodymyr Bendiuga, John Crispin
In-Reply-To: <5CB44DFD-2807-4FD8-B409-FD28A53DE8B6@gmail.com>

On Fri, Dec 23, 2016 at 10:30:27AM +0100, Volodymyr Bendiuga wrote:
> Hi Andrew,
> Here is the program I promised you.

Hi Volodymyr

Thanks for this. I will try it out beginning of January. I don't have
access to the hardware at the moment.

       Thanks
		Andrew

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 10:23 UTC (permalink / raw)
  To: Andy Lutomirski, Hannes Frederic Sowa
  Cc: Alexei Starovoitov, Jason A. Donenfeld,
	kernel-hardening@lists.openwall.com, Theodore Ts'o, Netdev,
	LKML, Linux Crypto Mailing List, David Laight, Eric Dumazet,
	Linus Torvalds, Eric Biggers, Tom Herbert, Andi Kleen,
	David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <CALCETrW_T1v4qKPJDs5dXwAnAit3M52AWMH-K+GJLb1WoLMuRQ@mail.gmail.com>

On 12/22/2016 06:25 PM, Andy Lutomirski wrote:
> On Thu, Dec 22, 2016 at 8:59 AM, Hannes Frederic Sowa
[...]
>> I wondered if bpf program loading should have used the module loading
>> infrastructure from the beginning...
>
> That would be way too complicated and would be nasty for the unprivileged cases.

Also, there are users be it privileged or not that don't require to have a full
obj loader from user space, but are fine with just hard-coding parts or all of
the insns in their application. Back then we settled with using fds based on
Andy's suggestion, it has both ups and downs as we saw along the way but worked
okay thus far.

^ permalink raw reply

* [PATCH v3] stmmac: CSR clock configuration fix
From: Joao Pinto @ 2016-12-23 10:15 UTC (permalink / raw)
  To: peppe.cavallaro, davem, seraphin.bonnaffe
  Cc: hock.leong.kweh, niklas.cassel, pavel, linux-kernel, preid,
	netdev, Joao Pinto

When testing stmmac with my QoS reference design I checked a problem in the
CSR clock configuration that was impossibilitating the phy discovery, since
every read operation returned 0x0000ffff. This patch fixes the issue.

Signed-off-by: Joao Pinto <jpinto@synopsys.com>
---
changes v2->v3 (Phil Reid)
- Altera uses the reserved bit 5 also for CR clock, so the mask was changed
changes v1->v2 (David Miller)
- DWMAC100 and DWMAC1000 csr clocks masks should also be fixed for the patch
to make sense

 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c  | 2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c    | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index b21d03f..be3c91c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -539,7 +539,7 @@ struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
 	mac->mii.reg_shift = 6;
 	mac->mii.reg_mask = 0x000007C0;
 	mac->mii.clk_csr_shift = 2;
-	mac->mii.clk_csr_mask = 0xF;
+	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
 	/* Get and dump the chip ID */
 	*synopsys_id = stmmac_get_synopsys_id(hwid);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
index a1d582f..9dd2987 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
@@ -197,7 +197,7 @@ struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id)
 	mac->mii.reg_shift = 6;
 	mac->mii.reg_mask = 0x000007C0;
 	mac->mii.clk_csr_shift = 2;
-	mac->mii.clk_csr_mask = 0xF;
+	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
 	/* Synopsys Id is not available on old chips */
 	*synopsys_id = 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 23322fd..fda01f7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -81,8 +81,8 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
-	value |= (priv->clk_csr & priv->hw->mii.clk_csr_mask)
-		<< priv->hw->mii.clk_csr_shift;
+	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+		& priv->hw->mii.clk_csr_mask;
 	if (priv->plat->has_gmac4)
 		value |= MII_GMAC4_READ;
 
@@ -122,8 +122,8 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
 
-	value |= ((priv->clk_csr & priv->hw->mii.clk_csr_mask)
-		<< priv->hw->mii.clk_csr_shift);
+	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+		& priv->hw->mii.clk_csr_mask;
 	if (priv->plat->has_gmac4)
 		value |= MII_GMAC4_WRITE;
 
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH v2] stmmac: CSR clock configuration fix
From: Joao Pinto @ 2016-12-23 10:09 UTC (permalink / raw)
  To: Phil Reid, Joao Pinto, peppe.cavallaro, davem, seraphin.bonnaffe
  Cc: hock.leong.kweh, niklas.cassel, pavel, linux-kernel, netdev
In-Reply-To: <bc49326e-15ff-cbb3-6117-9fb4b96cc281@electromag.com.au>


Hello Phil,

Às 1:09 AM de 12/23/2016, Phil Reid escreveu:
> G'day Joao,
> On 23/12/2016 01:06, Joao Pinto wrote:
>> Às 4:57 PM de 12/22/2016, Phil Reid escreveu:
>>> On 22/12/2016 23:47, Joao Pinto wrote:
>>>>
>>>> Hello Phil,
>>>>
>>>> Às 3:42 PM de 12/22/2016, Phil Reid escreveu:
>>>>> G'day Joao,
>>>>>
>>>>> On 22/12/2016 20:38, Joao Pinto wrote:
>>>>>> When testing stmmac with my QoS reference design I checked a problem in the
>>>>>> CSR clock configuration that was impossibilitating the phy discovery, since
>>>>>> every read operation returned 0x0000ffff. This patch fixes the issue.
>>>>>>
>>>>>> Signed-off-by: Joao Pinto <jpinto@synopsys.com>
>>>>>> ---
>>>>>> changes v1->v2 (David Miller)
>>>>>> - DWMAC100 and DWMAC1000 csr clocks masks should also be fixed for the patch
>>>>>> to make sense
>>>>>>
>>>>>>  drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +-
>>>>>>  drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c  | 2 +-
>>>>>>  drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c    | 8 ++++----
>>>>>>  3 files changed, 6 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
>>>>>> b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
>>>>>> index b21d03f..94223c8 100644
>>>>>> --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
>>>>>> +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
>>>>>> @@ -539,7 +539,7 @@ struct mac_device_info *dwmac1000_setup(void __iomem
>>>>>> *ioaddr, int mcbins,
>>>>>>      mac->mii.reg_shift = 6;
>>>>>>      mac->mii.reg_mask = 0x000007C0;
>>>>>>      mac->mii.clk_csr_shift = 2;
>>>>>> -    mac->mii.clk_csr_mask = 0xF;
>>>>>> +    mac->mii.clk_csr_mask = GENMASK(4, 2);
>>>>>
>>>>> Should this not be GENMASK(5,2)
>>>>
>>>> According to Universal MAC databook (valid for MAC100 and MAC1000), we have:
>>>>
>>>> Bits: 4:2
>>>> 000 60-100 MHz clk_csr_i/42
>>>> 001 100-150 MHz clk_csr_i/62
>>>> 010 20-35 MHz clk_csr_i/16
>>>> 011 35-60 MHz clk_csr_i/26
>>>> 100 150-250 MHz clk_csr_i/102
>>>> 101 250-300 MHz clk_csr_i/124
>>>> 110, 111 Reserved
>>>>
>>>> So only bits 2, 3 and 4 should be masked.
>>>>
>>>> Thanks.
>>>>
>>> But this is a change in behaviour from what was there isn't.
>>> Previous mask was 4 bits. now it's 3.
>>>
>>> And for example the altera socfgpa implementation in the cyclone V has valid
>>> values
>>> for values 0x8-0xf, using bit 5:2.
>>
>> According to the databook, bit 5 is reserved and RO. When reserved, it is
>> possible to customize. Is that the case? If there is hardware using the 5th bit
>> we can put it GENMASK(5, 2) with no problem.
>>
> I've also checked the Aria 10 documentation and bit 5 is also RW.
> The following options are documented and supported
>     1000: CSR clock/4
>     1001: CSR clock/6
>     1010: CSR clock/8
>     1011: CSR clock/10
>     1100: CSR clock/12
>     1101: CSR clock/14
>     1110: CSR clock/16
>     1111: CSR clock/18
> They do mention that these values will probably be outside the IEEE 802.3
> specified range.
> 
> But there's at least a couple of cores out there where GENMASK(5,2) is valid.
> Can't say if anyone is using that setting thou.

Thanks for checking it! Ok, it seems like they are using the reserved bit 5. No
problem, I am going to change the patch and put the mask from 2 to 5. Thanks for
your help!

Joao

> 
> 

^ permalink raw reply

* Re: BPF hash algo (Re: [kernel-hardening] Re: [PATCH v7 3/6] random: use SipHash in place of MD5)
From: Daniel Borkmann @ 2016-12-23 10:04 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Andy Lutomirski, Alexei Starovoitov
  Cc: Jason A. Donenfeld, kernel-hardening@lists.openwall.com,
	Theodore Ts'o, Netdev, LKML, Linux Crypto Mailing List,
	David Laight, Eric Dumazet, Linus Torvalds, Eric Biggers,
	Tom Herbert, Andi Kleen, David S. Miller, Jean-Philippe Aumasson
In-Reply-To: <1482425969.2673.5.camel@stressinduktion.org>

On 12/22/2016 05:59 PM, Hannes Frederic Sowa wrote:
> On Thu, 2016-12-22 at 08:07 -0800, Andy Lutomirski wrote:
>> On Thu, Dec 22, 2016 at 7:51 AM, Hannes Frederic Sowa
>> <hannes@stressinduktion.org> wrote:
>>> On Thu, 2016-12-22 at 16:41 +0100, Jason A. Donenfeld wrote:
>>>> On Thu, Dec 22, 2016 at 4:33 PM, Hannes Frederic Sowa
>>>> <hannes@stressinduktion.org> wrote:
>>>>> IPv6 you cannot touch anymore. The hashing algorithm is part of uAPI.
>>>>> You don't want to give people new IPv6 addresses with the same stable
>>>>> secret (across reboots) after a kernel upgrade. Maybe they lose
>>>>> connectivity then and it is extra work?
>>>>
>>>> Ahh, too bad. So it goes.
>>>
>>> If no other users survive we can put it into the ipv6 module.
>>>
>>>>> The bpf hash stuff can be changed during this merge window, as it is
>>>>> not yet in a released kernel. Albeit I would probably have preferred
>>>>> something like sha256 here, which can be easily replicated by user
>>>>> space tools (minus the problem of patching out references to not
>>>>> hashable data, which must be zeroed).
>>>>
>>>> Oh, interesting, so time is of the essence then. Do you want to handle
>>>> changing the new eBPF code to something not-SHA1 before it's too late,
>>>> as part of a ne
>>
>> w patchset that can fast track itself to David? And
>>>> then I can preserve my large series for the next merge window.
>>>
>>> This algorithm should be a non-seeded algorithm, because the hashes
>>> should be stable and verifiable by user space tooling. Thus this would
>>> need a hashing algorithm that is hardened against pre-image
>>> attacks/collision resistance, which siphash is not. I would prefer some
>>> higher order SHA algorithm for that actually.
>>>
>>
>> You mean:
>>
>> commit 7bd509e311f408f7a5132fcdde2069af65fa05ae
>> Author: Daniel Borkmann <daniel@iogearbox.net>
>> Date:   Sun Dec 4 23:19:41 2016 +0100
>>
>>      bpf: add prog_digest and expose it via fdinfo/netlink
>>
>>
>> Yes, please!  This actually matters for security -- imagine a
>> malicious program brute-forcing a collision so that it gets loaded
>> wrong.  And this is IMO a use case for SHA-256 or SHA-512/256
>> (preferably the latter).  Speed basically doesn't matter here and
>> Blake2 is both less stable (didn't they slightly change it recently?)
>> and much less well studied.
>
> We don't prevent ebpf programs being loaded based on the digest but
> just to uniquely identify loaded programs from user space and match up
> with their source.
>
> There have been talks about signing bpf programs, thus this would
> probably need another digest algorithm additionally to that one,
> wasting probably instructions. Probably going somewhere in direction of
> PKCS#7 might be the thing to do (which leads to the problem to make
> PKCS#7 attackable by ordinary unpriv users, hmpf).
>
>> My inclination would have been to seed them with something that isn't
>> exposed to userspace for the precise reason that it would prevent user
>> code from making assumptions about what's in the hash.  But if there's
>> a use case for why user code needs to be able to calculate the hash on
>> its own, then that's fine.  But perhaps the actual fdinfo string
>> should be "sha256:abcd1234..." to give some flexibility down the road.
>>
>> Also:
>>
>> +       result = (__force __be32 *)fp->digest;
>> +       for (i = 0; i < SHA_DIGEST_WORDS; i++)
>> +               result[i] = cpu_to_be32(fp->digest[i]);
>>
>> Everyone, please, please, please don't open-code crypto primitives.
>> Is this and the code above it even correct?  It might be but on a very
>> brief glance it looks wrong to me.  If you're doing this to avoid
>> depending on crypto, then fix crypto so you can pull in the algorithm
>> without pulling in the whole crypto core.
>
> The hashing is not a proper sha1 neither, unfortunately. I think that
> is why it will have a custom implementation in iproute2?

Still trying to catch up on this admittedly bit confusing thread. I
did run automated tests over couple of days comparing the data I got
from fdinfo with the one from af_alg and found no mismatch on the test
cases varying from min to max possible program sizes. In the process
of testing, as you might have seen on netdev, I found couple of other
bugs in bpf code along the way and fixed them up as well. So my question,
do you or Andy or anyone participating in claiming this have any
concrete data or test cases that suggests something different? If yes,
I'm very curious to hear about it and willing fix it up, of course.
When I'm back from pto I'll prep and cook up my test suite to be
included into the selftests/bpf/, should have done this initially,
sorry about that. I'll also post something to expose the alg, that
sounds fine to me.

> I wondered if bpf program loading should have used the module loading
> infrastructure from the beginning...
>
>> At the very least, there should be a separate function that calculates
>> the hash of a buffer and that function should explicitly run itself
>> against test vectors of various lengths to make sure that it's
>> calculating what it claims to be calculating.  And it doesn't look
>> like the userspace code has landed, so, if this thing isn't
>> calculating SHA1 correctly, it's plausible that no one has noticed.
>
> I hope this was known from the beginning, this is not sha1 unfortunately.
>
> But ebpf elf programs also need preprocessing to get rid of some
> embedded load-depending data, so maybe it was considered to be just
> enough?
>
> Bye,
> Hannes
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH net-next 1/3] net:dsa:mv88e6xxx: use hashtable to store multicast entries
From: Volodymyr Bendiuga @ 2016-12-23  9:30 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Vivien Didelot, Florian Fainelli, Volodymyr Bendiuga, netdev,
	Volodymyr Bendiuga, John Crispin
In-Reply-To: <20161216110115.GC20359@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 1052 bytes --]

Hi Andrew,
Here is the program I promised you.
There is a .c and Makefile attached to this mail. Simply type ”make” to build it.
There is a dependency on libnl3, which needs to be installed. If you don’t have it
Just install it: "apt-get install libnl-3-dev libnl-route-3-dev” if you use ubuntu based
Linux.

What the program does is it creates a libnl cache, inserts into it some hardcoded
Multicast entries, and then adds entries to the kernel (to the bridge and to hardware).
By default the program looks for interfaces that have ”eth” in their names. If your interfaces
Have different names, just correct that line in the code. Otherwise it should just run straight away.
When entries are added the timing is presented. By default the program uses vlan id 1. If you need
To override it, just pass "-v 2” or whatever vlan id you wish.
Please try running the program with hash table patch and without. You should see a significant difference.

Let me know if there is something you need help with.

Thanks,
Volodymyr

[-- Attachment #2: Makefile --]
[-- Type: application/octet-stream, Size: 326 bytes --]


EXEC      = fdbtest
OBJS      = fdbtest.o 
SRCS      = $(OBJS:.o=.c)
DEPS      = $(SRCS:.c=.d)
CFLAGS   += `pkg-config --cflags libnl-route-3.0`
DEPLIBS   = `pkg-config --libs libnl-route-3.0`
LDLIBS   += $(DEPLIBS)

all: Makefile $(EXEC)

$(EXEC): $(OBJS)

clean:
	-@$(RM) $(OBJS) $(EXEC)

distclean: clean
	-@$(RM) $(DEPS)

[-- Attachment #3: fdbtest.c --]
[-- Type: application/octet-stream, Size: 6045 bytes --]

/* 
 * FDB test - insert preprogrammed fdb entries
 * into HW database and measure the time it takes
 * to do the job.
 *
 * Author: Volodymyr Bendiuga <volodymyr.bendiuga@gmail.com>
 */

#include <net/if.h>
#include <netinet/ether.h>
#include <unistd.h>
#include <time.h>

#define _LINUX_IF_H
#include <linux/if_link.h>
#include <netlink/netlink.h>
#include <netlink/route/link.h>
#include <netlink/route/neighbour.h>
#include <netlink/route/link/bridge.h>

struct cb_data {
	int err;
	int vid;
	struct nl_sock *sk;
	void *arg;
};

static void __add(struct nl_object *obj, void *data)
{
	struct cb_data *cbd = data;
	struct nl_sock *sk;
	struct rtnl_neigh *neigh = (struct rtnl_neigh *)obj;

	sk = cbd->sk;
	if (cbd->err)
		return;

        cbd->err = rtnl_neigh_add(sk, neigh, NLM_F_CREATE);
}

static int __insert(struct nl_cache *cache, int ifindex, unsigned char *mac, int vlan)
{
	struct rtnl_neigh *neigh;
	struct nl_addr *addr;
	int err = 0;

        neigh = rtnl_neigh_alloc();
        if (!neigh) {
                printf("Could not allocate netlink neighbour\n");
                return -NLE_NOMEM;
        }

        addr = nl_addr_alloc(ETH_ALEN);
        if (!addr) {
		printf("Could not allocate netlink address\n");
		err = -NLE_NOMEM;
		goto err_put_neigh;
        }

	nl_addr_set_family(addr, AF_LLC);
	nl_addr_set_prefixlen(addr, 48);
        if (nl_addr_set_binary_addr(addr, mac, ETH_ALEN) < 0) {
		printf("Could not set netlink address\n");
		err = -NLE_FAILURE;
		goto err_put_addr;
	}

	rtnl_neigh_set_family(neigh, PF_BRIDGE);
	rtnl_neigh_set_lladdr(neigh, addr);
	rtnl_neigh_set_flags(neigh, NTF_SELF);
	rtnl_neigh_set_state(neigh, NUD_NOARP);
	rtnl_neigh_set_ifindex(neigh, ifindex);
	rtnl_neigh_set_vlan(neigh, vlan);

	err = nl_cache_add(cache, (struct nl_object*)neigh);
	if (err)
		printf("Could not add object to cache, err -> %d", err);

err_put_addr:
	nl_addr_put(addr);
err_put_neigh:
	rtnl_neigh_put(neigh);

	return err;
}

static int __insert_in_vlan(struct nl_cache *cache, char* ifname, unsigned char *mac, int vid)
{
       int ifindex;
       int err = 0;

       ifindex = if_nametoindex(ifname);
       err = __insert(cache, ifindex, mac, vid);

       return err;
}

static int __insert_default_mac(struct nl_cache *cache, char* ifname, int vid)
{
	int err = 0;
	size_t i;
        unsigned char mac[][ETH_ALEN] = {
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 }, /* 224.0.0.1 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x02 }, /* 224.0.0.2 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x04 }, /* 224.0.0.4   - DVMRP */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x05 }, /* 224.0.0.5   - OSPF */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x06 }, /* 224.0.0.6   - OSPF */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x09 }, /* 224.0.0.9   - RIPv2 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x0a }, /* 224.0.0.10  - IGRP */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x0c }, /* 224.0.0.12  - DHCP Server/Relay */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x0d }, /* 224.0.0.13  - PIM */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x0e }, /* 224.0.0.14  - RSVP */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x12 }, /* 224.0.0.18  - VRRP*/
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x13 }, /* 224.0.0.19 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x14 }, /* 224.0.0.20 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x15 }, /* 224.0.0.21 */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x16 }, /* 224.0.0.22  - IGMP */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x18 }, /* 224.0.0.24  - OSPF */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x66 }, /* 224.0.0.102 - HSRP */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x6b }, /* 224.0.0.107 - PTP-pdelay */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb }, /* 224.0.0.251 - mDNS */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfc }, /* 224.0.0.252 - Link-local multicast DNS */
                { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfe }  /* 224.0.0.254 */
        };

	for (i = 0; i < sizeof(mac) / sizeof(mac[0]); i++)
		__insert_in_vlan(cache, ifname, mac[i], vid);

	return err;
}

static void __add_fdb_defaults(struct nl_object *obj, void *data)
{
	struct cb_data *cbd = data;
	struct nl_cache *fdb_cache = cbd->arg;
	struct rtnl_link *link = nl_object_priv(obj);
	char *ifname;

	ifname = rtnl_link_get_name(link);
	if (!strncmp(ifname, "eth", 3)) {
		printf("Using interface -> %s\n", ifname);
		cbd->err = __insert_default_mac(fdb_cache, ifname, cbd->vid);
	}
	else
		printf("Skipping interface -> %s\n", ifname);
}

static int usage(int code)
{
	fprintf(stderr, "\nUsage: fdbtest [-h] [-v vid]\n\n"
		"  -h         Show summary of command line options and exit\n"
		"  -v vid     Set vlan id\n"
		"\n");

	return code;
}

int main(int argc , char *argv[])
{
	struct nl_cache *link_cache, *neigh_cache;
	struct nl_sock *nlsk;
	struct cb_data cbd = {.err = 0, .vid = 1 };
	clock_t start, end;
	double time_used;
	int err = 0;
	int c;

	while ((c = getopt(argc, argv, "hv:")) != EOF) {
		switch (c) {
		case 'h':
			return usage(0);

		case 'v':
			cbd.vid = atoi(optarg);
			break;

		default:
			return usage(1);
		}
	}

	printf("Init\n");
	nlsk = nl_socket_alloc();
	if (!nlsk)
		return -NLE_NOMEM;
	err = nl_connect(nlsk, NETLINK_ROUTE);
	if (err)
		goto free_nlsk;

	err = rtnl_link_alloc_cache(nlsk, AF_UNSPEC, &link_cache);
	if (err)
		goto free_nlsk;

	err = rtnl_neigh_alloc_cache(nlsk, &neigh_cache);
	if (err)
		goto free_lcache;

	printf("Using vlan %d\n\n", cbd.vid);
	cbd.arg = neigh_cache;
	nl_cache_foreach(link_cache, __add_fdb_defaults, &cbd);
	err = cbd.err;

	cbd.sk = nlsk;
	start = clock();
	nl_cache_foreach(neigh_cache, __add, &cbd);
	end = clock();
	err = cbd.err;
	time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
	printf("\nTime spent: %f sec\n", time_used);
	
	nl_cache_free(neigh_cache);
	printf("Exit %d\n", err);

 free_lcache:
	nl_cache_free(link_cache);
 free_nlsk:
	nl_socket_free(nlsk);

	return err;
}

[-- Attachment #4: Type: text/plain, Size: 621 bytes --]


> 16 dec. 2016 kl. 12:01 skrev Andrew Lunn <andrew@lunn.ch>:
> 
> On Fri, Dec 16, 2016 at 11:26:01AM +0100, Volodymyr Bendiuga wrote:
>> Hi Andrew,
>> 
>> I don't have any script right now, the code I have is a part of
>> the OS, but I could write simple C program which represents
>> what we are doing with mc entries and send it to you for profiling.
> 
> It would be nice to have a benchmark test we can use to test out
> ideas.
> 
> Please try to make the code flexible. The slave interface names on my
> boards are probably not the same as on your. Also, the number of ports
> may differ.
> 
>    Thanks
> 	Andrew


^ permalink raw reply

* [patch net 3/3] mlxsw: spectrum_router: Correctly remove nexthop groups
From: Jiri Pirko @ 2016-12-23  8:32 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, arkadis
In-Reply-To: <1482481970-2327-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

At the end of the nexthop initialization process we determine whether
the nexthop should be offloaded or not based on the NUD state of the
neighbour representing it. After all the nexthops were initialized we
refresh the nexthop group and potentially offload it to the device, in
case some of the nexthops were resolved.

Make the destruction of a nexthop group symmetric with its creation by
marking all nexthops as invalid and then refresh the nexthop group to
make sure it was removed from the device's tables.

Fixes: b2157149b0b0 ("mlxsw: spectrum_router: Add the nexthop neigh activity update")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index a0f9742..01d0efa 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1396,6 +1396,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp,
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
 
+	__mlxsw_sp_nexthop_neigh_update(nh, true);
 	list_del(&nh->neigh_list_node);
 
 	/* If that is the last nexthop connected to that neigh, remove from
@@ -1454,6 +1455,8 @@ mlxsw_sp_nexthop_group_destroy(struct mlxsw_sp *mlxsw_sp,
 		nh = &nh_grp->nexthops[i];
 		mlxsw_sp_nexthop_fini(mlxsw_sp, nh);
 	}
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	WARN_ON_ONCE(nh_grp->adj_index_valid);
 	kfree(nh_grp);
 }
 
-- 
2.7.4

^ permalink raw reply related

* [patch net 2/3] mlxsw: spectrum_router: Don't reflect dead neighs
From: Jiri Pirko @ 2016-12-23  8:32 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, arkadis
In-Reply-To: <1482481970-2327-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

When a neighbour is considered to be dead, we should remove it from the
device's table regardless of its NUD state.

Without this patch, after setting a port to be administratively down we
get the following errors when we periodically try to update the kernel
about neighbours activity:

[  461.947268] mlxsw_spectrum 0000:03:00.0 sw1p3: Failed to find
matching neighbour for IP=192.168.100.2

Fixes: a6bf9e933daf ("mlxsw: spectrum_router: Offload neighbours based on NUD state change")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 53126bf..a0f9742 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -942,7 +942,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 	char rauht_pl[MLXSW_REG_RAUHT_LEN];
 	struct net_device *dev;
 	bool entry_connected;
-	u8 nud_state;
+	u8 nud_state, dead;
 	bool updating;
 	bool removing;
 	bool adding;
@@ -953,10 +953,11 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 	dip = ntohl(*((__be32 *) n->primary_key));
 	memcpy(neigh_entry->ha, n->ha, sizeof(neigh_entry->ha));
 	nud_state = n->nud_state;
+	dead = n->dead;
 	dev = n->dev;
 	read_unlock_bh(&n->lock);
 
-	entry_connected = nud_state & NUD_VALID;
+	entry_connected = nud_state & NUD_VALID && !dead;
 	adding = (!neigh_entry->offloaded) && entry_connected;
 	updating = neigh_entry->offloaded && entry_connected;
 	removing = neigh_entry->offloaded && !entry_connected;
@@ -1351,7 +1352,7 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 	struct net_device *dev = fib_nh->nh_dev;
 	struct neighbour *n;
-	u8 nud_state;
+	u8 nud_state, dead;
 
 	/* Take a reference of neigh here ensuring that neigh would
 	 * not be detructed before the nexthop entry is finished.
@@ -1383,8 +1384,9 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 	list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list);
 	read_lock_bh(&n->lock);
 	nud_state = n->nud_state;
+	dead = n->dead;
 	read_unlock_bh(&n->lock);
-	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID));
+	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID && !dead));
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox