Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next V5 PATCH 2/5] bpf: XDP_REDIRECT enable use of cpumap
From: Jesper Dangaard Brouer @ 2017-10-06 16:12 UTC (permalink / raw)
  To: netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, pavel.odintsov, Jason Wang,
	mchan, John Fastabend, peter.waskiewicz.jr,
	Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andy Gospodarek
In-Reply-To: <150730632837.22839.11804085686478888137.stgit@firesoul>

This patch connects cpumap to the xdp_do_redirect_map infrastructure.

Still no SKB allocation are done yet.  The XDP frames are transferred
to the other CPU, but they are simply refcnt decremented on the remote
CPU.  This served as a good benchmark for measuring the overhead of
remote refcnt decrement.  If driver page recycle cache is not
efficient then this, exposes a bottleneck in the page allocator.

A shout-out to MST's ptr_ring, which is the secret behind is being so
efficient to transfer memory pointers between CPUs, without constantly
bouncing cache-lines between CPUs.

V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.

V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
 as implementation require a separate upstream discussion.

V5:
 - Fix a maybe-uninitialized pointed out by kbuild test robot.
 - Restrict bpf-prog side access to cpumap, open when use-cases appear
 - Implement cpu_map_enqueue() as a more simple void pointer enqueue

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |   31 +++++++++-
 include/trace/events/xdp.h |   10 +++
 kernel/bpf/cpumap.c        |   22 +++++++
 kernel/bpf/verifier.c      |    8 ++-
 net/core/filter.c          |  140 +++++++++++++++++++++++++++++++++++---------
 5 files changed, 177 insertions(+), 34 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 252f4bc9eb25..36ec83343c82 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -320,6 +320,13 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
+
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 {
@@ -327,7 +334,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
-#else
+#else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -385,6 +392,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_buff *xdp,
+				  struct net_device *dev_rx)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4e16c43fba10..eb2ece96c1a2 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#define devmap_ifindex(fwd, map)				\
+	(!fwd ? 0 :						\
+	 (!map ? 0 :						\
+	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	   ((struct net_device *)fwd)->ifindex : 0)))
+
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
-	 trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
 				0, map, idx)
 
 #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err)	\
-	 trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index d3e921620097..1499eae96d30 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -480,7 +480,7 @@ struct xdp_pkt {
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -500,6 +500,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	return 0;
 }
 
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct xdp_pkt *xdp_pkt;
+	int headroom;
+
+	/* For now this is just used as a void pointer to data_hard_start.
+	 * Followup patch will generalize this.
+	 */
+	xdp_pkt = xdp->data_hard_start;
+
+	/* Fake writing into xdp_pkt->data to measure overhead */
+	headroom = xdp->data - xdp->data_hard_start;
+	if (headroom < sizeof(*xdp_pkt))
+		xdp_pkt->data = xdp->data;
+
+	bq_enqueue(rcpu, xdp_pkt);
+	return 0;
+}
+
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4cf9b72c59a0..34e3ebbc221f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1572,6 +1572,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1608,7 +1613,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_FUNC_redirect_map:
-		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_CPUMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/net/core/filter.c b/net/core/filter.c
index 9b6e7e84aafd..c02e772309d5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2521,10 +2521,36 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
 	if (err)
 		return err;
-	if (map)
+	dev->netdev_ops->ndo_xdp_flush(dev);
+	return 0;
+}
+
+static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
+			    struct bpf_map *map,
+			    struct xdp_buff *xdp,
+			    u32 index)
+{
+	int err;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		struct net_device *dev = fwd;
+
+		if (!dev->netdev_ops->ndo_xdp_xmit)
+			return -EOPNOTSUPP;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		if (err)
+			return err;
 		__dev_map_insert_ctx(map, index);
-	else
-		dev->netdev_ops->ndo_xdp_flush(dev);
+
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		struct bpf_cpu_map_entry *rcpu = fwd;
+
+		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
+		if (err)
+			return err;
+		__cpu_map_insert_ctx(map, index);
+	}
 	return 0;
 }
 
@@ -2534,11 +2560,33 @@ void xdp_do_flush_map(void)
 	struct bpf_map *map = ri->map_to_flush;
 
 	ri->map_to_flush = NULL;
-	if (map)
-		__dev_map_flush(map);
+	if (map) {
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_DEVMAP:
+			__dev_map_flush(map);
+			break;
+		case BPF_MAP_TYPE_CPUMAP:
+			__cpu_map_flush(map);
+			break;
+		default:
+			break;
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush_map);
 
+static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_CPUMAP:
+		return __cpu_map_lookup_elem(map, index);
+	default:
+		return NULL;
+	}
+}
+
 static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
 				   unsigned long aux)
 {
@@ -2551,8 +2599,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
-	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
+	void *fwd = NULL;
 	int err;
 
 	ri->ifindex = 0;
@@ -2565,7 +2613,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 		goto err;
 	}
 
-	fwd = __dev_map_lookup_elem(map, index);
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (!fwd) {
 		err = -EINVAL;
 		goto err;
@@ -2573,7 +2621,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	if (ri->map_to_flush && ri->map_to_flush != map)
 		xdp_do_flush_map();
 
-	err = __bpf_tx_xdp(fwd, map, xdp, index);
+	err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
 	if (unlikely(err))
 		goto err;
 
@@ -2615,54 +2663,88 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
-int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
-			    struct bpf_prog *xdp_prog)
+static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
+{
+	unsigned int len;
+
+	if (unlikely(!(fwd->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+	if (skb->len > len)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
+				struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
-	unsigned int len;
 	int err = 0;
 
 	ri->ifindex = 0;
 	ri->map = NULL;
 	ri->map_owner = 0;
 
-	if (map) {
-		if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
-			err = -EFAULT;
-			map = NULL;
-			goto err;
-		}
-		fwd = __dev_map_lookup_elem(map, index);
-	} else {
-		fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
+		err = -EFAULT;
+		map = NULL;
+		goto err;
 	}
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
 	}
 
-	if (unlikely(!(fwd->flags & IFF_UP))) {
-		err = -ENETDOWN;
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+			goto err;
+		skb->dev = fwd;
+	} else {
+		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+		err = -EBADRQC;
 		goto err;
 	}
 
-	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len) {
-		err = -EMSGSIZE;
+	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+	return 0;
+err:
+	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+	return err;
+}
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+			    struct bpf_prog *xdp_prog)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	u32 index = ri->ifindex;
+	struct net_device *fwd;
+	int err = 0;
+
+	if (ri->map)
+		return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+
+	ri->ifindex = 0;
+	fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(!fwd)) {
+		err = -EINVAL;
 		goto err;
 	}
 
+	if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+		goto err;
+
 	skb->dev = fwd;
-	map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
-		: _trace_xdp_redirect(dev, xdp_prog, index);
+	_trace_xdp_redirect(dev, xdp_prog, index);
 	return 0;
 err:
-	map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
-		: _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);

^ permalink raw reply related

* [net-next V5 PATCH 3/5] bpf: cpumap xdp_buff to skb conversion and allocation
From: Jesper Dangaard Brouer @ 2017-10-06 16:12 UTC (permalink / raw)
  To: netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, pavel.odintsov, Jason Wang,
	mchan, John Fastabend, peter.waskiewicz.jr,
	Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andy Gospodarek
In-Reply-To: <150730632837.22839.11804085686478888137.stgit@firesoul>

This patch makes cpumap functional, by adding SKB allocation and
invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff in converted
into a struct xdp_pkt, and it mapped into the top headroom of the
packet, to avoid allocating separate mem.  For now, struct xdp_pkt is
just a cpumap internal data structure, with info carried between
enqueue to dequeue.

If a driver doesn't have enough headroom it is simply dropped, with
return code -EOVERFLOW.  This will be picked up the xdp tracepoint
infrastructure, to allow users to catch this.

V2: take into account xdp->data_meta

V4:
 - Drop busypoll tricks, keeping it more simple.
 - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei

V5: correct RCU read protection around __netif_receive_skb_core.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/netdevice.h |    1 
 kernel/bpf/cpumap.c       |  150 +++++++++++++++++++++++++++++++++++++++------
 net/core/dev.c            |   27 ++++++++
 3 files changed, 157 insertions(+), 21 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d04424cfffba..0a3dc246f066 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3251,6 +3251,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 1499eae96d30..960deeccf245 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call.  It is
  * guaranteed that setting flush bit and flush operation happen on
@@ -169,20 +172,136 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread); /* calls put_cpu_map_entry */
 }
 
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if ((headroom - metasize) < sizeof(*xdp_pkt))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len  = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb need to place skb_shared_info after SKB end, and
+	 * also want to know the memory "truesize".  Thus, need to
+	 * know the memory frame size backing xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is not longer true with ixgbe and i40e.  It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver.
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with info avail, skb_shared_info in placed after
+	 * packet len.  This, unfortunately fakes the truesize.
+	 * Another disadvantage of this approach, the skb_shared_info
+	 * is not at a fixed memory location, with mixed length
+	 * packets, which is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
+		unsigned int processed = 0, drops = 0;
 		struct xdp_pkt *xdp_pkt;
 
-		schedule();
-		/* Do work */
-		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-			/* For now just "refcnt-free" */
-			page_frag_free(xdp_pkt);
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			schedule();
+		} else {
+			cond_resched();
+		}
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is single consumer, with this
+		 * kthread CPU pinned. Lockless access to ptr_ring
+		 * consume side valid as no-resize allowed of queue.
+		 */
+		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+			struct sk_buff *skb;
+			int ret;
+
+			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			if (!skb) {
+				page_frag_free(xdp_pkt);
+				continue;
+			}
+
+			/* Inject into network stack */
+			ret = netif_receive_skb_core(skb);
+			if (ret == NET_RX_DROP)
+				drops++;
+
+			/* Limit BH-disable period */
+			if (++processed == 8)
+				break;
 		}
+		local_bh_enable(); /* resched point, may call do_softirq() */
+
 		__set_current_state(TASK_INTERRUPTIBLE);
 	}
 	put_cpu_map_entry(rcpu);
@@ -470,13 +589,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
-/* Notice: Will change in later patch */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-};
-
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -504,17 +616,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
 	struct xdp_pkt *xdp_pkt;
-	int headroom;
 
-	/* For now this is just used as a void pointer to data_hard_start.
-	 * Followup patch will generalize this.
-	 */
-	xdp_pkt = xdp->data_hard_start;
+	xdp_pkt = convert_to_xdp_pkt(xdp);
+	if (!xdp_pkt)
+		return -EOVERFLOW;
 
-	/* Fake writing into xdp_pkt->data to measure overhead */
-	headroom = xdp->data - xdp->data_hard_start;
-	if (headroom < sizeof(*xdp_pkt))
-		xdp_pkt->data = xdp->data;
+	/* Info needed when constructing SKB on remote CPU */
+	xdp_pkt->dev_rx = dev_rx;
 
 	bq_enqueue(rcpu, xdp_pkt);
 	return 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 1770097cfd86..bfe316a0918b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4489,6 +4489,33 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 	return ret;
 }
 
+/**
+ *	netif_receive_skb_core - special purpose version of netif_receive_skb
+ *	@skb: buffer to process
+ *
+ *	More direct receive version of netif_receive_skb().  It should
+ *	only be used by callers that have a need to skip RPS and Generic XDP.
+ *	Caller must also take care of handling if (page_is_)pfmemalloc.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __netif_receive_skb_core(skb, false);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb_core);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;

^ permalink raw reply related

* [net-next V5 PATCH 4/5] bpf: cpumap add tracepoints
From: Jesper Dangaard Brouer @ 2017-10-06 16:12 UTC (permalink / raw)
  To: netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, pavel.odintsov, Jason Wang,
	mchan, John Fastabend, peter.waskiewicz.jr,
	Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andy Gospodarek
In-Reply-To: <150730632837.22839.11804085686478888137.stgit@firesoul>

This adds two tracepoint to the cpumap.  One for the enqueue side
trace_xdp_cpumap_enqueue() and one for the kthread dequeue side
trace_xdp_cpumap_kthread().

To mitigate the tracepoint overhead, these are invoked during the
enqueue/dequeue bulking phases, thus amortizing the cost.

The obvious use-cases are for debugging and monitoring.  The
non-intuitive use-case is using these as a feedback loop to know the
system load.  One can imagine auto-scaling by reducing, adding or
activating more worker CPUs on demand.

V4: tracepoint remove time_limit info, instead add sched info

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/trace/events/xdp.h |   70 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cpumap.c        |   23 +++++++++++---
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index eb2ece96c1a2..0c8dec61987e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
+TRACE_EVENT(xdp_cpumap_kthread,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int sched),
+
+	TP_ARGS(map_id, processed, drops, sched),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, sched)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->sched	= sched;
+	),
+
+	TP_printk("kthread"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " sched=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->sched)
+);
+
+TRACE_EVENT(xdp_cpumap_enqueue,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int to_cpu),
+
+	TP_ARGS(map_id, processed, drops, to_cpu),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, to_cpu)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->to_cpu		= to_cpu;
+	),
+
+	TP_printk("enqueue"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " to_cpu=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->to_cpu)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 960deeccf245..60410b70094a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/capability.h>
+#include <trace/events/xdp.h>
 
 #include <linux/netdevice.h>   /* netif_receive_skb_core */
 #include <linux/etherdevice.h> /* eth_type_trans */
@@ -264,14 +265,15 @@ static int cpu_map_kthread_run(void *data)
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
-		unsigned int processed = 0, drops = 0;
+		unsigned int processed = 0, drops = 0, sched = 0;
 		struct xdp_pkt *xdp_pkt;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
 			schedule();
+			sched = 1;
 		} else {
-			cond_resched();
+			sched = cond_resched();
 		}
 
 		/* Process packets in rcpu->queue */
@@ -300,6 +302,9 @@ static int cpu_map_kthread_run(void *data)
 			if (++processed == 8)
 				break;
 		}
+		/* Feedback loop via tracepoint */
+		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
 		local_bh_enable(); /* resched point, may call do_softirq() */
 
 		__set_current_state(TASK_INTERRUPTIBLE);
@@ -337,7 +342,10 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
 	err = ptr_ring_init(rcpu->queue, qsize, gfp);
 	if (err)
 		goto fail;
-	rcpu->qsize = qsize;
+
+	rcpu->cpu    = cpu;
+	rcpu->map_id = map_id;
+	rcpu->qsize  = qsize;
 
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
@@ -564,6 +572,8 @@ const struct bpf_map_ops cpu_map_ops = {
 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 			     struct xdp_bulk_queue *bq)
 {
+	unsigned int processed = 0, drops = 0;
+	const int to_cpu = rcpu->cpu;
 	struct ptr_ring *q;
 	int i;
 
@@ -579,13 +589,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
-			/* Free xdp_pkt */
-			page_frag_free(xdp_pkt);
+			drops++;
+			page_frag_free(xdp_pkt); /* Free xdp_pkt */
 		}
+		processed++;
 	}
 	bq->count = 0;
 	spin_unlock(&q->producer_lock);
 
+	/* Feedback loop via tracepoints */
+	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
 	return 0;
 }
 

^ permalink raw reply related

* [net-next V5 PATCH 5/5] samples/bpf: add cpumap sample program xdp_redirect_cpu
From: Jesper Dangaard Brouer @ 2017-10-06 16:13 UTC (permalink / raw)
  To: netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, pavel.odintsov, Jason Wang,
	mchan, John Fastabend, peter.waskiewicz.jr,
	Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andy Gospodarek
In-Reply-To: <150730632837.22839.11804085686478888137.stgit@firesoul>

This sample program show how to use cpumap and the associated
tracepoints.

It provides command line stats, which shows how the XDP-RX process,
cpumap-enqueue and cpumap kthread dequeue is cooperating on a per CPU
basis.  It also utilize the xdp_exception and xdp_redirect_err
transpoints to allow users quickly to identify setup issues.

One issue with ixgbe driver is that the driver reset the link when
loading XDP.  This reset the procfs smp_affinity settings.  Thus,
after loading the program, these must be reconfigured.  The easiest
workaround it to reduce the RX-queue to e.g. two via:

 # ethtool --set-channels ixgbe1 combined 2

And then add CPUs above 0 and 1, like:

 # xdp_redirect_cpu --dev ixgbe1 --prog 2 --cpu 2 --cpu 3 --cpu 4

Another issue with ixgbe is that the page recycle mechanism is tied to
the RX-ring size.  And the default setting of 512 elements is too
small.  This is the same issue with regular devmap XDP_REDIRECT.
To overcome this I've been using 1024 rx-ring size:

 # ethtool -G ixgbe1 rx 1024 tx 1024

V3:
 - whitespace cleanups
 - bpf tracepoint cannot access top part of struct

V4:
 - report on kthread sched events, according to tracepoint change
 - report average bulk enqueue size

V5:
 - bpf_map_lookup_elem on cpumap not allowed from bpf_prog
   use separate map to mark CPUs not available

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/Makefile                |    4 
 samples/bpf/xdp_redirect_cpu_kern.c |  609 ++++++++++++++++++++++++++++++++
 samples/bpf/xdp_redirect_cpu_user.c |  673 +++++++++++++++++++++++++++++++++++
 3 files changed, 1286 insertions(+)
 create mode 100644 samples/bpf/xdp_redirect_cpu_kern.c
 create mode 100644 samples/bpf/xdp_redirect_cpu_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ebc2ad69b62c..52c4dab2c153 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example
 hostprogs-y += load_sock_ops
 hostprogs-y += xdp_redirect
 hostprogs-y += xdp_redirect_map
+hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += syscall_tp
 
@@ -84,6 +85,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
 per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
 xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 
@@ -129,6 +131,7 @@ always += tcp_iw_kern.o
 always += tcp_clamp_kern.o
 always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
+always += xdp_redirect_cpu_kern.o
 always += xdp_monitor_kern.o
 always += syscall_tp_kern.o
 
@@ -169,6 +172,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
 HOSTLOADLIBES_test_map_in_map += -lelf
 HOSTLOADLIBES_xdp_redirect += -lelf
 HOSTLOADLIBES_xdp_redirect_map += -lelf
+HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c
new file mode 100644
index 000000000000..303e9e7161f3
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_kern.c
@@ -0,0 +1,609 @@
+/*  XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
+ *
+ *  GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/if_vlan.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/udp.h>
+
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define MAX_CPUS 12 /* WARNING - sync with _user.c */
+
+/* Special map type that can XDP_REDIRECT frames to another CPU */
+struct bpf_map_def SEC("maps") cpu_map = {
+	.type		= BPF_MAP_TYPE_CPUMAP,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(u32),
+	.max_entries	= MAX_CPUS,
+};
+
+/* Common stats data record to keep userspace more simple */
+struct datarec {
+	__u64 processed;
+	__u64 dropped;
+	__u64 issue;
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback.  Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rx_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") redirect_err_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 2,
+	/* TODO: have entries for all possible errno's */
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= MAX_CPUS,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Set of maps controlling available CPU, and for iterating through
+ * selectable redirect CPUs.
+ */
+struct bpf_map_def SEC("maps") cpus_available = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(u32),
+	.max_entries	= MAX_CPUS,
+};
+struct bpf_map_def SEC("maps") cpus_count = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(u32),
+	.max_entries	= 1,
+};
+struct bpf_map_def SEC("maps") cpus_iterator = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(u32),
+	.max_entries	= 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") exception_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Helper parse functions */
+
+/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
+ *
+ * Returns false on error and non-supported ether-type
+ */
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+static __always_inline
+bool parse_eth(struct ethhdr *eth, void *data_end,
+	       u16 *eth_proto, u64 *l3_offset)
+{
+	u16 eth_type;
+	u64 offset;
+
+	offset = sizeof(*eth);
+	if ((void *)eth + offset > data_end)
+		return false;
+
+	eth_type = eth->h_proto;
+
+	/* Skip non 802.3 Ethertypes */
+	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
+		return false;
+
+	/* Handle VLAN tagged packet */
+	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+		struct vlan_hdr *vlan_hdr;
+
+		vlan_hdr = (void *)eth + offset;
+		offset += sizeof(*vlan_hdr);
+		if ((void *)eth + offset > data_end)
+			return false;
+		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+	}
+	/* TODO: Handle double VLAN tagged packet */
+
+	*eth_proto = ntohs(eth_type);
+	*l3_offset = offset;
+	return true;
+}
+
+static __always_inline
+u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct iphdr *iph = data + nh_off;
+	struct udphdr *udph;
+	u16 dport;
+
+	if (iph + 1 > data_end)
+		return 0;
+	if (!(iph->protocol == IPPROTO_UDP))
+		return 0;
+
+	udph = (void *)(iph + 1);
+	if (udph + 1 > data_end)
+		return 0;
+
+	dport = ntohs(udph->dest);
+	return dport;
+}
+
+static __always_inline
+int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct iphdr *iph = data + nh_off;
+
+	if (iph + 1 > data_end)
+		return 0;
+	return iph->protocol;
+}
+
+static __always_inline
+int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct ipv6hdr *ip6h = data + nh_off;
+
+	if (ip6h + 1 > data_end)
+		return 0;
+	return ip6h->nexthdr;
+}
+
+SEC("xdp_cpu_map0")
+int  xdp_prognum0_no_touch(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct datarec *rec;
+	u32 *cpu_selected;
+	u32 cpu_dest;
+	u32 key = 0;
+
+	/* Only use first entry in cpus_available */
+	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+	if (!cpu_selected)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_selected;
+
+	/* Count RX packet in map */
+	rec = bpf_map_lookup_elem(&rx_cnt, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	if (cpu_dest >= MAX_CPUS) {
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map1_touch_data")
+int  xdp_prognum1_touch_data(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	struct datarec *rec;
+	u32 *cpu_selected;
+	u32 cpu_dest;
+	u16 eth_type;
+	u32 key = 0;
+
+	/* Only use first entry in cpus_available */
+	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+	if (!cpu_selected)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_selected;
+
+	/* Validate packet length is minimum Eth header size */
+	if (eth + 1 > data_end)
+		return XDP_ABORTED;
+
+	/* Count RX packet in map */
+	rec = bpf_map_lookup_elem(&rx_cnt, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
+	eth_type = eth->h_proto;
+	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+		rec->dropped++;
+		return XDP_DROP;
+	}
+
+	if (cpu_dest >= MAX_CPUS) {
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map2_round_robin")
+int  xdp_prognum2_round_robin(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	struct datarec *rec;
+	u32 cpu_dest;
+	u32 *cpu_lookup;
+	u32 key0 = 0;
+
+	u32 *cpu_selected;
+	u32 *cpu_iterator;
+	u32 *cpu_max;
+	u32 cpu_idx;
+
+	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+	if (!cpu_max)
+		return XDP_ABORTED;
+
+	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
+	if (!cpu_iterator)
+		return XDP_ABORTED;
+	cpu_idx = *cpu_iterator;
+
+	*cpu_iterator += 1;
+	if (*cpu_iterator == *cpu_max)
+		*cpu_iterator = 0;
+
+	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+	if (!cpu_selected)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_selected;
+
+	/* Count RX packet in map */
+	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	if (cpu_dest >= MAX_CPUS) {
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map3_proto_separate")
+int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	u8 ip_proto = IPPROTO_UDP;
+	struct datarec *rec;
+	u16 eth_proto = 0;
+	u64 l3_offset = 0;
+	u32 cpu_dest = 0;
+	u32 cpu_idx = 0;
+	u32 *cpu_lookup;
+	u32 key = 0;
+
+	/* Count RX packet in map */
+	rec = bpf_map_lookup_elem(&rx_cnt, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+		return XDP_PASS; /* Just skip */
+
+	/* Extract L4 protocol */
+	switch (eth_proto) {
+	case ETH_P_IP:
+		ip_proto = get_proto_ipv4(ctx, l3_offset);
+		break;
+	case ETH_P_IPV6:
+		ip_proto = get_proto_ipv6(ctx, l3_offset);
+		break;
+	case ETH_P_ARP:
+		cpu_idx = 0; /* ARP packet handled on separate CPU */
+		break;
+	default:
+		cpu_idx = 0;
+	}
+
+	/* Choose CPU based on L4 protocol */
+	switch (ip_proto) {
+	case IPPROTO_ICMP:
+	case IPPROTO_ICMPV6:
+		cpu_idx = 2;
+		break;
+	case IPPROTO_TCP:
+		cpu_idx = 0;
+		break;
+	case IPPROTO_UDP:
+		cpu_idx = 1;
+		break;
+	default:
+		cpu_idx = 0;
+	}
+
+	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+	if (!cpu_lookup)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_lookup;
+
+	if (cpu_dest >= MAX_CPUS) {
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map4_ddos_filter_pktgen")
+int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	u8 ip_proto = IPPROTO_UDP;
+	struct datarec *rec;
+	u16 eth_proto = 0;
+	u64 l3_offset = 0;
+	u32 cpu_dest = 0;
+	u32 cpu_idx = 0;
+	u16 dest_port;
+	u32 *cpu_lookup;
+	u32 key = 0;
+
+	/* Count RX packet in map */
+	rec = bpf_map_lookup_elem(&rx_cnt, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+		return XDP_PASS; /* Just skip */
+
+	/* Extract L4 protocol */
+	switch (eth_proto) {
+	case ETH_P_IP:
+		ip_proto = get_proto_ipv4(ctx, l3_offset);
+		break;
+	case ETH_P_IPV6:
+		ip_proto = get_proto_ipv6(ctx, l3_offset);
+		break;
+	case ETH_P_ARP:
+		cpu_idx = 0; /* ARP packet handled on separate CPU */
+		break;
+	default:
+		cpu_idx = 0;
+	}
+
+	/* Choose CPU based on L4 protocol */
+	switch (ip_proto) {
+	case IPPROTO_ICMP:
+	case IPPROTO_ICMPV6:
+		cpu_idx = 2;
+		break;
+	case IPPROTO_TCP:
+		cpu_idx = 0;
+		break;
+	case IPPROTO_UDP:
+		cpu_idx = 1;
+		/* DDoS filter UDP port 9 (pktgen) */
+		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
+		if (dest_port == 9) {
+			if (rec)
+				rec->dropped++;
+			return XDP_DROP;
+		}
+		break;
+	default:
+		cpu_idx = 0;
+	}
+
+	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+	if (!cpu_lookup)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_lookup;
+
+	if (cpu_dest >= MAX_CPUS) {
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+
+char _license[] SEC("license") = "GPL";
+
+/*** Trace point code ***/
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
+ * Code in:                kernel/include/trace/events/xdp.h
+ */
+struct xdp_redirect_ctx {
+	u64 __pad;	// First 8 bytes are not accessible by bpf code
+	int prog_id;	//	offset:8;  size:4; signed:1;
+	u32 act;	//	offset:12  size:4; signed:0;
+	int ifindex;	//	offset:16  size:4; signed:1;
+	int err;	//	offset:20  size:4; signed:1;
+	int to_ifindex;	//	offset:24  size:4; signed:1;
+	u32 map_id;	//	offset:28  size:4; signed:0;
+	int map_index;	//	offset:32  size:4; signed:1;
+};			//	offset:36
+
+enum {
+	XDP_REDIRECT_SUCCESS = 0,
+	XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline
+int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+{
+	u32 key = XDP_REDIRECT_ERROR;
+	struct datarec *rec;
+	int err = ctx->err;
+
+	if (!err)
+		key = XDP_REDIRECT_SUCCESS;
+
+	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->dropped += 1;
+
+	return 0; /* Indicate event was filtered (no further processing)*/
+	/*
+	 * Returning 1 here would allow e.g. a perf-record tracepoint
+	 * to see and record these events, but it doesn't work well
+	 * in-practice as stopping perf-record also unload this
+	 * bpf_prog.  Plus, there is additional overhead of doing so.
+	 */
+}
+
+SEC("tracepoint/xdp/xdp_redirect_err")
+int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+{
+	return xdp_redirect_collect_stat(ctx);
+}
+
+SEC("tracepoint/xdp/xdp_redirect_map_err")
+int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+{
+	return xdp_redirect_collect_stat(ctx);
+}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in:                kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+	u64 __pad;	// First 8 bytes are not accessible by bpf code
+	int prog_id;	//	offset:8;  size:4; signed:1;
+	u32 act;	//	offset:12; size:4; signed:0;
+	int ifindex;	//	offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&exception_cnt, &key);
+	if (!rec)
+		return 1;
+	rec->dropped += 1;
+
+	return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int to_cpu;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+	u32 to_cpu = ctx->to_cpu;
+	struct datarec *rec;
+
+	if (to_cpu >= MAX_CPUS)
+		return 1;
+
+	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	if (ctx->processed > 0)
+		rec->issue += 1;
+
+	/* Inception: It's possible to detect overload situations, via
+	 * this tracepoint.  This can be used for creating a feedback
+	 * loop to XDP, which can take appropriate actions to mitigate
+	 * this overload situation.
+	 */
+	return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int sched;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Count times kthread yielded CPU via schedule call */
+	if (ctx->sched)
+		rec->issue++;
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
new file mode 100644
index 000000000000..fc67837a5c1c
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -0,0 +1,673 @@
+/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+static const char *__doc__ =
+	" XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#define MAX_CPUS 12 /* WARNING - sync with _kern.c */
+
+/* How many xdp_progs are defined in _kern.c */
+#define MAX_PROG 5
+
+/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead
+ * use bpf/libbpf.h), but cannot as (currently) needed for XDP
+ * attaching to a device via set_link_xdp_fd()
+ */
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK		0
+#define EXIT_FAIL		1
+#define EXIT_FAIL_OPTION	2
+#define EXIT_FAIL_XDP		3
+#define EXIT_FAIL_BPF		4
+#define EXIT_FAIL_MEM		5
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"dev",	required_argument,	NULL, 'd' },
+	{"skb-mode",	no_argument,		NULL, 'S' },
+	{"debug",	no_argument,		NULL, 'D' },
+	{"sec",	required_argument,	NULL, 's' },
+	{"prognum",	required_argument,	NULL, 'p' },
+	{"qsize",	required_argument,	NULL, 'q' },
+	{"cpu",	required_argument,	NULL, 'c' },
+	{"no-separators", no_argument,		NULL, 'z' },
+	{0, 0, NULL,  0 }
+};
+
+static void int_exit(int sig)
+{
+	fprintf(stderr,
+		"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+		ifindex, ifname);
+	if (ifindex > -1)
+		set_link_xdp_fd(ifindex, -1, xdp_flags);
+	exit(EXIT_OK);
+}
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n", __doc__);
+	printf("\n");
+	printf(" Usage: %s (options-see-below)\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)",
+				*long_options[i].flag);
+		else
+			printf(" short-option: -%c",
+				long_options[i].val);
+		printf("\n");
+	}
+	printf("\n");
+}
+
+/* gettime returns the current time of day in nanoseconds.
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
+ *       clock_gettime (ns) =>  9ns (CLOCK_MONOTONIC_COARSE)
+ */
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+	struct timespec t;
+	int res;
+
+	res = clock_gettime(CLOCK_MONOTONIC, &t);
+	if (res < 0) {
+		fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+		exit(EXIT_FAIL);
+	}
+	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 dropped;
+	__u64 issue;
+};
+struct record {
+	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+struct stats_record {
+	struct record rx_cnt;
+	struct record redir_err;
+	struct record kthread;
+	struct record exception;
+	struct record enq[MAX_CPUS];
+};
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_processed = 0;
+	__u64 sum_dropped = 0;
+	__u64 sum_issue = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].dropped = values[i].dropped;
+		sum_dropped        += values[i].dropped;
+		rec->cpu[i].issue = values[i].issue;
+		sum_issue        += values[i].issue;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.dropped   = sum_dropped;
+	rec->total.issue     = sum_issue;
+	return true;
+}
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec *array;
+	size_t size;
+
+	size = sizeof(struct datarec) * nr_cpus;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+	struct stats_record *rec;
+	int i;
+
+	rec = malloc(sizeof(*rec));
+	memset(rec, 0, sizeof(*rec));
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+	rec->rx_cnt.cpu    = alloc_record_per_cpu();
+	rec->redir_err.cpu = alloc_record_per_cpu();
+	rec->kthread.cpu   = alloc_record_per_cpu();
+	rec->exception.cpu = alloc_record_per_cpu();
+	for (i = 0; i < MAX_CPUS; i++)
+		rec->enq[i].cpu = alloc_record_per_cpu();
+
+	return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+	int i;
+
+	for (i = 0; i < MAX_CPUS; i++)
+		free(r->enq[i].cpu);
+	free(r->exception.cpu);
+	free(r->kthread.cpu);
+	free(r->redir_err.cpu);
+	free(r->rx_cnt.cpu);
+	free(r);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->dropped - p->dropped;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+			    struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->issue - p->issue;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
+			int prog_num)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	double pps = 0, drop = 0, err = 0;
+	struct record *rec, *prev;
+	int to_cpu;
+	double t;
+	int i;
+
+	/* Header */
+	printf("Running XDP/eBPF prog_num:%d\n", prog_num);
+	printf("%-15s %-7s %-14s %-11s %-9s\n",
+	       "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
+
+	/* XDP rx_cnt */
+	{
+		char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
+		char *errstr = "";
+
+		rec  = &stats_rec->rx_cnt;
+		prev = &stats_prev->rx_cnt;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0)
+				errstr = "cpu-dest/err";
+			if (pps > 0)
+				printf(fmt_rx, "XDP-RX",
+					i, pps, drop, err, errstr);
+		}
+		pps  = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		printf(fm2_rx, "XDP-RX", "total", pps, drop);
+	}
+
+	/* cpumap enqueue stats */
+	for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+		char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+		char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+		char *errstr = "";
+
+		rec  =  &stats_rec->enq[to_cpu];
+		prev = &stats_prev->enq[to_cpu];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0) {
+				errstr = "bulk-average";
+				err = pps / err; /* calc average bulk size */
+			}
+			if (pps > 0)
+				printf(fmt, "cpumap-enqueue",
+				       i, to_cpu, pps, drop, err, errstr);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		if (pps > 0) {
+			drop = calc_drop_pps(&rec->total, &prev->total, t);
+			err  = calc_errs_pps(&rec->total, &prev->total, t);
+			if (err > 0) {
+				errstr = "bulk-average";
+				err = pps / err; /* calc average bulk size */
+			}
+			printf(fm2, "cpumap-enqueue",
+			       "sum", to_cpu, pps, drop, err, errstr);
+		}
+	}
+
+	/* cpumap kthread stats */
+	{
+		char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *errstr = "";
+
+		rec  = &stats_rec->kthread;
+		prev = &stats_prev->kthread;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0)
+				errstr = "sched";
+			if (pps > 0)
+				printf(fmt_k, "cpumap_kthread",
+				       i, pps, drop, err, errstr);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		printf(fm2_k, "cpumap_kthread", "total", pps, drop);
+	}
+
+	/* XDP redirect err tracepoints (very unlikely) */
+	{
+		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+		rec  = &stats_rec->redir_err;
+		prev = &stats_prev->redir_err;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			if (pps > 0)
+				printf(fmt_err, "redirect_err", i, pps, drop);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		printf(fm2_err, "redirect_err", "total", pps, drop);
+	}
+
+	/* XDP general exception tracepoints */
+	{
+		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+		rec  = &stats_rec->exception;
+		prev = &stats_prev->exception;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			if (pps > 0)
+				printf(fmt_err, "xdp_exception", i, pps, drop);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		printf(fm2_err, "xdp_exception", "total", pps, drop);
+	}
+
+	printf("\n");
+	fflush(stdout);
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+	int fd, i;
+
+	fd = map_fd[1]; /* map: rx_cnt */
+	map_collect_percpu(fd, 0, &rec->rx_cnt);
+
+	fd = map_fd[2]; /* map: redirect_err_cnt */
+	map_collect_percpu(fd, 1, &rec->redir_err);
+
+	fd = map_fd[3]; /* map: cpumap_enqueue_cnt */
+	for (i = 0; i < MAX_CPUS; i++)
+		map_collect_percpu(fd, i, &rec->enq[i]);
+
+	fd = map_fd[4]; /* map: cpumap_kthread_cnt */
+	map_collect_percpu(fd, 0, &rec->kthread);
+
+	fd = map_fd[8]; /* map: exception_cnt */
+	map_collect_percpu(fd, 0, &rec->exception);
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+static void stats_poll(int interval, bool use_separators, int prog_num)
+{
+	struct stats_record *record, *prev;
+
+	record = alloc_stats_record();
+	prev   = alloc_stats_record();
+	stats_collect(record);
+
+	/* Trick to pretty printf with thousands separators use %' */
+	if (use_separators)
+		setlocale(LC_NUMERIC, "en_US");
+
+	while (1) {
+		swap(&prev, &record);
+		stats_collect(record);
+		stats_print(record, prev, prog_num);
+		sleep(interval);
+	}
+
+	free_stats_record(record);
+	free_stats_record(prev);
+}
+
+static int create_cpu_entry(__u32 cpu, __u32 queue_size,
+			    __u32 avail_idx, bool new)
+{
+	__u32 curr_cpus_count;
+	__u32 key = 0;
+	int ret;
+
+	/* Add a CPU entry to cpumap, as this allocate a cpu entry in
+	 * the kernel for the cpu.
+	 */
+	ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0);
+	if (ret) {
+		fprintf(stderr, "Create CPU entry failed\n");
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* Inform bpf_prog's that a new CPU is available to select
+	 * from via some control maps.
+	 */
+	/* map_fd[5] = cpus_available */
+	ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0);
+	if (ret) {
+		fprintf(stderr, "Add to avail CPUs failed\n");
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* When not replacing/updating existing entry, bump the count */
+	/* map_fd[6] = cpus_count */
+	if (new) {
+		ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count);
+		if (ret) {
+			fprintf(stderr, "Failed reading curr cpus_count\n");
+			exit(EXIT_FAIL_BPF);
+		}
+		curr_cpus_count++;
+		ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0);
+		if (ret) {
+			fprintf(stderr, "Failed write curr cpus_count\n");
+			exit(EXIT_FAIL_BPF);
+		}
+	}
+	/* map_fd[7] = cpus_iterator */
+	printf("%s CPU:%u as idx:%u cpus_count:%u\n",
+	       new ? "Add-new":"Replace", cpu, avail_idx, curr_cpus_count);
+
+	return 0;
+}
+
+/* CPUs are zero-indexed. Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU index'es not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+	__u32 invalid_cpu = MAX_CPUS;
+	int ret, i;
+
+	for (i = 0; i < MAX_CPUS; i++) {
+		/* map_fd[5] = cpus_available */
+		ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0);
+		if (ret) {
+			fprintf(stderr, "Failed marking CPU unavailable\n");
+			exit(EXIT_FAIL_BPF);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	bool use_separators = true;
+	char filename[256];
+	bool debug = false;
+	int added_cpus = 0;
+	int longindex = 0;
+	int interval = 2;
+	int prog_num = 0;
+	int add_cpu = -1;
+	__u32 qsize;
+	int opt;
+
+	/* Notice: choosing he queue size is very important with the
+	 * ixgbe driver, because it's driver page recycling trick is
+	 * dependend on pages being returned quickly.  The number of
+	 * out-standing packets in the system must be less-than 2x
+	 * RX-ring size.
+	 */
+	qsize = 128+64;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+		return EXIT_FAIL;
+	}
+
+	if (!prog_fd[0]) {
+		fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+		return EXIT_FAIL;
+	}
+
+	mark_cpus_unavailable();
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hSd:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 's':
+			interval = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'D':
+			debug = true;
+			break;
+		case 'z':
+			use_separators = false;
+			break;
+		case 'p':
+			/* Selecting eBPF prog to load */
+			prog_num = atoi(optarg);
+			if (prog_num < 0 || prog_num >= MAX_PROG) {
+				fprintf(stderr,
+					"--prognum too large err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 'c':
+			/* Add multiple CPUs */
+			add_cpu = strtoul(optarg, NULL, 0);
+			if (add_cpu > MAX_CPUS) {
+				fprintf(stderr,
+				"--cpu nr too large for cpumap err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			create_cpu_entry(add_cpu, qsize, added_cpus, true);
+			added_cpus++;
+			break;
+		case 'q':
+			qsize = atoi(optarg);
+			break;
+		case 'h':
+		error:
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+	/* Required option */
+	if (add_cpu == -1) {
+		fprintf(stderr, "ERR: required option --cpu missing\n");
+		fprintf(stderr, " Specify multiple --cpu option to add more\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+
+	/* Remove XDP program when program is interrupted */
+	signal(SIGINT, int_exit);
+
+	if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
+		fprintf(stderr, "link set xdp fd failed\n");
+		return EXIT_FAIL_XDP;
+	}
+
+	if (debug) {
+		printf("Debug-mode reading trace pipe (fix #define DEBUG)\n");
+		read_trace_pipe();
+	}
+
+	stats_poll(interval, use_separators, prog_num);
+	return EXIT_OK;
+}

^ permalink raw reply related

* Re: [net-next PATCH 1/3] samples/bpf: xdp_monitor first 8 bytes are not accessible by bpf
From: Alexei Starovoitov @ 2017-10-06 16:13 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, Andy Gospodarek, Daniel Borkmann
In-Reply-To: <150727930151.4460.3117254071267380300.stgit@firesoul>

On Fri, Oct 06, 2017 at 10:41:41AM +0200, Jesper Dangaard Brouer wrote:
> The first 8 bytes of the tracepoint context struct are not accessible
> by the bpf code.  This is a choice that dates back to the original
> inclusion of this code.
> 
> See explaination in:
>  commit 98b5c2c65c29 ("perf, bpf: allow bpf programs attach to tracepoints")
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

thank you for fixing it.
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH net] bpf: fix liveness marking
From: Edward Cree @ 2017-10-06 16:33 UTC (permalink / raw)
  To: Alexei Starovoitov, David S . Miller; +Cc: Daniel Borkmann, netdev, kernel-team
In-Reply-To: <20171005232056.2234669-1-ast@fb.com>

On 06/10/17 00:20, Alexei Starovoitov wrote:
> while processing Rx = Ry instruction the verifier does
> regs[insn->dst_reg] = regs[insn->src_reg]
> which often clears write mark (when Ry doesn't have it)
> that was just set by check_reg_arg(Rx) prior to the assignment.
> That causes mark_reg_read() to keep marking Rx in this block as
> REG_LIVE_READ (since the logic incorrectly misses that it's
> screened by the write) and in many of its parents (until lucky
> write into the same Rx or beginning of the program).
> That causes is_state_visited() logic to miss many pruning opportunities.
Good catch!
> Furthermore mark_reg_read() logic propagates the read mark
> for BPF_REG_FP as well (though it's readonly) which causes
> harmless but unnecssary work during is_state_visited().
Surely it's unnecessary for is_state_visited() to even look at
 BPF_REG_FP anyway, so in addition to your change we could make
 states_equal just do `for (i = 0; i < BPF_REG_FP; i++)`?  That
 might save a bit more time.
> Note that do_propagate_liveness() skips FP correctly,
> so do the same in mark_reg_read() as well.
> It saves 0.2 seconds for the test below
>
> program               before  after
> bpf_lb-DLB_L3.o       2604    2304
> bpf_lb-DLB_L4.o       11159   3723
> bpf_lb-DUNKNOWN.o     1116    1110
> bpf_lxc-DDROP_ALL.o   34566   28004
> bpf_lxc-DUNKNOWN.o    53267   39026
> bpf_netdev.o          17843   16943
> bpf_overlay.o         8672    7929
> time                  ~11 sec  ~4 sec
>
> Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Very nice numbers!
Acked-by: Edward Cree <ecree@solarflare.com>

^ permalink raw reply

* Re: [PATCH 0/4] RCU: introduce noref debug
From: Paul E. McKenney @ 2017-10-06 16:34 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: linux-kernel, Josh Triplett, Steven Rostedt, David S. Miller,
	Eric Dumazet, Hannes Frederic Sowa, netdev
In-Reply-To: <1507302609.2793.16.camel@redhat.com>

On Fri, Oct 06, 2017 at 05:10:09PM +0200, Paolo Abeni wrote:
> Hi,
> 
> On Fri, 2017-10-06 at 06:34 -0700, Paul E. McKenney wrote:
> > On Fri, Oct 06, 2017 at 02:57:45PM +0200, Paolo Abeni wrote:
> > > The networking subsystem is currently using some kind of long-lived
> > > RCU-protected, references to avoid the overhead of full book-keeping.
> > > 
> > > Such references - skb_dst() noref - are stored inside the skbs and can be
> > > moved across relevant slices of the network stack, with the users
> > > being in charge of properly clearing the relevant skb - or properly refcount
> > > the related dst references - before the skb escapes the RCU section.
> > > 
> > > We currently don't have any deterministic debug infrastructure to check
> > > the dst noref usages - and the introduction of others noref artifact is
> > > currently under discussion.
> > > 
> > > This series tries to tackle the above introducing an RCU debug infrastructure
> > > aimed at spotting incorrect noref pointer usage, in patch one. The
> > > infrastructure is small and must be explicitly enabled via a newly introduced
> > > build option.
> > > 
> > > Patch two uses such infrastructure to track dst noref usage in the networking
> > > stack.
> > > 
> > > Patch 3 and 4 are bugfixes for small buglet found running this infrastructure
> > > on basic scenarios.
> 
> Thank you for the prompt reply!
> > 
> > This patchset does not look like it handles rcu_read_lock() nesting.
> > For example, given code like this:
> > 
> > 	void foo(void)
> > 	{
> > 		rcu_read_lock();
> > 		rcu_track_noref(&key2, &noref2, true);
> > 		do_something();
> > 		rcu_track_noref(&key2, &noref2, false);
> > 		rcu_read_unlock();
> > 	}
> > 
> > 	void bar(void)
> > 	{
> > 		rcu_read_lock();
> > 		rcu_track_noref(&key1, &noref1, true);
> > 		do_something_more();
> > 		foo();
> > 		do_something_else();
> > 		rcu_track_noref(&key1, &noref1, false);
> > 		rcu_read_unlock();
> > 	}
> > 
> > 	void grill(void)
> > 	{
> > 		foo();
> > 	}
> > 
> > It looks like foo()'s rcu_read_unlock() will complain about key1.
> > You could remove foo()'s rcu_read_lock() and rcu_read_unlock(), but
> > that will break the call from grill().
> 
> Actually the code should cope correctly with your example; when foo()'s
> rcu_read_unlock() is called, 'cache' contains:
> 
> { { &key1, &noref1, 1},  // ...
> 
> and when the related __rcu_check_noref() is invoked preempt_count() is
> 2 - because the check is called before decreasing the preempt counter.
> 
> In the main loop inside __rcu_check_noref() we will hit always the
> 'continue' statement because 'cache->store[i].nesting != nesting', so
> no warn will be triggered.

You are right, it was too early, and my example wasn't correct.  How
about this one?

	void foo(void (*f)(struct s *sp), struct s **spp)
	{
		rcu_read_lock();
		rcu_track_noref(&key2, &noref2, true);
		f(spp);
		rcu_track_noref(&key2, &noref2, false);
		rcu_read_unlock();
	}

	void barcb(struct s **spp)
	{
		*spp = &noref3;
		rcu_track_noref(&key3, *spp, true);
	}

	void bar(void)
	{
		struct s *sp;

		rcu_read_lock();
		rcu_track_noref(&key1, &noref1, true);
		do_something_more();
		foo(barcb, &sp);
		do_something_else(sp);
		rcu_track_noref(&key3, sp, false);
		rcu_track_noref(&key1, &noref1, false);
		rcu_read_unlock();
	}

	void grillcb(struct s **spp)
	{
		*spp
	}

	void grill(void)
	{
		foo();
	}

How does the user select the key argument?  It looks like someone
can choose to just pass in NULL -- is that the intent, or am I confused
about this as well?

> > Or am I missing something subtle here?  Given patch 3/4, I suspect not...
> 
> The problem with the code in patch 3/4 is different; currently
> ip_route_input_noref() is basically doing:
> 
> rcu_read_lock();
> 
> rcu_track_noref(&key1, &noref1, true);
> 
> rcu_read_unlock();
> 
> So the rcu lock there silence any RCU based check inside
> ip_route_input_noref() but does not really protect the noref dst.
> 
> Please let me know if the above clarify the scenario.

OK.

I am also concerned about false negatives when the user invokes
rcu_track_noref(..., false) but then leaks the pointer anyway.  Or is
there something you are doing that catches this that I am missing?

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH net-next 0/3] tcp: improving RACK cpu performance
From: Yuchung Cheng @ 2017-10-06 16:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20171005.212612.1580802624897584218.davem@davemloft.net>

On Thu, Oct 5, 2017 at 9:26 PM, David Miller <davem@davemloft.net> wrote:
>
> From: Yuchung Cheng <ycheng@google.com>
> Date: Wed,  4 Oct 2017 12:59:57 -0700
>
> > This patch set improves the CPU consumption of the RACK TCP loss
> > recovery algorithm, in particular for high-speed networks. Currently,
> > for every ACK in recovery RACK can potentially iterate over all sent
> > packets in the write queue. On large BDP networks with non-trivial
> > losses the RACK write queue walk CPU usage becomes unreasonably high.
> >
> > This patch introduces a new queue in TCP that keeps only skbs sent and
> > not yet (s)acked or marked lost, in time order instead of sequence
> > order.  With that, RACK can examine this time-sorted list and only
> > check packets that were sent recently, within the reordering window,
> > per ACK. This is the fastest way without any write queue walks. The
> > number of skbs examined per ACK is reduced by orders of magnitude.
>
> That's a pretty risky way to implement the second SKB list.... but
Agreed. I really appreciate you accepting this change. We tried a few
alternatives but this is the most clean and fastest approach. BBR can
reach 8-10Gbps on 300ms RTT link so both RACK and SACK need to scale
in the era of hundreds MB BDP :-)

> you avoided making sk_buff larger so what can I say :-)
>
> Series applied, thank.

^ permalink raw reply

* Re: [PATCH v2] gso: fix payload length when gso_size is zero
From: Duyck, Alexander H @ 2017-10-06 16:37 UTC (permalink / raw)
  To: netdev@vger.kernel.org, alexey.kodanev@oracle.com
  Cc: davem@davemloft.net, steffen.klassert@secunet.com
In-Reply-To: <1507305755-14393-1-git-send-email-alexey.kodanev@oracle.com>

On Fri, 2017-10-06 at 19:02 +0300, Alexey Kodanev wrote:
> When gso_size reset to zero for the tail segment in skb_segment(), later
> in ipv6_gso_segment(), __skb_udp_tunnel_segment() and gre_gso_segment()
> we will get incorrect results (payload length, pcsum) for that segment.
> inet_gso_segment() already has a check for gso_size before calculating
> payload.
> 
> The issue was found with LTP vxlan & gre tests over ixgbe NIC.
> 
> Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer")
> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>

Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>

> ---
> v2: also added skb_is_gso to gre_gso_segment() and __skb_udp_tunnel_segment()
> 
>  net/ipv4/gre_offload.c | 2 +-
>  net/ipv4/udp_offload.c | 2 +-
>  net/ipv6/ip6_offload.c | 2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> index d5cac99..8c72034 100644
> --- a/net/ipv4/gre_offload.c
> +++ b/net/ipv4/gre_offload.c
> @@ -98,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>  		greh = (struct gre_base_hdr *)skb_transport_header(skb);
>  		pcsum = (__sum16 *)(greh + 1);
>  
> -		if (gso_partial) {
> +		if (gso_partial && skb_is_gso(skb)) {
>  			unsigned int partial_adj;
>  
>  			/* Adjust checksum to account for the fact that
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 0932c85..6401574 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -122,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>  		 * will be using a length value equal to only one MSS sized
>  		 * segment instead of the entire frame.
>  		 */
> -		if (gso_partial) {
> +		if (gso_partial && skb_is_gso(skb)) {
>  			uh->len = htons(skb_shinfo(skb)->gso_size +
>  					SKB_GSO_CB(skb)->data_offset +
>  					skb->head - (unsigned char *)uh);
> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
> index cdb3728..4a87f94 100644
> --- a/net/ipv6/ip6_offload.c
> +++ b/net/ipv6/ip6_offload.c
> @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>  
>  	for (skb = segs; skb; skb = skb->next) {
>  		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
> -		if (gso_partial)
> +		if (gso_partial && skb_is_gso(skb))
>  			payload_len = skb_shinfo(skb)->gso_size +
>  				      SKB_GSO_CB(skb)->data_offset +
>  				      skb->head - (unsigned char *)(ipv6h + 1);

^ permalink raw reply

* Re: [PATCH net] bpf: fix liveness marking
From: Alexei Starovoitov @ 2017-10-06 16:43 UTC (permalink / raw)
  To: Edward Cree, David S . Miller; +Cc: Daniel Borkmann, netdev, kernel-team
In-Reply-To: <759810f5-f6db-0439-1b2d-637cf83ac2f8@solarflare.com>

On 10/6/17 9:33 AM, Edward Cree wrote:
> On 06/10/17 00:20, Alexei Starovoitov wrote:
>> while processing Rx = Ry instruction the verifier does
>> regs[insn->dst_reg] = regs[insn->src_reg]
>> which often clears write mark (when Ry doesn't have it)
>> that was just set by check_reg_arg(Rx) prior to the assignment.
>> That causes mark_reg_read() to keep marking Rx in this block as
>> REG_LIVE_READ (since the logic incorrectly misses that it's
>> screened by the write) and in many of its parents (until lucky
>> write into the same Rx or beginning of the program).
>> That causes is_state_visited() logic to miss many pruning opportunities.
> Good catch!
>> Furthermore mark_reg_read() logic propagates the read mark
>> for BPF_REG_FP as well (though it's readonly) which causes
>> harmless but unnecssary work during is_state_visited().
> Surely it's unnecessary for is_state_visited() to even look at
>  BPF_REG_FP anyway, so in addition to your change we could make
>  states_equal just do `for (i = 0; i < BPF_REG_FP; i++)`?  That
>  might save a bit more time.

yeah. before this patch it was doing extra
memcmp(rold, rcur, ..) on FP reg. This patch saves this memcpy.
The i < BPF_REG_FP would effectively do the same, but I'm not sure
I want to do it just yet.
For net-next I have a bunch of changes for verifier to support bpf_call
and there two different states may have two different FPs.
One FP from caller and one from callee.
So I might still need to do full
for (i = 0; i < MAX_BPF_REG; i++)
      if (!regsafe(..))

>> Note that do_propagate_liveness() skips FP correctly,
>> so do the same in mark_reg_read() as well.
>> It saves 0.2 seconds for the test below
>>
>> program               before  after
>> bpf_lb-DLB_L3.o       2604    2304
>> bpf_lb-DLB_L4.o       11159   3723
>> bpf_lb-DUNKNOWN.o     1116    1110
>> bpf_lxc-DDROP_ALL.o   34566   28004
>> bpf_lxc-DUNKNOWN.o    53267   39026
>> bpf_netdev.o          17843   16943
>> bpf_overlay.o         8672    7929
>> time                  ~11 sec  ~4 sec
>>
>> Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning")
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Very nice numbers!
> Acked-by: Edward Cree <ecree@solarflare.com>

Thanks!

^ permalink raw reply

* [PATCH net-next] openvswitch: add ct_clear action
From: Eric Garver @ 2017-10-06 16:44 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA; +Cc: dev-yBygre7rU0TnMu66kgdUjQ

This adds a ct_clear action for clearing conntrack state. ct_clear is
currently implemented in OVS userspace, but is not backed by an action
in the kernel datapath. This is useful for flows that may modify a
packet tuple after a ct lookup has already occurred.

Signed-off-by: Eric Garver <e@erig.me>
---
 include/uapi/linux/openvswitch.h |  2 ++
 net/openvswitch/actions.c        |  5 +++++
 net/openvswitch/conntrack.c      | 12 ++++++++++++
 net/openvswitch/conntrack.h      |  7 +++++++
 net/openvswitch/flow_netlink.c   |  5 +++++
 5 files changed, 31 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4cab82e..1b6e510e2cc6 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -806,6 +806,7 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +836,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a54a556fcdb5..db9c7f2e662b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 				return err == -EINPROGRESS ? 0 : err;
 			break;
 
+		case OVS_ACTION_ATTR_CT_CLEAR:
+			err = ovs_ct_clear(skb, key);
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_ETH:
 			err = push_eth(skb, key, nla_data(a));
 			break;
@@ -1210,6 +1214,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_ETH:
 			err = pop_eth(skb, key);
 			break;
+
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index d558e882ca0c..f9b73c726ad7 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1129,6 +1129,18 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	return err;
 }
 
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	if (skb_nfct(skb)) {
+		nf_conntrack_put(skb_nfct(skb));
+		nf_ct_set(skb, NULL, 0);
+	}
+
+	ovs_ct_fill_key(skb, key);
+
+	return 0;
+}
+
 static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 			     const struct sw_flow_key *key, bool log)
 {
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index bc7efd1867ab..399dfdd2c4f9 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
 
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 		   const struct ovs_conntrack_info *);
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	return -ENOTSUPP;
 }
 
+static inline int ovs_ct_clear(struct sk_buff *skb,
+			       struct sw_flow_key *key)
+{
+	return -ENOTSUPP;
+}
+
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
 				   struct sw_flow_key *key)
 {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427ce6d1..198bb828d592 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -75,6 +75,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 			break;
 
 		case OVS_ACTION_ATTR_CT:
+		case OVS_ACTION_ATTR_CT_CLEAR:
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
@@ -2479,6 +2480,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
 			[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
 			[OVS_ACTION_ATTR_CT] = (u32)-1,
+			[OVS_ACTION_ATTR_CT_CLEAR] = 0,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
@@ -2620,6 +2622,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			skip_copy = true;
 			break;
 
+		case OVS_ACTION_ATTR_CT_CLEAR:
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_ETH:
 			/* Disallow pushing an Ethernet header if one
 			 * is already present */
-- 
2.12.0

^ permalink raw reply related

* Re: [PATCH] net/ipv6: Convert icmpv6_push_pending_frames to void
From: David Miller @ 2017-10-06 16:52 UTC (permalink / raw)
  To: joe; +Cc: kuznet, yoshfuji, devtimhansen, netdev, linux-kernel
In-Reply-To: <14612f639009193e561bcdb11c5ef5b6f12830ed.1507272313.git.joe@perches.com>

From: Joe Perches <joe@perches.com>
Date: Thu,  5 Oct 2017 23:46:14 -0700

> commit cc71b7b07119 ("net/ipv6: remove unused err variable on
> icmpv6_push_pending_frames") exposed icmpv6_push_pending_frames
> return value not being used.
> 
> Remove now unnecessary int err declarations and uses.
> 
> Miscellanea:
> 
> o Remove unnecessary goto and out: labels
> o Realign arguments
> 
> Signed-off-by: Joe Perches <joe@perches.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 0/7] nfp: extend match and action for flower offload
From: David Miller @ 2017-10-06 16:56 UTC (permalink / raw)
  To: simon.horman; +Cc: jakub.kicinski, netdev, oss-drivers
In-Reply-To: <1507278086-3102-1-git-send-email-simon.horman@netronome.com>

From: Simon Horman <simon.horman@netronome.com>
Date: Fri,  6 Oct 2017 10:21:19 +0200

> Pieter says:
> 
> This series extends flower offload match and action capabilities. It
> specifically adds offload capabilities for matching on MPLS, TTL, TOS
> and flow label. Furthermore offload capabilities for action have been
> expanded to include set ethernet, ipv4, ipv6, tcp and udp headers.

Series applied, thanks Simon.

^ permalink raw reply

* Re: [PATCH net-next v2 0/3] ethtool: support for forward error correction mode setting on a link
From: Roopa Prabhu @ 2017-10-06 17:05 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem@davemloft.net, John W. Linville, netdev@vger.kernel.org,
	Vidya Sagar Ravipati, Dustin Byford, Dave Olson, Casey Leedom,
	Gal Pressman, Andrew Lunn, Manoj Malviya, Santosh Rastapur,
	yuval.mintz, odedw, Ariel Almog, Jeff Kirsher, Dirk van der Merwe
In-Reply-To: <20171005113055.477aec34@cakuba.netronome.com>

On Thu, Oct 5, 2017 at 11:30 AM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Fri, 28 Jul 2017 23:28:26 -0700, Roopa Prabhu wrote:
>> On Fri, Jul 28, 2017 at 9:46 AM, Jakub Kicinski <kubakici@wp.pl> wrote:
>> > On Fri, 28 Jul 2017 07:53:01 -0700, Roopa Prabhu wrote:
>> >> On Thu, Jul 27, 2017 at 7:33 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>> >> > On Thu, 27 Jul 2017 16:47:25 -0700, Roopa Prabhu wrote:
>> >> >> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>> >> >>
>> >> >> Forward Error Correction (FEC) modes i.e Base-R
>> >> >> and Reed-Solomon modes are introduced in 25G/40G/100G standards
>> >> >> for providing good BER at high speeds. Various networking devices
>> >> >> which support 25G/40G/100G provides ability to manage supported FEC
>> >> >> modes and the lack of FEC encoding control and reporting today is a
>> >> >> source for interoperability issues for many vendors.
>> >> >> FEC capability as well as specific FEC mode i.e. Base-R
>> >> >> or RS modes can be requested or advertised through bits D44:47 of base link
>> >> >> codeword.
>> >> >>
>> >> >> This patch set intends to provide option under ethtool to manage and
>> >> >> report FEC encoding settings for networking devices as per IEEE 802.3
>> >> >> bj, bm and by specs.
>> >> >>
>> >> >> v2 :
>> >> >>         - minor patch format fixes and typos pointed out by Andrew
>> >> >>         - there was a pending discussion on the use of 'auto' vs
>> >> >>           'automatic' for fec settings. I have left it as 'auto'
>> >> >>           because in most cases today auto is used in place of
>> >> >>           automatic to represent automatically generated values.
>> >> >>           We use it in other networking config too. I would prefer
>> >> >>           leaving it as auto.
>> >> >
>> >> > On the subject of resetting the values when module is replugged I
>> >> > assume what was previously described remains:
>> >> >  - we always allow users to set the FEC regardless of the module type;
>> >> >  - if user set an incorrect FEC for the module type (or module gets
>> >> >    swapped) the link will be administratively taken down by either
>> >> >    the driver or FW.
>> >> >
>> >> > Is that correct?  Am I misremembering?
>> >>
>> >> yes, correct. And possible future sfp hotplug events can give user-space
>> >> more info to react to module type changes etc.
>> >
>> > OK, if nobody else objects and we go with that - lets make sure we
>> > document clearly those are expected :)  My concern is that if there is
>> > ever 10G + RS FEC standard we don't want to end up in a situation where
>> > some drivers silently ignore FEC settings in 10G and other apply it.
>> > So let's make it clear what the intended Linux behaviour is.  It could
>> > be in the ethtool man page, or the kernel somewhere.
>>
>> sure :), ack. We will document it in the ethtool manpage.
>
> Hi Roopa!  Did you ever publish the ethtool user space patches at all?
> I can't find them...

not yet, they are in my queue. will submit in the next two days.

^ permalink raw reply

* Re: [net-next PATCH 0/3] Improve xdp_monitor samples/bpf
From: David Miller @ 2017-10-06 17:10 UTC (permalink / raw)
  To: brouer; +Cc: netdev, andy, borkmann, alexei.starovoitov
In-Reply-To: <150727927390.4460.3200093291677318710.stgit@firesoul>

From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 06 Oct 2017 10:41:36 +0200

> Here are some improvements to the xdp_monitor tool currently located
> under samples/bpf/.  Once the tools library libbpf become more feature
> complete, xdp_monitor should be converted to use it, and be moved into
> tools/bpf/xdp/ or tools/xdp/.

Series applied.

^ permalink raw reply

* Re: [PATCH] bnx2x: Use pci_ari_enabled() instead of local copy
From: David Miller @ 2017-10-06 17:10 UTC (permalink / raw)
  To: helgaas; +Cc: ariel.elior, netdev, everest-linux-l2, linux-kernel
In-Reply-To: <20171006110030.10980.52268.stgit@bhelgaas-glaptop.roam.corp.google.com>

From: Bjorn Helgaas <helgaas@kernel.org>
Date: Fri, 06 Oct 2017 06:00:30 -0500

> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> Use pci_ari_enabled() from the PCI core instead of the identical local copy
> bnx2x_ari_enabled().  No functional change intended.
> 
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH 0/4] pull request for net-next: batman-adv 2017-10-06
From: David Miller @ 2017-10-06 17:13 UTC (permalink / raw)
  To: sw; +Cc: netdev, b.a.t.m.a.n
In-Reply-To: <20171006135437.26736-1-sw@simonwunderlich.de>

From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Fri,  6 Oct 2017 15:54:33 +0200

> here is a small cleanup pull request of batman-adv to go into net-next.
> 
> Please pull or let me know of any problem!

Pulled, thanks Simon.

^ permalink raw reply

* Re: [PATCH net] ppp: fix race in ppp device destruction
From: David Miller @ 2017-10-06 17:17 UTC (permalink / raw)
  To: g.nault; +Cc: netdev, bgalvani, linux-ppp, paulus, dsahern, gfree.wind
In-Reply-To: <f197b94ec7e16902a9601fdf840d5a8e94c8e91a.1507302309.git.g.nault@alphalink.fr>

From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 6 Oct 2017 17:05:49 +0200

> ppp_release() tries to ensure that netdevices are unregistered before
> decrementing the unit refcount and running ppp_destroy_interface().
> 
> This is all fine as long as the the device is unregistered by
> ppp_release(): the unregister_netdevice() call, followed by
> rtnl_unlock(), guarantee that the unregistration process completes
> before rtnl_unlock() returns.
> 
> However, the device may be unregistered by other means (like
> ppp_nl_dellink()). If this happens right before ppp_release() calling
> rtnl_lock(), then ppp_release() has to wait for the concurrent
> unregistration code to release the lock.
> But rtnl_unlock() releases the lock before completing the device
> unregistration process. This allows ppp_release() to proceed and
> eventually call ppp_destroy_interface() before the unregistration
> process completes. Calling free_netdev() on this partially unregistered
> device will BUG():
 ...
> We could set the ->needs_free_netdev flag on PPP devices and move the
> ppp_destroy_interface() logic in the ->priv_destructor() callback. But
> that'd be quite intrusive as we'd first need to unlink from the other
> channels and units that depend on the device (the ones that used the
> PPPIOCCONNECT and PPPIOCATTACH ioctls).
> 
> Instead, we can just let the netdevice hold a reference on its
> ppp_file. This reference is dropped in ->priv_destructor(), at the very
> end of the unregistration process, so that neither ppp_release() nor
> ppp_disconnect_channel() can call ppp_destroy_interface() in the interim.
> 
> Reported-by: Beniamino Galvani <bgalvani@redhat.com>
> Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion")
> Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* [PATCH net-next v2] vhost_net: do not stall on zerocopy depletion
From: Willem de Bruijn @ 2017-10-06 17:22 UTC (permalink / raw)
  To: netdev; +Cc: davem, mst, jasowang, den, virtualization, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Vhost-net has a hard limit on the number of zerocopy skbs in flight.
When reached, transmission stalls. Stalls cause latency, as well as
head-of-line blocking of other flows that do not use zerocopy.

Instead of stalling, revert to copy-based transmission.

Tested by sending two udp flows from guest to host, one with payload
of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
large flow is redirected to a netem instance with 1MBps rate limit
and deep 1000 entry queue.

  modprobe ifb
  ip link set dev ifb0 up
  tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit

  tc qdisc add dev tap0 ingress
  tc filter add dev tap0 parent ffff: protocol ip \
      u32 match ip dport 8000 0xffff \
      action mirred egress redirect dev ifb0

Before the delay, both flows process around 80K pps. With the delay,
before this patch, both process around 400. After this patch, the
large flow is still rate limited, while the small reverts to its
original rate. See also discussion in the first link, below.

Without rate limiting, {1, 10, 100}x TCP_STREAM tests continued to
send at 100% zerocopy.

The limit in vhost_exceeds_maxpend must be carefully chosen. With
vq->num >> 1, the flows remain correlated. This value happens to
correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
fractions and ensure correctness also for much smaller values of
vq->num, by testing the min() of both explicitly. See also the
discussion in the second link below.

Changes
  v1 -> v2
    - replaced min with typed min_t
    - avoid unnecessary whitespace change

Link:http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com
Link:http://lkml.kernel.org/r/20170819064129.27272-1-den@klaipeden.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/vhost/net.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 58585ec8699e..68677d930e20 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -436,8 +436,8 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;

-	return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
-		== nvq->done_idx;
+	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
+	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
 }

 /* Expects to be always run from workqueue - which acts as
@@ -480,11 +480,6 @@ static void handle_tx(struct vhost_net *net)
 		if (zcopy)
 			vhost_zerocopy_signal_used(net, vq);

-		/* If more outstanding DMAs, queue the work.
-		 * Handle upend_idx wrap around
-		 */
-		if (unlikely(vhost_exceeds_maxpend(net)))
-			break;

 		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
 						ARRAY_SIZE(vq->iov),
@@ -519,8 +514,7 @@ static void handle_tx(struct vhost_net *net)
 		len = msg_data_left(&msg);

 		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-				      nvq->done_idx
+				   && !vhost_exceeds_maxpend(net)
 				   && vhost_net_tx_select_zcopy(net);

 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related

* Linux bridge of hsr and Ethernet interface. Is this valid?
From: Murali Karicheri @ 2017-10-06 17:28 UTC (permalink / raw)
  To: open list:TI NETCP ETHERNET DRIVER

Hello Linux netdev experts,

I have a board that has multiple Ethernet intefaces. two, eth0 and eth1 and 1G interfaces and two are 100M interfaces (eth2 and eth3). I create  hsr interface, hsr0 using eth2 and eth3 using ip link command. Now I want to create a linux bridge as follows:-

brctl addbr my_bridge
brctl addif eth0
brctl addif hsr0

I connect a PC to eth0 interface and another hsr compliant device to eth2 or eth3. Is it a valid scenario? 

I see following description at https://wiki.linuxfoundation.org/networking/bridge where is mentioned that 

=============================
Adding devices to a bridge

The command

 brctl addif //bridgename// //device//

adds the network device device to take part in the bridging of “bridgename.” All the devices contained in a bridge act as one big network. It is not possible to add a device to multiple bridges or bridge a bridge device, because it just wouldn't make any sense! The bridge will take a short amount of time when a device is added to learn the Ethernet addresses on the segment before starting to forward. 

=============================

In this case hsr is already a 802.1D bridge and we are trying to bridge a bridge.

So your expert opinion is needed. Thanks.
-- 
Murali Karicheri
Linux Kernel, Keystone

^ permalink raw reply

* Re: [PATCH] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Willem de Bruijn @ 2017-10-06 17:40 UTC (permalink / raw)
  To: Shmulik Ladkani
  Cc: Pablo Neira Ayuso, netfilter-devel, Willem de Bruijn,
	Network Development, Daniel Borkmann, Rafael Buchbinder,
	Shmulik Ladkani
In-Reply-To: <20171006160242.4403-1-shmulik@nsof.io>

On Fri, Oct 6, 2017 at 12:02 PM, Shmulik Ladkani <shmulik@nsof.io> wrote:
> From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
>
> Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
> support for attaching an eBPF object by an fd, with the
> 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
> IPT_SO_SET_REPLACE call.
>
> However this breaks subsequent iptables calls:
>
>  # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
>  # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
>  iptables: Invalid argument. Run `dmesg' for more information.
>
> That's because iptables works by loading exising rules using
> IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with
> the replacement set.
>
> However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number
> (from the initial "iptables -m bpf" invocation) - so when 2nd invocation
> occurs, userspace passes a bogus fd number, which leads to
> 'bpf_mt_check_v1' to fail.
>
> One suggested solution [1] was to hack iptables userspace, to perform a
> "entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new,
> process-local fd per every 'xt_bpf_info_v1' entry seen.
>
> However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to
> depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects.
>
> This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given
> '.fd' and instead perform an in-kernel lookup for the bpf object given
> the provided '.path'.
>
> It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named
> XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is
> expected to provide the path of the pinned object.
>
> Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved.

I suppose that we have the same problem here. As a matter of fact, the
implementation is the same for both FD_ELF and FD_PINNED, and checks

  f.file->f_op == &bpf_prog_fops

so a file descriptor to a random open ELF file outside a bpf fs would not be
accepted as is.

Anyway, that is outside the scope of this fix.

>
> References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
>             [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2
>
> Cc: Pablo Neira Ayuso <pablo@netfilter.org>
> Cc: Willem de Bruijn <willemb@google.com>
> Reported-by: Rafael Buchbinder <rafi@rbk.ms>
> Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>

Acked-by: Willem de Bruijn <willemb@google.com>

Thanks a lot for fixing this.

> ---
>  include/uapi/linux/netfilter/xt_bpf.h |  1 +
>  kernel/bpf/inode.c                    |  1 +
>  net/netfilter/xt_bpf.c                | 22 ++++++++++++++++++++--
>  3 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
> index b97725af2ac0..da161b56c79e 100644
> --- a/include/uapi/linux/netfilter/xt_bpf.h
> +++ b/include/uapi/linux/netfilter/xt_bpf.h
> @@ -23,6 +23,7 @@ enum xt_bpf_modes {
>         XT_BPF_MODE_FD_PINNED,
>         XT_BPF_MODE_FD_ELF,
>  };
> +#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
>
>  struct xt_bpf_info_v1 {
>         __u16 mode;
> diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
> index e833ed914358..be1dde967208 100644
> --- a/kernel/bpf/inode.c
> +++ b/kernel/bpf/inode.c
> @@ -363,6 +363,7 @@ int bpf_obj_get_user(const char __user *pathname)
>         putname(pname);
>         return ret;
>  }
> +EXPORT_SYMBOL_GPL(bpf_obj_get_user);
>
>  static void bpf_evict_inode(struct inode *inode)
>  {
> diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
> index 38986a95216c..29123934887b 100644
> --- a/net/netfilter/xt_bpf.c
> +++ b/net/netfilter/xt_bpf.c
> @@ -8,6 +8,7 @@
>   */
>
>  #include <linux/module.h>
> +#include <linux/syscalls.h>
>  #include <linux/skbuff.h>
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
> @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
>         return 0;
>  }
>
> +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
> +{
> +       mm_segment_t oldfs = get_fs();
> +       int retval, fd;
> +
> +       set_fs(KERNEL_DS);
> +       fd = bpf_obj_get_user(path);
> +       set_fs(oldfs);
> +       if (fd < 0)
> +               return fd;
> +
> +       retval = __bpf_mt_check_fd(fd, ret);
> +       sys_close(fd);
> +       return retval;
> +}
> +
>  static int bpf_mt_check(const struct xt_mtchk_param *par)
>  {
>         struct xt_bpf_info *info = par->matchinfo;
> @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
>                 return __bpf_mt_check_bytecode(info->bpf_program,
>                                                info->bpf_program_num_elem,
>                                                &info->filter);
> -       else if (info->mode == XT_BPF_MODE_FD_PINNED ||
> -                info->mode == XT_BPF_MODE_FD_ELF)
> +       else if (info->mode == XT_BPF_MODE_FD_ELF)
>                 return __bpf_mt_check_fd(info->fd, &info->filter);
> +       else if (info->mode == XT_BPF_MODE_PATH_PINNED)
> +               return __bpf_mt_check_path(info->path, &info->filter);
>         else
>                 return -EINVAL;
>  }
> --
> 2.14.2
>

^ permalink raw reply

* Re: [PATCH net-next 1/1] [net] bonding: Add NUMA notice
From: Patrick Talbert @ 2017-10-06 17:54 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20171005214636.GM13247@lunn.ch>

On Thu, Oct 5, 2017 at 5:46 PM, Andrew Lunn <andrew@lunn.ch> wrote:
> On Thu, Oct 05, 2017 at 04:23:45PM -0400, Patrick Talbert wrote:
>> Network performance can suffer when a load balancing bond uses slave
>> interfaces which are in different NUMA domains.
>>
>> This compares the NUMA domain of a newly enslaved interface against any
>> existing enslaved interfaces and prints a warning if they do not match.
>
> Hi Patrick
>
> Is there a bonding mode which might actually want to do this? Send on
> the local domain, unless it is overloaded, in which case send it to
> the other domain?
>

I suppose there could theoretically be a bonding mode that could do
that, but currently no such mode exists.

> There is also this talk for netdev:
>
> https://netdevconf.org/2.2/session.html?shochat-devicemgmt-talk

>From reading the abstract there, it sounds like such a device driver
would want to abstract away the numa location of the underlying
devices from the "unified" net device it presents to the kernel.

>
>         Andrew

My goal with the patch is not to prevent some one from bonding
whichever interfaces they want, only to notify them that what they are
doing is *likely* to be less than ideal from a performance
perspective. Even if some theoretical load balancing bonding mode was
intelligent enough to consider NUMA when choosing a transmit
interface, it never has control over the interface traffic is received
on (excluding the strange balance-alb mode).

Patrick

^ permalink raw reply

* [net-next 01/15] i40e: fix a typo in i40e_pf documentation
From: Jeff Kirsher @ 2017-10-06 17:57 UTC (permalink / raw)
  To: davem; +Cc: Rami Rosen, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171006175727.868-1-jeffrey.t.kirsher@intel.com>

From: Rami Rosen <rami.rosen@intel.com>

This patch fixes a typo in i40e_pf object documentation; num_req_vfs
refers to the number of VFs requested for the PF.

Signed-off-by: Rami Rosen <rami.rosen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 439c63cb2a0c..2bc4dd0dbbf1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -350,7 +350,7 @@ struct i40e_pf {
 	u16 num_vmdq_vsis;         /* num vmdq vsis this PF has set up */
 	u16 num_vmdq_qps;          /* num queue pairs per vmdq pool */
 	u16 num_vmdq_msix;         /* num queue vectors per vmdq pool */
-	u16 num_req_vfs;           /* num VFs requested for this VF */
+	u16 num_req_vfs;           /* num VFs requested for this PF */
 	u16 num_vf_qps;            /* num queue pairs per VF */
 	u16 num_lan_qps;           /* num lan queues this PF has set up */
 	u16 num_lan_msix;          /* num queue vectors for the base PF vsi */
-- 
2.14.2

^ permalink raw reply related

* [net-next 00/15][pull request] 40GbE Intel Wired LAN Driver Updates 2017-10-06
From: Jeff Kirsher @ 2017-10-06 17:57 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene

This series contains updates to i40e and i40evf only.

Rami fixes a typo in the code comments.

Mitch adds an ethtool private flag to control source pruning to resolve an
issue where our default behavior is to enable source pruning which breaks ARP
monitoring in channel bonding.  Fixes a couple of register definitions, which
were incorrect.

Jake fixes an issue with multiple logical CPUs per core (simultaneous
multithreading - SMT) and how we set an affinity hint based on the v_idx of
that q_vector, which is an incremental value and might lead to multiple
offline CPUs being assigned to a q_vector.  Instead, we should only assign
hints for CPUs which are online, so look to use cpumask_local_spread().
Also fixed a VF VLAN tag stripping issue, where the flag created to change
this feature was seen as unchangeable.  Lastly, organized and re-numbered
the feature flags.

Alan re-enables PTP L4 for XL710 devices with firmware version 6.0 or
greater, now that the previous bug in the older firmware is fixed.
Implements the PCI error handlers for reset_prepare() and reset_done() to
allow us to handle function level resets.

Alice cleans up code that was added to the incorrect function during a
merge.

Filip adds a change to display an error message when a module is inserted
that does not meet the thermal requirements, Talking Heads "Burning Down
the House" comes to mind.  Also fixed a flow director filter issue where
a variable was not being cleared which stores the filter number to be
removed from the list when the firmware refused to add the requested
filter.

The following are changes since commit cc71b7b071192ac1c288e272fdc3f3877eb96663:
  net/ipv6: remove unused err variable on icmpv6_push_pending_frames
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alan Brady (2):
  i40e: re-enable PTP L4 capabilities for XL710 if FW >6.0
  i40e: implement split PCI error reset handler

Alice Michael (1):
  i40e: fix merge error

Filip Sadowski (2):
  i40e: Display error message if module does not meet thermal
    requirements
  i40e: Properly maintain flow director filters list

Jacob Keller (4):
  i40e/i40evf: spread CPU affinity hints across online CPUs only
  i40evf: enable support for VF VLAN tag stripping control
  i40e: ignore skb->xmit_more when deciding to set RS bit
  i40e/i40evf: organize and re-number feature flags

Jesse Brandeburg (1):
  i40e/i40evf: use DECLARE_BITMAP for state

Mariusz Stachura (1):
  i40e: do not enter PHY debug mode while setting LEDs behaviour

Mitch Williams (3):
  i40e: add private flag to control source pruning
  i40e: redfine I40E_PHY_TYPE_MAX
  i40e: fix incorrect register definition

Rami Rosen (1):
  i40e: fix a typo in i40e_pf documentation

 drivers/net/ethernet/intel/i40e/i40e.h             |  98 +++++++++----------
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |   3 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     |   8 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |  20 ++--
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 104 +++++++++++++++++----
 drivers/net/ethernet/intel/i40e/i40e_register.h    |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |  34 +------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h        |   3 +-
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h    |   3 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h      |   3 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h         |  32 +++----
 drivers/net/ethernet/intel/i40evf/i40evf_main.c    |  31 +++---
 12 files changed, 203 insertions(+), 138 deletions(-)

-- 
2.14.2

^ permalink raw reply

* [net-next 04/15] i40e: re-enable PTP L4 capabilities for XL710 if FW >6.0
From: Jeff Kirsher @ 2017-10-06 17:57 UTC (permalink / raw)
  To: davem; +Cc: Alan Brady, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171006175727.868-1-jeffrey.t.kirsher@intel.com>

From: Alan Brady <alan.brady@intel.com>

Starting with XL710 FW 5.3 PTP L4 was disabled for XL710 due to a bug.  The
bug has since been resolved in XL710 FW >6.0 and PTP L4 can now be
re-enabled on those devices with updated firmware.

Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index d2bb4f17c89e..85132eee9f64 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9074,6 +9074,11 @@ static int i40e_sw_init(struct i40e_pf *pf)
 	    (pf->hw.aq.fw_maj_ver >= 5)))
 		pf->hw_features |= I40E_HW_USE_SET_LLDP_MIB;
 
+	/* Enable PTP L4 if FW > v6.0 */
+	if (pf->hw.mac.type == I40E_MAC_XL710 &&
+	    pf->hw.aq.fw_maj_ver >= 6)
+		pf->hw_features |= I40E_HW_PTP_L4_CAPABLE;
+
 	if (pf->hw.func_caps.vmdq) {
 		pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI;
 		pf->flags |= I40E_FLAG_VMDQ_ENABLED;
-- 
2.14.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox