Netdev List
 help / color / mirror / Atom feed
* [bpf-next V2 PATCH 0/4] xdp: introduce bulking for ndo_xdp_xmit API
From: Jesper Dangaard Brouer @ 2018-05-11 18:11 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson

This patchset change ndo_xdp_xmit API to take a bulk of xdp frames.

When kernel is compiled with CONFIG_RETPOLINE, every indirect function
pointer (branch) call hurts performance. For XDP this have a huge
negative performance impact.

This patchset reduce the needed (indirect) calls to ndo_xdp_xmit, but
also prepares for further optimizations.  The DMA APIs use of indirect
function pointer calls is the primary source the regression.  It is
left for a followup patchset, to use bulking calls towards the DMA API
(via the scatter-gatter calls).

The other advantage of this API change is that drivers can easier
amortize the cost of any sync/locking scheme, over the bulk of
packets.  The assumption of the current API is that the driver
implemementing the NDO will also allocate a dedicated XDP TX queue for
every CPU in the system.  Which is not always possible or practical to
configure. E.g. ixgbe cannot load an XDP program on a machine with
more than 96 CPUs, due to limited hardware TX queues.  E.g. virtio_net
is hard to configure as it requires manually increasing the
queues. E.g. tun driver chooses to use a per XDP frame producer lock
modulo smp_processor_id over avail queues.

---

Jesper Dangaard Brouer (4):
      bpf: devmap introduce dev_map_enqueue
      bpf: devmap prepare xdp frames for bulking
      xdp: add tracepoint for devmap like cpumap have
      xdp: change ndo_xdp_xmit API to support bulking


 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 ++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 +++-
 drivers/net/tun.c                             |   37 ++++---
 drivers/net/virtio_net.c                      |   66 +++++++++---
 include/linux/bpf.h                           |   16 ++-
 include/linux/netdevice.h                     |   14 ++-
 include/net/page_pool.h                       |    5 +
 include/net/xdp.h                             |    1 
 include/trace/events/xdp.h                    |   50 +++++++++
 kernel/bpf/devmap.c                           |  134 ++++++++++++++++++++++++-
 net/core/filter.c                             |   19 +---
 net/core/xdp.c                                |   20 +++-
 samples/bpf/xdp_monitor_kern.c                |   49 +++++++++
 samples/bpf/xdp_monitor_user.c                |   69 +++++++++++++
 15 files changed, 446 insertions(+), 83 deletions(-)

^ permalink raw reply

* [bpf-next V2 PATCH 1/4] bpf: devmap introduce dev_map_enqueue
From: Jesper Dangaard Brouer @ 2018-05-11 18:11 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
In-Reply-To: <152606228984.30376.4434523160732739487.stgit@firesoul>

Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.

V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |   14 +++++++++++---
 include/trace/events/xdp.h |    9 ++++++++-
 kernel/bpf/devmap.c        |   37 +++++++++++++++++++++++++++++++------
 net/core/filter.c          |   15 ++-------------
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a38e474bf7ee..8527964da402 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -485,14 +485,15 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 
@@ -571,6 +572,14 @@ static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
 
+struct xdp_buff;
+struct bpf_dtab_netdev;
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	return 0;
+}
+
 static inline
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -585,7 +594,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }
 
-struct xdp_buff;
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 				  struct xdp_buff *xdp,
 				  struct net_device *dev_rx)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8989a92c571a..96104610d40e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+	struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
 	 (!map ? 0 :						\
 	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	   ((struct net_device *)fwd)->ifindex : 0)))
+	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..808808bf2bf2 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,18 +48,21 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+/* objects in the map */
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
 };
 
+/* bpf map container */
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
@@ -240,21 +243,43 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+	int err;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* TODO: implement a bulking/enqueue step later */
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	if (err)
+		return err;
+
+	return 0;
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index ca60d2872da4..284b9ec3d50f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3024,20 +3024,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP: {
-		struct net_device *dev = fwd;
-		struct xdp_frame *xdpf;
+		struct bpf_dtab_netdev *dst = fwd;
 
-		if (!dev->netdev_ops->ndo_xdp_xmit)
-			return -EOPNOTSUPP;
-
-		xdpf = convert_to_xdp_frame(xdp);
-		if (unlikely(!xdpf))
-			return -EOVERFLOW;
-
-		/* TODO: move to inside map code instead, for bulk support
-		 * err = dev_map_enqueue(dev, xdp);
-		 */
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		err = dev_map_enqueue(dst, xdp);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

^ permalink raw reply related

* [bpf-next V2 PATCH 2/4] bpf: devmap prepare xdp frames for bulking
From: Jesper Dangaard Brouer @ 2018-05-11 18:12 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
In-Reply-To: <152606228984.30376.4434523160732739487.stgit@firesoul>

Like cpumap create queue for xdp frames that will be bulked.  For now,
this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |   77 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 808808bf2bf2..cab72c100bb5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -54,11 +54,18 @@
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
 /* objects in the map */
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+			 struct xdp_bulk_queue *bq)
+{
+	unsigned int processed = 0, drops = 0;
+	struct net_device *dev = obj->dev;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+		int err;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		if (err) {
+			drops++;
+			xdp_return_frame(xdpf);
+		}
+		processed++;
+	}
+	bq->count = 0;
+
+	return 0;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct xdp_bulk_queue *bq;
 		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
@@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq);
 		netdev = dev->dev;
 		if (likely(netdev->netdev_ops->ndo_xdp_flush))
 			netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq);
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 {
 	struct net_device *dev = dst->dev;
@@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	/* TODO: implement a bulking/enqueue step later */
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	err = bq_enqueue(dst, xdpf);
 	if (err)
 		return err;
 
@@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
 		struct net_device *fl = dev->dev;
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq);
+
 			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
@@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
 			kfree(dev);

^ permalink raw reply related

* [bpf-next V2 PATCH 3/4] xdp: add tracepoint for devmap like cpumap have
From: Jesper Dangaard Brouer @ 2018-05-11 18:12 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
In-Reply-To: <152606228984.30376.4434523160732739487.stgit@firesoul>

Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.

The xdp_monitor sample/tool is updated to use this new tracepoint.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h            |    6 ++++-
 include/trace/events/xdp.h     |   39 +++++++++++++++++++++++++++++++++++
 kernel/bpf/devmap.c            |   25 ++++++++++++++++++-----
 net/core/filter.c              |    2 +-
 samples/bpf/xdp_monitor_kern.c |   39 +++++++++++++++++++++++++++++++++++
 samples/bpf/xdp_monitor_user.c |   44 +++++++++++++++++++++++++++++++++++++++-
 6 files changed, 146 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8527964da402..3dda20a29cdb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -489,7 +489,8 @@ struct xdp_buff;
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
@@ -575,7 +576,8 @@ static inline void __dev_map_flush(struct bpf_map *map)
 struct xdp_buff;
 struct bpf_dtab_netdev;
 static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	return 0;
 }
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 96104610d40e..2e9ef0650144 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue,
 		  __entry->to_cpu)
 );
 
+TRACE_EVENT(xdp_devmap_xmit,
+
+	TP_PROTO(const struct bpf_map *map, u32 map_index,
+		 int sent, int drops,
+		 const struct net_device *from_dev,
+		 const struct net_device *to_dev),
+
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(u32, map_index)
+		__field(int, drops)
+		__field(int, sent)
+		__field(int, from_ifindex)
+		__field(int, to_ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map->id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->map_index	= map_index;
+		__entry->drops		= drops;
+		__entry->sent		= sent;
+		__entry->from_ifindex	= from_dev->ifindex;
+		__entry->to_ifindex	= to_dev->ifindex;
+	),
+
+	TP_printk("ndo_xdp_xmit"
+		  " map_id=%d map_index=%d action=%s"
+		  " sent=%d drops=%d"
+		  " from_ifindex=%d to_ifindex=%d",
+		  __entry->map_id, __entry->map_index,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->sent, __entry->drops,
+		  __entry->from_ifindex, __entry->to_ifindex)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cab72c100bb5..6f84100723b0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,7 @@
 #include <linux/bpf.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -57,6 +58,7 @@
 #define DEV_MAP_BULK_SIZE 16
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
 	unsigned int count;
 };
 
@@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
-	unsigned int processed = 0, drops = 0;
 	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			drops++;
 			xdp_return_frame(xdpf);
 		}
-		processed++;
+		sent++;
 	}
 	bq->count = 0;
 
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev);
+	bq->dev_rx = NULL;
 	return 0;
 }
 
@@ -301,18 +306,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(obj, bq);
 
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
 	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
@@ -325,7 +340,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = bq_enqueue(dst, xdpf);
+	err = bq_enqueue(dst, xdpf, dev_rx);
 	if (err)
 		return err;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 284b9ec3d50f..68277358cca9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3026,7 +3026,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	case BPF_MAP_TYPE_DEVMAP: {
 		struct bpf_dtab_netdev *dst = fwd;
 
-		err = dev_map_enqueue(dst, xdp);
+		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..2854aa0665ea 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -208,3 +208,42 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 
 	return 0;
 }
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	u32 map_index;		//	offset:16; size:4; signed:0;
+	int drops;		//	offset:20; size:4; signed:1;
+	int sent;		//	offset:24; size:4; signed:1;
+	int from_ifindex;	//	offset:28; size:4; signed:1;
+	int to_ifindex;		//	offset:32; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->sent;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	rec->info += 1;
+
+	return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 894bc64c2cac..4aaf1ab1927d 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -141,6 +141,7 @@ struct stats_record {
 	struct record_u64 xdp_exception[XDP_ACTION_MAX];
 	struct record xdp_cpumap_kthread;
 	struct record xdp_cpumap_enqueue[MAX_CPUS];
+	struct record xdp_devmap_xmit;
 };
 
 static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -397,7 +398,7 @@ static void stats_print(struct stats_record *stats_rec,
 			info = calc_info(r, p, t);
 			if (info > 0)
 				i_str = "sched";
-			if (pps > 0)
+			if (pps > 0 || drop > 0)
 				printf(fmt1, "cpumap-kthread",
 				       i, pps, drop, info, i_str);
 		}
@@ -409,6 +410,42 @@ static void stats_print(struct stats_record *stats_rec,
 		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
 
+	/* devmap ndo_xdp_xmit stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_devmap_xmit;
+		prev = &stats_prev->xdp_devmap_xmit;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				i_str = "bulk-average";
+				info = (pps+drop) / info; /* calc avg bulk */
+			}
+			if (pps > 0 || drop > 0)
+				printf(fmt1, "devmap-xmit",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0) {
+			i_str = "bulk-average";
+			info = (pps+drop) / info; /* calc avg bulk */
+		}
+		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+	}
+
 	printf("\n");
 }
 
@@ -437,6 +474,9 @@ static bool stats_collect(struct stats_record *rec)
 	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
 	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
+	fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+	map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
 	return true;
 }
 
@@ -480,6 +520,7 @@ static struct stats_record *alloc_stats_record(void)
 
 	rec_sz = sizeof(struct datarec);
 	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+	rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +539,7 @@ static void free_stats_record(struct stats_record *r)
 		free(r->xdp_exception[i].cpu);
 
 	free(r->xdp_cpumap_kthread.cpu);
+	free(r->xdp_devmap_xmit.cpu);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		free(r->xdp_cpumap_enqueue[i].cpu);

^ permalink raw reply related

* [bpf-next V2 PATCH 4/4] xdp: change ndo_xdp_xmit API to support bulking
From: Jesper Dangaard Brouer @ 2018-05-11 18:12 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
In-Reply-To: <152606228984.30376.4434523160732739487.stgit@firesoul>

This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.

When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.

Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.

Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 +++++++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 ++++++--
 drivers/net/tun.c                             |   37 +++++++++-----
 drivers/net/virtio_net.c                      |   66 +++++++++++++++++++------
 include/linux/netdevice.h                     |   14 +++--
 include/net/page_pool.h                       |    5 +-
 include/net/xdp.h                             |    1 
 include/trace/events/xdp.h                    |   10 ++--
 kernel/bpf/devmap.c                           |   33 ++++++++-----
 net/core/filter.c                             |    4 +-
 net/core/xdp.c                                |   20 ++++++--
 samples/bpf/xdp_monitor_kern.c                |   10 ++++
 samples/bpf/xdp_monitor_user.c                |   35 +++++++++++--
 14 files changed, 206 insertions(+), 78 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5efa68de935b..9b698c5acd05 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * @dev: netdev
  * @xdp: XDP buffer
  *
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (test_bit(__I40E_VSI_DOWN, vsi->state))
 		return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
-	if (err != I40E_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+		if (err != I40E_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index fdd2c55f03a6..eb8804b3d7b6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6652b201df5b..9645619f7729 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+			  struct xdp_frame **frames)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
-	if (err != IXGBE_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+		if (err != IXGBE_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 static void ixgbe_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3c04ab9752a..4fe0c75c5e0b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -70,6 +70,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
 	u32 numqueues;
-	int ret = 0;
+	int drops = 0;
+	int cnt = n;
+	int i;
 
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
 	if (!numqueues) {
-		ret = -ENOSPC;
-		goto out;
+		rcu_read_unlock();
+		return -ENXIO; /* Caller will free/return all frames */
 	}
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Encode the XDP flag into lowest bit for consumer to differ
-	 * XDP buffer from sk_buff.
-	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
-		this_cpu_inc(tun->pcpu_stats->tx_dropped);
-		ret = -ENOSPC;
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdp = frames[i];
+		/* Encode the XDP flag into lowest bit for consumer to differ
+		 * XDP buffer from sk_buff.
+		 */
+		void *frame = tun_xdp_to_ptr(xdp);
+
+		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+			this_cpu_inc(tun->pcpu_stats->tx_dropped);
+			xdp_return_frame_rx_napi(xdp);
+			drops++;
+		}
 	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
 
-out:
 	rcu_read_unlock();
-	return ret;
+	return cnt - drops;
 }
 
 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, frame);
+	return tun_xdp_xmit(dev, 1, &frame);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f34794a76c4d..39a0783d1cde 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+				   struct send_queue *sq,
+				   struct xdp_frame *xdpf)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	struct xdp_frame *xdpf_sent;
-	struct send_queue *sq;
-	unsigned int len;
-	unsigned int qp;
 	int err;
 
-	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-	sq = &vi->sq[qp];
-
-	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-		xdp_return_frame(xdpf_sent);
-
 	/* virtqueue want to use data area in-front of packet */
 	if (unlikely(xdpf->metasize > 0))
 		return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 	return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+				   struct xdp_frame *xdpf)
+{
+	struct xdp_frame *xdpf_sent;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
+
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+			    int n, struct xdp_frame **frames)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
+	struct xdp_frame *xdpf_sent;
 	struct bpf_prog *xdp_prog;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+	int drops = 0;
+	int err;
+	int i;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	return __virtnet_xdp_xmit(vi, xdpf);
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+
+		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+		if (err) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+	return n - drops;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..debdb6286170 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,9 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- *	This function is used to submit a XDP packet for transmit on a
- *	netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ *	This function is used to submit @n XDP packets for transmit on a
+ *	netdevice. Returns number of frames successfully transmitted, frames
+ *	that got dropped are freed/returned via xdp_return_frame().
+ *	Returns negative number, means general error invoking ndo, meaning
+ *	no frames were xmit'ed and core-caller will free all frames.
+ *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1375,8 +1379,8 @@ struct net_device_ops {
 						       int needed_headroom);
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
-	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_frame *xdp);
+	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
+						struct xdp_frame **xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c79087153148..694d055e01ef 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
 
-static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+static inline void page_pool_put_page(struct page_pool *pool,
+				      struct page *page, bool allow_direct)
 {
 	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, false);
+	__page_pool_put_page(pool, page, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0b689cf561c7..7ad779237ae8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 2e9ef0650144..1ecf4c67fcf7 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit,
 	TP_PROTO(const struct bpf_map *map, u32 map_index,
 		 int sent, int drops,
 		 const struct net_device *from_dev,
-		 const struct net_device *to_dev),
+		 const struct net_device *to_dev, int err),
 
-	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
 
 	TP_STRUCT__entry(
 		__field(int, map_id)
@@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__field(int, sent)
 		__field(int, from_ifindex)
 		__field(int, to_ifindex)
+		__field(int, err)
 	),
 
 	TP_fast_assign(
@@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__entry->sent		= sent;
 		__entry->from_ifindex	= from_dev->ifindex;
 		__entry->to_ifindex	= to_dev->ifindex;
+		__entry->err		= err;
 	),
 
 	TP_printk("ndo_xdp_xmit"
 		  " map_id=%d map_index=%d action=%s"
 		  " sent=%d drops=%d"
-		  " from_ifindex=%d to_ifindex=%d",
+		  " from_ifindex=%d to_ifindex=%d err=%d",
 		  __entry->map_id, __entry->map_index,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->sent, __entry->drops,
-		  __entry->from_ifindex, __entry->to_ifindex)
+		  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
 );
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 6f84100723b0..4dd8f0e3a8d9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -222,7 +222,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
-	int sent = 0, drops = 0;
+	int sent = 0, drops = 0, err = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -234,23 +234,32 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	for (i = 0; i < bq->count; i++) {
-		struct xdp_frame *xdpf = bq->q[i];
-		int err;
-
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err) {
-			drops++;
-			xdp_return_frame(xdpf);
-		}
-		sent++;
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	if (sent < 0) {
+		err = sent;
+		sent = 0;
+		goto error;
 	}
+	drops = bq->count - sent;
+out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
-			      sent, drops, bq->dev_rx, dev);
+			      sent, drops, bq->dev_rx, dev, err);
 	bq->dev_rx = NULL;
 	return 0;
+error:
+	/* If ndo_xdp_xmit fails with an errno, no frames have been
+	 * xmit'ed and it's our responsibility to them free all.
+	 */
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		/* RX path under NAPI protection, can return frames faster */
+		xdp_return_frame_rx_napi(xdpf);
+		drops++;
+	}
+	goto out;
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
diff --git a/net/core/filter.c b/net/core/filter.c
index 68277358cca9..1f663404e36a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3008,8 +3008,8 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-	if (err)
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	if (err <= 0)
 		return err;
 	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bf6758f74339..cb8c4e061a5a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,7 +308,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection.  The @napi_direct boolian
+ * is used for those calls sites.  Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
 		if (xa)
-			page_pool_put_page(xa->page_pool, page);
+			page_pool_put_page(xa->page_pool, page, napi_direct);
 		else
 			put_page(page);
 		rcu_read_unlock();
@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	xdp_return(xdpf->data, &xdpf->mem);
+	__xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+	__xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	xdp_return(xdp->data, &xdp->rxq->mem);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 2854aa0665ea..ad10fe700d7d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
 	u64 processed;
 	u64 dropped;
 	u64 info;
+	u64 err;
 };
 #define MAX_CPUS 64
 
@@ -228,6 +229,7 @@ struct devmap_xmit_ctx {
 	int sent;		//	offset:24; size:4; signed:1;
 	int from_ifindex;	//	offset:28; size:4; signed:1;
 	int to_ifindex;		//	offset:32; size:4; signed:1;
+	int err;		//	offset:36; size:4; signed:1;
 };
 
 SEC("tracepoint/xdp/xdp_devmap_xmit")
@@ -245,5 +247,13 @@ int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
 	/* Record bulk events, then userspace can calc average bulk size */
 	rec->info += 1;
 
+	/* Record error cases, where no frame were sent */
+	if (ctx->err)
+		rec->err++;
+
+	/* Catch API error of drv ndo_xdp_xmit sent more than count */
+	if (ctx->drops < 0)
+		rec->err++;
+
 	return 1;
 }
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 4aaf1ab1927d..0ec7967dd32a 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -117,6 +117,7 @@ struct datarec {
 	__u64 processed;
 	__u64 dropped;
 	__u64 info;
+	__u64 err;
 };
 #define MAX_CPUS 64
 
@@ -152,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 	__u64 sum_processed = 0;
 	__u64 sum_dropped = 0;
 	__u64 sum_info = 0;
+	__u64 sum_err = 0;
 	int i;
 
 	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -170,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 		sum_dropped        += values[i].dropped;
 		rec->cpu[i].info = values[i].info;
 		sum_info        += values[i].info;
+		rec->cpu[i].err = values[i].err;
+		sum_err        += values[i].err;
 	}
 	rec->total.processed = sum_processed;
 	rec->total.dropped   = sum_dropped;
 	rec->total.info      = sum_info;
+	rec->total.err       = sum_err;
 	return true;
 }
 
@@ -274,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
 	return pps;
 }
 
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->err - p->err;
+		pps = packets / period;
+	}
+	return pps;
+}
+
 static void stats_print(struct stats_record *stats_rec,
 			struct stats_record *stats_prev,
 			bool err_only)
@@ -412,11 +429,12 @@ static void stats_print(struct stats_record *stats_rec,
 
 	/* devmap ndo_xdp_xmit stats */
 	{
-		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
-		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
 		struct record *rec, *prev;
-		double drop, info;
+		double drop, info, err;
 		char *i_str = "";
+		char *err_str = "";
 
 		rec  =  &stats_rec->xdp_devmap_xmit;
 		prev = &stats_prev->xdp_devmap_xmit;
@@ -428,22 +446,29 @@ static void stats_print(struct stats_record *stats_rec,
 			pps  = calc_pps(r, p, t);
 			drop = calc_drop(r, p, t);
 			info = calc_info(r, p, t);
+			err  = calc_err(r, p, t);
 			if (info > 0) {
 				i_str = "bulk-average";
 				info = (pps+drop) / info; /* calc avg bulk */
 			}
+			if (err > 0)
+				err_str = "drv-err";
 			if (pps > 0 || drop > 0)
 				printf(fmt1, "devmap-xmit",
-				       i, pps, drop, info, i_str);
+				       i, pps, drop, info, i_str, err_str);
 		}
 		pps = calc_pps(&rec->total, &prev->total, t);
 		drop = calc_drop(&rec->total, &prev->total, t);
 		info = calc_info(&rec->total, &prev->total, t);
+		err  = calc_err(&rec->total, &prev->total, t);
 		if (info > 0) {
 			i_str = "bulk-average";
 			info = (pps+drop) / info; /* calc avg bulk */
 		}
-		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+		if (err > 0)
+			err_str = "drv-err";
+		printf(fmt2, "devmap-xmit", "total", pps, drop,
+		       info, i_str, err_str);
 	}
 
 	printf("\n");

^ permalink raw reply related

* Re: [PATCH net V2] tun: fix use after free for ptr_ring
From: Michael S. Tsirkin @ 2018-05-11 18:19 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, xiyou.wangcong, eric.dumazet
In-Reply-To: <1526006965-9124-1-git-send-email-jasowang@redhat.com>

On Fri, May 11, 2018 at 10:49:25AM +0800, Jason Wang wrote:
> We used to initialize ptr_ring during TUNSETIFF, this is because its
> size depends on the tx_queue_len of netdevice. And we try to clean it
> up when socket were detached from netdevice. A race were spotted when
> trying to do uninit during a read which will lead a use after free for
> pointer ring. Solving this by always initialize a zero size ptr_ring
> in open() and do resizing during TUNSETIFF, and then we can safely do
> cleanup during close(). With this, there's no need for the workaround
> that was introduced by commit 4df0bfc79904 ("tun: fix a memory leak
> for tfile->tx_array").
> 
> Reported-by: syzbot+e8b902c3c3fadf0a9dba@syzkaller.appspotmail.com
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Cong Wang <xiyou.wangcong@gmail.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Fixes: 1576d9860599 ("tun: switch to use skb array for tx")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

and will you send the revert pls then?


> ---
> Changes from v1:
> - free ptr_ring during close()
> - use tun_ptr_free() during resie for safety
> ---
>  drivers/net/tun.c | 27 ++++++++++++---------------
>  1 file changed, 12 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ef33950..9fbbb32 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -681,15 +681,6 @@ static void tun_queue_purge(struct tun_file *tfile)
>  	skb_queue_purge(&tfile->sk.sk_error_queue);
>  }
>  
> -static void tun_cleanup_tx_ring(struct tun_file *tfile)
> -{
> -	if (tfile->tx_ring.queue) {
> -		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
> -		xdp_rxq_info_unreg(&tfile->xdp_rxq);
> -		memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
> -	}
> -}
> -
>  static void __tun_detach(struct tun_file *tfile, bool clean)
>  {
>  	struct tun_file *ntfile;
> @@ -736,7 +727,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>  			    tun->dev->reg_state == NETREG_REGISTERED)
>  				unregister_netdevice(tun->dev);
>  		}
> -		tun_cleanup_tx_ring(tfile);
> +		if (tun)
> +			xdp_rxq_info_unreg(&tfile->xdp_rxq);
>  		sock_put(&tfile->sk);
>  	}
>  }
> @@ -783,14 +775,14 @@ static void tun_detach_all(struct net_device *dev)
>  		tun_napi_del(tun, tfile);
>  		/* Drop read queue */
>  		tun_queue_purge(tfile);
> +		xdp_rxq_info_unreg(&tfile->xdp_rxq);
>  		sock_put(&tfile->sk);
> -		tun_cleanup_tx_ring(tfile);
>  	}
>  	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
>  		tun_enable_queue(tfile);
>  		tun_queue_purge(tfile);
> +		xdp_rxq_info_unreg(&tfile->xdp_rxq);
>  		sock_put(&tfile->sk);
> -		tun_cleanup_tx_ring(tfile);
>  	}
>  	BUG_ON(tun->numdisabled != 0);
>  
> @@ -834,7 +826,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>  	}
>  
>  	if (!tfile->detached &&
> -	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
> +	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
> +			    GFP_KERNEL, tun_ptr_free)) {
>  		err = -ENOMEM;
>  		goto out;
>  	}
> @@ -3219,6 +3212,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  					    &tun_proto, 0);
>  	if (!tfile)
>  		return -ENOMEM;
> +	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
> +		sk_free(&tfile->sk);
> +		return -ENOMEM;
> +	}
> +
>  	RCU_INIT_POINTER(tfile->tun, NULL);
>  	tfile->flags = 0;
>  	tfile->ifindex = 0;
> @@ -3239,8 +3237,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  
>  	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
>  
> -	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
> -
>  	return 0;
>  }
>  
> @@ -3249,6 +3245,7 @@ static int tun_chr_close(struct inode *inode, struct file *file)
>  	struct tun_file *tfile = file->private_data;
>  
>  	tun_detach(tfile, true);
> +	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
>  
>  	return 0;
>  }
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH v3] net: phy: DP83TC811: Introduce support for the DP83TC811 phy
From: Andrew Lunn @ 2018-05-11 18:30 UTC (permalink / raw)
  To: Dan Murphy; +Cc: f.fainelli, netdev, linux-kernel
In-Reply-To: <20180511180819.5036-1-dmurphy@ti.com>

On Fri, May 11, 2018 at 01:08:19PM -0500, Dan Murphy wrote:
> Add support for the DP83811 phy.
> 
> The DP83811 supports both rgmii and sgmii interfaces.
> There are 2 part numbers for this the DP83TC811R does not
> reliably support the SGMII interface but the DP83TC811S will.
> 
> There is not a way to differentiate these parts from the
> hardware or register set.  So this is controlled via the DT
> to indicate which phy mode is required.  Or the part can be
> strapped to a certain interface.
> 
> Data sheet can be found here:
> http://www.ti.com/product/DP83TC811S-Q1/description
> http://www.ti.com/product/DP83TC811R-Q1/description
> 
> Signed-off-by: Dan Murphy <dmurphy@ti.com>

Hi Dan

It is normal to add any Reviewed-by, or Tested-by: tags you received,
so long as you don't make major changes.

    Andrew

^ permalink raw reply

* Re: possible deadlock in sk_diag_fill
From: Andrei Vagin @ 2018-05-11 18:33 UTC (permalink / raw)
  To: syzbot; +Cc: avagin, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <000000000000169606056b793179@google.com>

On Sat, May 05, 2018 at 10:59:02AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    c1c07416cdd4 Merge tag 'kbuild-fixes-v4.17' of git://git.k..
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=12164c97800000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=5a1dc06635c10d27
> dashboard link: https://syzkaller.appspot.com/bug?extid=c1872be62e587eae9669
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> userspace arch: i386
> 
> Unfortunately, I don't have any reproducer for this crash yet.
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+c1872be62e587eae9669@syzkaller.appspotmail.com
> 
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 4.17.0-rc3+ #59 Not tainted
> ------------------------------------------------------
> syz-executor1/25282 is trying to acquire lock:
> 000000004fddf743 (&(&u->lock)->rlock/1){+.+.}, at: sk_diag_dump_icons
> net/unix/diag.c:82 [inline]
> 000000004fddf743 (&(&u->lock)->rlock/1){+.+.}, at:
> sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
> 
> but task is already holding lock:
> 00000000b6895645 (rlock-AF_UNIX){+.+.}, at: spin_lock
> include/linux/spinlock.h:310 [inline]
> 00000000b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_dump_icons
> net/unix/diag.c:64 [inline]
> 00000000b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_fill.isra.5+0x94e/0x10d0
> net/unix/diag.c:144
> 
> which lock already depends on the new lock.

In the code, we have a comment which explains why it is safe to take this lock

/*
 * The state lock is outer for the same sk's
 * queue lock. With the other's queue locked it's
 * OK to lock the state.
 */
unix_state_lock_nested(req);

It is a question how to explain this to lockdep.

> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (rlock-AF_UNIX){+.+.}:
>        __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>        _raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
>        skb_queue_tail+0x26/0x150 net/core/skbuff.c:2900
>        unix_dgram_sendmsg+0xf77/0x1730 net/unix/af_unix.c:1797
>        sock_sendmsg_nosec net/socket.c:629 [inline]
>        sock_sendmsg+0xd5/0x120 net/socket.c:639
>        ___sys_sendmsg+0x525/0x940 net/socket.c:2117
>        __sys_sendmmsg+0x3bb/0x6f0 net/socket.c:2205
>        __compat_sys_sendmmsg net/compat.c:770 [inline]
>        __do_compat_sys_sendmmsg net/compat.c:777 [inline]
>        __se_compat_sys_sendmmsg net/compat.c:774 [inline]
>        __ia32_compat_sys_sendmmsg+0x9f/0x100 net/compat.c:774
>        do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
>        do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
>        entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
> 
> -> #0 (&(&u->lock)->rlock/1){+.+.}:
>        lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
>        _raw_spin_lock_nested+0x28/0x40 kernel/locking/spinlock.c:354
>        sk_diag_dump_icons net/unix/diag.c:82 [inline]
>        sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
>        sk_diag_dump net/unix/diag.c:178 [inline]
>        unix_diag_dump+0x35f/0x550 net/unix/diag.c:206
>        netlink_dump+0x507/0xd20 net/netlink/af_netlink.c:2226
>        __netlink_dump_start+0x51a/0x780 net/netlink/af_netlink.c:2323
>        netlink_dump_start include/linux/netlink.h:214 [inline]
>        unix_diag_handler_dump+0x3f4/0x7b0 net/unix/diag.c:307
>        __sock_diag_cmd net/core/sock_diag.c:230 [inline]
>        sock_diag_rcv_msg+0x2e0/0x3d0 net/core/sock_diag.c:261
>        netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
>        sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:272
>        netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
>        netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
>        netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
>        sock_sendmsg_nosec net/socket.c:629 [inline]
>        sock_sendmsg+0xd5/0x120 net/socket.c:639
>        sock_write_iter+0x35a/0x5a0 net/socket.c:908
>        call_write_iter include/linux/fs.h:1784 [inline]
>        new_sync_write fs/read_write.c:474 [inline]
>        __vfs_write+0x64d/0x960 fs/read_write.c:487
>        vfs_write+0x1f8/0x560 fs/read_write.c:549
>        ksys_write+0xf9/0x250 fs/read_write.c:598
>        __do_sys_write fs/read_write.c:610 [inline]
>        __se_sys_write fs/read_write.c:607 [inline]
>        __ia32_sys_write+0x71/0xb0 fs/read_write.c:607
>        do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
>        do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
>        entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(rlock-AF_UNIX);
>                                lock(&(&u->lock)->rlock/1);
>                                lock(rlock-AF_UNIX);
>   lock(&(&u->lock)->rlock/1);
> 
>  *** DEADLOCK ***
> 
> 5 locks held by syz-executor1/25282:
>  #0: 000000003919e1bd (sock_diag_mutex){+.+.}, at: sock_diag_rcv+0x1b/0x40
> net/core/sock_diag.c:271
>  #1: 000000004f328d3e (sock_diag_table_mutex){+.+.}, at: __sock_diag_cmd
> net/core/sock_diag.c:225 [inline]
>  #1: 000000004f328d3e (sock_diag_table_mutex){+.+.}, at:
> sock_diag_rcv_msg+0x169/0x3d0 net/core/sock_diag.c:261
>  #2: 000000004cc04dbb (nlk_cb_mutex-SOCK_DIAG){+.+.}, at:
> netlink_dump+0x98/0xd20 net/netlink/af_netlink.c:2182
>  #3: 00000000accdef41 (unix_table_lock){+.+.}, at: spin_lock
> include/linux/spinlock.h:310 [inline]
>  #3: 00000000accdef41 (unix_table_lock){+.+.}, at:
> unix_diag_dump+0x10a/0x550 net/unix/diag.c:192
>  #4: 00000000b6895645 (rlock-AF_UNIX){+.+.}, at: spin_lock
> include/linux/spinlock.h:310 [inline]
>  #4: 00000000b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_dump_icons
> net/unix/diag.c:64 [inline]
>  #4: 00000000b6895645 (rlock-AF_UNIX){+.+.}, at:
> sk_diag_fill.isra.5+0x94e/0x10d0 net/unix/diag.c:144
> 
> stack backtrace:
> CPU: 1 PID: 25282 Comm: syz-executor1 Not tainted 4.17.0-rc3+ #59
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  print_circular_bug.isra.36.cold.54+0x1bd/0x27d
> kernel/locking/lockdep.c:1223
>  check_prev_add kernel/locking/lockdep.c:1863 [inline]
>  check_prevs_add kernel/locking/lockdep.c:1976 [inline]
>  validate_chain kernel/locking/lockdep.c:2417 [inline]
>  __lock_acquire+0x343e/0x5140 kernel/locking/lockdep.c:3431
>  lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
>  _raw_spin_lock_nested+0x28/0x40 kernel/locking/spinlock.c:354
>  sk_diag_dump_icons net/unix/diag.c:82 [inline]
>  sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
>  sk_diag_dump net/unix/diag.c:178 [inline]
>  unix_diag_dump+0x35f/0x550 net/unix/diag.c:206
>  netlink_dump+0x507/0xd20 net/netlink/af_netlink.c:2226
>  __netlink_dump_start+0x51a/0x780 net/netlink/af_netlink.c:2323
>  netlink_dump_start include/linux/netlink.h:214 [inline]
>  unix_diag_handler_dump+0x3f4/0x7b0 net/unix/diag.c:307
>  __sock_diag_cmd net/core/sock_diag.c:230 [inline]
>  sock_diag_rcv_msg+0x2e0/0x3d0 net/core/sock_diag.c:261
>  netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
>  sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:272
>  netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
>  netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
>  netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
>  sock_sendmsg_nosec net/socket.c:629 [inline]
>  sock_sendmsg+0xd5/0x120 net/socket.c:639
>  sock_write_iter+0x35a/0x5a0 net/socket.c:908
>  call_write_iter include/linux/fs.h:1784 [inline]
>  new_sync_write fs/read_write.c:474 [inline]
>  __vfs_write+0x64d/0x960 fs/read_write.c:487
>  vfs_write+0x1f8/0x560 fs/read_write.c:549
>  ksys_write+0xf9/0x250 fs/read_write.c:598
>  __do_sys_write fs/read_write.c:610 [inline]
>  __se_sys_write fs/read_write.c:607 [inline]
>  __ia32_sys_write+0x71/0xb0 fs/read_write.c:607
>  do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
>  do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
>  entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
> RIP: 0023:0xf7f8ccb9
> RSP: 002b:00000000f5f880ac EFLAGS: 00000282 ORIG_RAX: 0000000000000004
> RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 000000002058bfe4
> RDX: 0000000000000029 RSI: 0000000000000000 RDI: 0000000000000000
> RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000000000
> R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> 
> 
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
> 
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is
> merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
> To mark this as a duplicate of another syzbot report, please reply with:
> #syz dup: exact-subject-of-another-report
> If it's a one-off invalid bug report, please reply with:
> #syz invalid
> Note: if the crash happens again, it will cause creation of a new bug
> report.
> Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* Re: [PATCH v6 1/6] net: phy: at803x: Export at803x_debug_reg_mask()
From: Paul Burton @ 2018-05-11 18:25 UTC (permalink / raw)
  To: Andrew Lunn, Darren Hart; +Cc: netdev, linux-mips, David S . Miller
In-Reply-To: <20180511002619.GD5527@lunn.ch>

Hi Andrew,

On Fri, May 11, 2018 at 02:26:19AM +0200, Andrew Lunn wrote:
> On Thu, May 10, 2018 at 04:16:52PM -0700, Paul Burton wrote:
> > From: Andrew Lunn <andrew@lunn.ch>
> > 
> > On some boards, this PHY has a problem when it hibernates. Export this
> > function to a board can register a PHY fixup to disable hibernation.
> 
> What do you know about the problem?
> 
> https://patchwork.ozlabs.org/patch/686371/
> 
> I don't remember how it was solved, but you should probably do the
> same.
> 
> 	Andrew

I'm afraid I don't know much about the problem - this one is your patch
entirely unchanged, and I don't have access to the hardware in question
(my board uses a Realtek RTL8211E PHY).

I presume you did this because the pch_gbe driver as-is in mainline
disables hibernation for the AR803X PHY found on the MinnowBoard, so
this would be preserving the existing behaviour of the driver?

That behaviour was introduced by commit f1a26fdf5944f ("pch_gbe: Add
MinnowBoard support"), so perhaps Darren as its author might know more?

My presumption would be that this is done to ensure that the PHY is
always providing the RX clock, which the EG20T manual says is required
for the MAC reset register RX_RST & ALL_RST bits to clear. We wait for
those using the call to pch_gbe_wait_clr_bit() in
pch_gbe_mac_reset_hw(), which happens before we initialize the PHY.

I could reorder the probe function a little to initialize the PHY before
performing the MAC reset, drop this patch and the AR803X hibernation
stuff from patch 2 if you like. But again, I can't actually test the
result on the affected hardware.

Thanks,
    Paul

^ permalink raw reply

* [PATCH net 1/1] net sched actions: fix refcnt leak in skbmod
From: Roman Mashak @ 2018-05-11 18:35 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak

When application fails to pass flags in netlink TLV when replacing
existing skbmod action, the kernel will leak refcnt:

$ tc actions get action skbmod index 1
total acts 0

        action order 0: skbmod pipe set smac 00:11:22:33:44:55
         index 1 ref 1 bind 0

For example, at this point a buggy application replaces the action with
index 1 with new smac 00:aa:22:33:44:55, it fails because of zero flags,
however refcnt gets bumped:

$ tc actions get actions skbmod index 1
total acts 0

        action order 0: skbmod pipe set smac 00:11:22:33:44:55
         index 1 ref 2 bind 0
$

Tha patch fixes this by calling tcf_idr_release() on existing actions.

Fixes: 86da71b57383d ("net_sched: Introduce skbmod action")
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_skbmod.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index bbcbdce732cc..ad050d7d4b46 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -131,8 +131,11 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	if (exists && bind)
 		return 0;
 
-	if (!lflags)
+	if (!lflags) {
+		if (exists)
+			tcf_idr_release(*a, bind);
 		return -EINVAL;
+	}
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, parm->index, est, a,
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v6 1/6] net: phy: at803x: Export at803x_debug_reg_mask()
From: Paul Burton @ 2018-05-11 18:38 UTC (permalink / raw)
  To: Andrew Lunn, Darren Hart; +Cc: netdev, linux-mips, David S . Miller
In-Reply-To: <20180511182502.y74wm6dmtf3dbcln@pburton-laptop>

On Fri, May 11, 2018 at 11:25:02AM -0700, Paul Burton wrote:
> Hi Andrew,
> 
> On Fri, May 11, 2018 at 02:26:19AM +0200, Andrew Lunn wrote:
> > On Thu, May 10, 2018 at 04:16:52PM -0700, Paul Burton wrote:
> > > From: Andrew Lunn <andrew@lunn.ch>
> > > 
> > > On some boards, this PHY has a problem when it hibernates. Export this
> > > function to a board can register a PHY fixup to disable hibernation.
> > 
> > What do you know about the problem?
> > 
> > https://patchwork.ozlabs.org/patch/686371/
> > 
> > I don't remember how it was solved, but you should probably do the
> > same.
> > 
> > 	Andrew
> 
> I'm afraid I don't know much about the problem - this one is your patch
> entirely unchanged, and I don't have access to the hardware in question
> (my board uses a Realtek RTL8211E PHY).
> 
> I presume you did this because the pch_gbe driver as-is in mainline
> disables hibernation for the AR803X PHY found on the MinnowBoard, so
> this would be preserving the existing behaviour of the driver?
> 
> That behaviour was introduced by commit f1a26fdf5944f ("pch_gbe: Add
> MinnowBoard support"), so perhaps Darren as its author might know more?
> 
> My presumption would be that this is done to ensure that the PHY is
> always providing the RX clock, which the EG20T manual says is required
> for the MAC reset register RX_RST & ALL_RST bits to clear. We wait for
> those using the call to pch_gbe_wait_clr_bit() in
> pch_gbe_mac_reset_hw(), which happens before we initialize the PHY.
> 
> I could reorder the probe function a little to initialize the PHY before
> performing the MAC reset, drop this patch and the AR803X hibernation
> stuff from patch 2 if you like. But again, I can't actually test the
> result on the affected hardware.
> 
> Thanks,
>     Paul

I got an undeliverable response using Darren's email address from the
commit referenced above, so updating to the latest address I see for him
in git history.

Thanks,
    Paul

^ permalink raw reply

* Re: INFO: rcu detected stall in kfree_skbmem
From: Marcelo Ricardo Leitner @ 2018-05-11 18:41 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: syzbot, Vladislav Yasevich, Neil Horman, linux-sctp, Andrei Vagin,
	David Miller, Kirill Tkhai, LKML, netdev, syzkaller-bugs
In-Reply-To: <CACT4Y+Z_+=VLbELVW69B7WEVqvri1xBMM5RMnMQynec_XgaE=w@mail.gmail.com>

On Fri, May 11, 2018 at 12:00:38PM +0200, Dmitry Vyukov wrote:
> On Mon, Apr 30, 2018 at 8:09 PM, syzbot
> <syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com> wrote:
> > Hello,
> >
> > syzbot found the following crash on:
> >
> > HEAD commit:    5d1365940a68 Merge
> > git://git.kernel.org/pub/scm/linux/kerne...
> > git tree:       net-next
> > console output: https://syzkaller.appspot.com/x/log.txt?id=5667997129637888
> > kernel config:
> > https://syzkaller.appspot.com/x/.config?id=-5947642240294114534
> > dashboard link: https://syzkaller.appspot.com/bug?extid=fc78715ba3b3257caf6a
> > compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> >
> > Unfortunately, I don't have any reproducer for this crash yet.
>
> This looks sctp-related, +sctp maintainers.
>
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com
> >
> > INFO: rcu_sched self-detected stall on CPU
> >         1-...!: (1 GPs behind) idle=a3e/1/4611686018427387908
> > softirq=71980/71983 fqs=33
> >          (t=125000 jiffies g=39438 c=39437 q=958)
> > rcu_sched kthread starved for 124829 jiffies! g39438 c39437 f0x0
> > RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
> > RCU grace-period kthread stack dump:
> > rcu_sched       R  running task    23768     9      2 0x80000000
> > Call Trace:
> >  context_switch kernel/sched/core.c:2848 [inline]
> >  __schedule+0x801/0x1e30 kernel/sched/core.c:3490
> >  schedule+0xef/0x430 kernel/sched/core.c:3549
> >  schedule_timeout+0x138/0x240 kernel/time/timer.c:1801
> >  rcu_gp_kthread+0x6b5/0x1940 kernel/rcu/tree.c:2231
> >  kthread+0x345/0x410 kernel/kthread.c:238
> >  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
> > NMI backtrace for cpu 1
> > CPU: 1 PID: 20560 Comm: syz-executor4 Not tainted 4.16.0+ #1
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > Google 01/01/2011
> > Call Trace:
> >  <IRQ>
> >  __dump_stack lib/dump_stack.c:77 [inline]
> >  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
> >  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
> >  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
> >  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
> >  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
> >  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
> >  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
> >  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
> >  __rcu_pending kernel/rcu/tree.c:3356 [inline]
> >  rcu_pending kernel/rcu/tree.c:3401 [inline]
> >  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
> >  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
> >  tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:173
> >  tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1283
> >  __run_hrtimer kernel/time/hrtimer.c:1386 [inline]
> >  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1448
> >  hrtimer_interrupt+0x286/0x650 kernel/time/hrtimer.c:1506
> >  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
> >  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
> >  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
> > RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> > [inline]
> > RIP: 0010:kmem_cache_free+0xb3/0x2d0 mm/slab.c:3757
> > RSP: 0018:ffff8801db105228 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> > RAX: 0000000000000007 RBX: ffff8800b055c940 RCX: 1ffff1003b2345a5
> > RDX: 0000000000000000 RSI: ffff8801d91a2d80 RDI: 0000000000000282
> > RBP: ffff8801db105248 R08: ffff8801d91a2cb8 R09: 0000000000000002
> > R10: ffff8801d91a2480 R11: 0000000000000000 R12: ffff8801d9848e40
> > R13: 0000000000000282 R14: ffffffff85b7f27c R15: 0000000000000000
> >  kfree_skbmem+0x13c/0x210 net/core/skbuff.c:582
> >  __kfree_skb net/core/skbuff.c:642 [inline]
> >  kfree_skb+0x19d/0x560 net/core/skbuff.c:659
> >  enqueue_to_backlog+0x2fc/0xc90 net/core/dev.c:3968
> >  netif_rx_internal+0x14d/0xae0 net/core/dev.c:4181
> >  netif_rx+0xba/0x400 net/core/dev.c:4206
> >  loopback_xmit+0x283/0x741 drivers/net/loopback.c:91
> >  __netdev_start_xmit include/linux/netdevice.h:4087 [inline]
> >  netdev_start_xmit include/linux/netdevice.h:4096 [inline]
> >  xmit_one net/core/dev.c:3053 [inline]
> >  dev_hard_start_xmit+0x264/0xc10 net/core/dev.c:3069
> >  __dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
> >  dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
> >  neigh_hh_output include/net/neighbour.h:472 [inline]
> >  neigh_output include/net/neighbour.h:480 [inline]
> >  ip6_finish_output2+0x134e/0x2810 net/ipv6/ip6_output.c:120
> >  ip6_finish_output+0x5fe/0xbc0 net/ipv6/ip6_output.c:154
> >  NF_HOOK_COND include/linux/netfilter.h:277 [inline]
> >  ip6_output+0x227/0x9b0 net/ipv6/ip6_output.c:171
> >  dst_output include/net/dst.h:444 [inline]
> >  NF_HOOK include/linux/netfilter.h:288 [inline]
> >  ip6_xmit+0xf51/0x23f0 net/ipv6/ip6_output.c:277

sctp_v6_xmit calls ip6_xmit with rcu_read_lock() as it has to pass
np->opt to ip6_xmit. Sounds like this packet then went through a long
journey and hit the bell.

But calling ip6_xmit with rcu_read_lock is expected. tcp stack also
does it.
Thus I think this is more of an issue with IPv6 stack. If a host has
an extensive ip6tables ruleset, it probably generates this more
easily.

> >  sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
> >  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
> >  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
> >  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
> >  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
> >  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
> >  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
> >  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
> >  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
> >  expire_timers kernel/time/timer.c:1363 [inline]

Having this call from a timer means it wasn't processing sctp stack
for too long.

> >  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
> >  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
> >  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
> >  invoke_softirq kernel/softirq.c:365 [inline]
> >  irq_exit+0x1d1/0x200 kernel/softirq.c:405
> >  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
> >  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
> >  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
> >  </IRQ>
> > RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> > [inline]
> > RIP: 0010:lock_release+0x4d4/0xa10 kernel/locking/lockdep.c:3942
> > RSP: 0018:ffff8801971ce7b0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> > RAX: dffffc0000000000 RBX: 1ffff10032e39cfb RCX: 1ffff1003b234595
> > RDX: 1ffffffff11630ed RSI: 0000000000000002 RDI: 0000000000000282
> > RBP: ffff8801971ce8e0 R08: 1ffff10032e39cff R09: ffffed003b6246c2
> > R10: 0000000000000003 R11: 0000000000000001 R12: ffff8801d91a2480
> > R13: ffffffff88b8df60 R14: ffff8801d91a2480 R15: ffff8801971ce7f8
> >  rcu_lock_release include/linux/rcupdate.h:251 [inline]
> >  rcu_read_unlock include/linux/rcupdate.h:688 [inline]
> >  __unlock_page_memcg+0x72/0x100 mm/memcontrol.c:1654
> >  unlock_page_memcg+0x2c/0x40 mm/memcontrol.c:1663
> >  page_remove_file_rmap mm/rmap.c:1248 [inline]
> >  page_remove_rmap+0x6f2/0x1250 mm/rmap.c:1299
> >  zap_pte_range mm/memory.c:1337 [inline]
> >  zap_pmd_range mm/memory.c:1441 [inline]
> >  zap_pud_range mm/memory.c:1470 [inline]
> >  zap_p4d_range mm/memory.c:1491 [inline]
> >  unmap_page_range+0xeb4/0x2200 mm/memory.c:1512
> >  unmap_single_vma+0x1a0/0x310 mm/memory.c:1557
> >  unmap_vmas+0x120/0x1f0 mm/memory.c:1587
> >  exit_mmap+0x265/0x570 mm/mmap.c:3038
> >  __mmput kernel/fork.c:962 [inline]
> >  mmput+0x251/0x610 kernel/fork.c:983
> >  exit_mm kernel/exit.c:544 [inline]
> >  do_exit+0xe98/0x2730 kernel/exit.c:852
> >  do_group_exit+0x16f/0x430 kernel/exit.c:968
> >  get_signal+0x886/0x1960 kernel/signal.c:2469
> >  do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
> >  exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
> >  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
> >  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
> >  do_syscall_64+0x792/0x9d0 arch/x86/entry/common.c:292
> >  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> > RIP: 0033:0x455319
> > RSP: 002b:00007fa346e81ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
> > RAX: fffffffffffffe00 RBX: 000000000072bf80 RCX: 0000000000455319
> > RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bf80
> > RBP: 000000000072bf80 R08: 0000000000000000 R09: 000000000072bf58
> > R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> > R13: 0000000000a3e81f R14: 00007fa346e829c0 R15: 0000000000000001
> >
> >
> > ---
> > This bug is generated by a bot. It may contain errors.
> > See https://goo.gl/tpsmEJ for more information about syzbot.
> > syzbot engineers can be reached at syzkaller@googlegroups.com.
> >
> > syzbot will keep track of this bug report.
> > If you forgot to add the Reported-by tag, once the fix for this bug is
> > merged
> > into any tree, please reply to this email with:
> > #syz fix: exact-commit-title
> > To mark this as a duplicate of another syzbot report, please reply with:
> > #syz dup: exact-subject-of-another-report
> > If it's a one-off invalid bug report, please reply with:
> > #syz invalid
> > Note: if the crash happens again, it will cause creation of a new bug
> > report.
> > Note: all commands must start from beginning of the line in the email body.
> >
> > --
> > You received this message because you are subscribed to the Google Groups
> > "syzkaller-bugs" group.
> > To unsubscribe from this group and stop receiving emails from it, send an
> > email to syzkaller-bugs+unsubscribe@googlegroups.com.
> > To view this discussion on the web visit
> > https://groups.google.com/d/msgid/syzkaller-bugs/000000000000a9b0e3056b14bfb2%40google.com.
> > For more options, visit https://groups.google.com/d/optout.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH v3] net: phy: DP83TC811: Introduce support for the DP83TC811 phy
From: Dan Murphy @ 2018-05-11 18:51 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: f.fainelli, netdev, linux-kernel
In-Reply-To: <20180511183006.GB12738@lunn.ch>

Andrew

On 05/11/2018 01:30 PM, Andrew Lunn wrote:
> On Fri, May 11, 2018 at 01:08:19PM -0500, Dan Murphy wrote:
>> Add support for the DP83811 phy.
>>
>> The DP83811 supports both rgmii and sgmii interfaces.
>> There are 2 part numbers for this the DP83TC811R does not
>> reliably support the SGMII interface but the DP83TC811S will.
>>
>> There is not a way to differentiate these parts from the
>> hardware or register set.  So this is controlled via the DT
>> to indicate which phy mode is required.  Or the part can be
>> strapped to a certain interface.
>>
>> Data sheet can be found here:
>> http://www.ti.com/product/DP83TC811S-Q1/description
>> http://www.ti.com/product/DP83TC811R-Q1/description
>>
>> Signed-off-by: Dan Murphy <dmurphy@ti.com>
> 
> Hi Dan
> 
> It is normal to add any Reviewed-by, or Tested-by: tags you received,
> so long as you don't make major changes.
> 

Thanks for the reminder.

I usually add them if I get them explicitly stated in the review.

I have not seen any Reviewed-by or Tested-by tags in any of the replies for the
patch.  But I may have missed it.

Dan

>     Andrew
> 


-- 
------------------
Dan Murphy

^ permalink raw reply

* Re: INFO: rcu detected stall in kfree_skbmem
From: Eric Dumazet @ 2018-05-11 19:08 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, Dmitry Vyukov
  Cc: syzbot, Vladislav Yasevich, Neil Horman, linux-sctp, Andrei Vagin,
	David Miller, Kirill Tkhai, LKML, netdev, syzkaller-bugs
In-Reply-To: <20180511184141.GW5105@localhost.localdomain>



On 05/11/2018 11:41 AM, Marcelo Ricardo Leitner wrote:

> But calling ip6_xmit with rcu_read_lock is expected. tcp stack also
> does it.
> Thus I think this is more of an issue with IPv6 stack. If a host has
> an extensive ip6tables ruleset, it probably generates this more
> easily.
> 
>>>  sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
>>>  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
>>>  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
>>>  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
>>>  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
>>>  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>>>  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
>>>  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
>>>  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
>>>  expire_timers kernel/time/timer.c:1363 [inline]
> 
> Having this call from a timer means it wasn't processing sctp stack
> for too long.
>

I feel the problem is that this part is looping, in some infinite loop.

I have seen this stack traces in other reports.

Maybe some kind of list corruption.

^ permalink raw reply

* Re: [PATCH net 1/1] net sched actions: fix refcnt leak in skbmod
From: Cong Wang @ 2018-05-11 19:09 UTC (permalink / raw)
  To: Roman Mashak
  Cc: David Miller, Linux Kernel Network Developers, kernel,
	Jamal Hadi Salim, Jiri Pirko
In-Reply-To: <1526063733-7813-1-git-send-email-mrv@mojatatu.com>

On Fri, May 11, 2018 at 11:35 AM, Roman Mashak <mrv@mojatatu.com> wrote:
> When application fails to pass flags in netlink TLV when replacing
> existing skbmod action, the kernel will leak refcnt:
>
> $ tc actions get action skbmod index 1
> total acts 0
>
>         action order 0: skbmod pipe set smac 00:11:22:33:44:55
>          index 1 ref 1 bind 0
>
> For example, at this point a buggy application replaces the action with
> index 1 with new smac 00:aa:22:33:44:55, it fails because of zero flags,
> however refcnt gets bumped:
>
> $ tc actions get actions skbmod index 1
> total acts 0
>
>         action order 0: skbmod pipe set smac 00:11:22:33:44:55
>          index 1 ref 2 bind 0
> $
>
> Tha patch fixes this by calling tcf_idr_release() on existing actions.
>
> Fixes: 86da71b57383d ("net_sched: Introduce skbmod action")
> Signed-off-by: Roman Mashak <mrv@mojatatu.com>

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

^ permalink raw reply

* Re: [PATCH v3] net: phy: DP83TC811: Introduce support for the DP83TC811 phy
From: Andrew Lunn @ 2018-05-11 19:10 UTC (permalink / raw)
  To: Dan Murphy; +Cc: f.fainelli, netdev, linux-kernel
In-Reply-To: <56ca63ce-d609-5bb7-f38c-17d875496bd5@ti.com>

On Fri, May 11, 2018 at 01:51:28PM -0500, Dan Murphy wrote:
> Andrew
> 
> On 05/11/2018 01:30 PM, Andrew Lunn wrote:
> > On Fri, May 11, 2018 at 01:08:19PM -0500, Dan Murphy wrote:
> >> Add support for the DP83811 phy.
> >>
> >> The DP83811 supports both rgmii and sgmii interfaces.
> >> There are 2 part numbers for this the DP83TC811R does not
> >> reliably support the SGMII interface but the DP83TC811S will.
> >>
> >> There is not a way to differentiate these parts from the
> >> hardware or register set.  So this is controlled via the DT
> >> to indicate which phy mode is required.  Or the part can be
> >> strapped to a certain interface.
> >>
> >> Data sheet can be found here:
> >> http://www.ti.com/product/DP83TC811S-Q1/description
> >> http://www.ti.com/product/DP83TC811R-Q1/description
> >>
> >> Signed-off-by: Dan Murphy <dmurphy@ti.com>
> > 
> > Hi Dan
> > 
> > It is normal to add any Reviewed-by, or Tested-by: tags you received,
> > so long as you don't make major changes.
> > 
> 
> Thanks for the reminder.
> 
> I usually add them if I get them explicitly stated in the review.

Humm, i thought i had given one. But i cannot find it in the mail
archive. Going senile :-(

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v3] net: phy: DP83TC811: Introduce support for the DP83TC811 phy
From: Florian Fainelli @ 2018-05-11 19:12 UTC (permalink / raw)
  To: Dan Murphy, andrew; +Cc: netdev, linux-kernel
In-Reply-To: <20180511180819.5036-1-dmurphy@ti.com>

On 05/11/2018 11:08 AM, Dan Murphy wrote:
> Add support for the DP83811 phy.
> 
> The DP83811 supports both rgmii and sgmii interfaces.
> There are 2 part numbers for this the DP83TC811R does not
> reliably support the SGMII interface but the DP83TC811S will.
> 
> There is not a way to differentiate these parts from the
> hardware or register set.  So this is controlled via the DT
> to indicate which phy mode is required.  Or the part can be
> strapped to a certain interface.
> 
> Data sheet can be found here:
> http://www.ti.com/product/DP83TC811S-Q1/description
> http://www.ti.com/product/DP83TC811R-Q1/description
> 
> Signed-off-by: Dan Murphy <dmurphy@ti.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH v3] net: phy: DP83TC811: Introduce support for the DP83TC811 phy
From: David Miller @ 2018-05-11 19:15 UTC (permalink / raw)
  To: andrew; +Cc: dmurphy, f.fainelli, netdev, linux-kernel
In-Reply-To: <20180511191011.GC12738@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Fri, 11 May 2018 21:10:11 +0200

> Humm, i thought i had given one. But i cannot find it in the mail
> archive. Going senile :-(

You aren't going senile, there is just are a huge number of patches
being submitted since net-next openned up.

^ permalink raw reply

* Re: [PATCH] mlx4_core: allocate 4KB ICM chunks
From: Qing Huang @ 2018-05-11 19:16 UTC (permalink / raw)
  To: Håkon Bugge; +Cc: tariqt, davem, netdev, OFED mailing list, linux-kernel
In-Reply-To: <5ABF1B88-882E-4575-8E8C-41F0452FECC1@oracle.com>


On 5/11/2018 3:27 AM, Håkon Bugge wrote:
>> On 11 May 2018, at 01:31, Qing Huang<qing.huang@oracle.com>  wrote:
>>
>> When a system is under memory presure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size, the above issue is fixed.
>>
>> However in order to support 4KB ICM chunk size, we need to fix another
>> issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use vzalloc to replace kcalloc. There is no need
>> for contiguous memory pages for a driver meta data structure (no need
>> of DMA ops).
>>
>> Signed-off-by: Qing Huang<qing.huang@oracle.com>
>> Acked-by: Daniel Jurgens<danielj@mellanox.com>
>> ---
>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> index a822f7a..2b17a4b 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> @@ -43,12 +43,12 @@
>> #include "fw.h"
>>
>> /*
>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>> - * per chunk.
>> + * We allocate in 4KB page size chunks to avoid high order memory
>> + * allocations in fragmented/high usage memory situation.
>>   */
>> enum {
>> -	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
>> -	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
>> +	MLX4_ICM_ALLOC_SIZE	= 1 << 12,
>> +	MLX4_TABLE_CHUNK_SIZE	= 1 << 12
> Shouldn’t these be the arch’s page size order? E.g., if running on SPARC, the hw page size is 8KiB.

Good point on supporting wider range of architectures. I got tunnel 
vision when fixing this on our x64 lab machines.
Will send an v2 patch.

Thanks,
Qing

> Thxs, Håkon
>
>> };
>>
>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
>> 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>> 	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>
>> -	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
>> +	table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>> 	if (!table->icm)
>> 		return -ENOMEM;
>> 	table->virt     = virt;
>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
>> 			mlx4_free_icm(dev, table->icm[i], use_coherent);
>> 		}
>>
>> -	kfree(table->icm);
>> +	vfree(table->icm);
>>
>> 	return -ENOMEM;
>> }
>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
>> 			mlx4_free_icm(dev, table->icm[i], table->coherent);
>> 		}
>>
>> -	kfree(table->icm);
>> +	vfree(table->icm);
>> }
>> -- 
>> 2.9.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message tomajordomo@vger.kernel.org
>> More majordomo info athttp://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message tomajordomo@vger.kernel.org
> More majordomo info athttp://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH V2] mlx4_core: allocate ICM memory in page size chunks
From: Qing Huang @ 2018-05-11 19:23 UTC (permalink / raw)
  To: tariqt, davem, haakon.bugge, yanjun.zhu
  Cc: netdev, linux-rdma, linux-kernel, Qing Huang

When a system is under memory presure (high usage with fragments),
the original 256KB ICM chunk allocations will likely trigger kernel
memory management to enter slow path doing memory compact/migration
ops in order to complete high order memory allocations.

When that happens, user processes calling uverb APIs may get stuck
for more than 120s easily even though there are a lot of free pages
in smaller chunks available in the system.

Syslog:
...
Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
oracle_205573_e:205573 blocked for more than 120 seconds.
...

With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.

However in order to support smaller ICM chunk size, we need to fix
another issue in large size kcalloc allocations.

E.g.
Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
entry). So we need a 16MB allocation for a table->icm pointer array to
hold 2M pointers which can easily cause kcalloc to fail.

The solution is to use vzalloc to replace kcalloc. There is no need
for contiguous memory pages for a driver meta data structure (no need
of DMA ops).

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Acked-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
v2 -> v1: adjusted chunk size to reflect different architectures.

 drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index a822f7a..ccb62b8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,12 @@
 #include "fw.h"
 
 /*
- * We allocate in as big chunks as we can, up to a maximum of 256 KB
- * per chunk.
+ * We allocate in page size (default 4KB on many archs) chunks to avoid high
+ * order memory allocations in fragmented/high usage memory situation.
  */
 enum {
-	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
-	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
+	MLX4_ICM_ALLOC_SIZE	= 1 << PAGE_SHIFT,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << PAGE_SHIFT
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
 	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
 
-	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+	table->icm      = vzalloc(num_icm * sizeof(*table->icm));
 	if (!table->icm)
 		return -ENOMEM;
 	table->virt     = virt;
@@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 			mlx4_free_icm(dev, table->icm[i], use_coherent);
 		}
 
-	kfree(table->icm);
+	vfree(table->icm);
 
 	return -ENOMEM;
 }
@@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
 			mlx4_free_icm(dev, table->icm[i], table->coherent);
 		}
 
-	kfree(table->icm);
+	vfree(table->icm);
 }
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH v6 1/6] net: phy: at803x: Export at803x_debug_reg_mask()
From: Andrew Lunn @ 2018-05-11 19:24 UTC (permalink / raw)
  To: Paul Burton; +Cc: Darren Hart, netdev, linux-mips, David S . Miller
In-Reply-To: <20180511182502.y74wm6dmtf3dbcln@pburton-laptop>

> I could reorder the probe function a little to initialize the PHY before
> performing the MAC reset, drop this patch and the AR803X hibernation
> stuff from patch 2 if you like. But again, I can't actually test the
> result on the affected hardware.

Hi Paul

I don't like a MAC driver poking around in PHY registers.

So if you can rearrange the code, that would be great.

   Thanks
	Andrew

^ permalink raw reply

* [PATCH net-next 2/4] bonding: use common mac addr checks
From: Debabrata Banerjee @ 2018-05-11 19:25 UTC (permalink / raw)
  To: David S . Miller, netdev
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje
In-Reply-To: <20180511192548.8119-1-dbanerje@akamai.com>

Replace homegrown mac addr checks with faster defs from etherdevice.h

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
---
 drivers/net/bonding/bond_alb.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c2f6c58e4e6a..180e50f7806f 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -40,11 +40,6 @@
 #include <net/bonding.h>
 #include <net/bond_alb.h>
 
-
-
-static const u8 mac_bcast[ETH_ALEN + 2] __long_aligned = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
 static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = {
 	0x33, 0x33, 0x00, 0x00, 0x00, 0x01
 };
@@ -420,9 +415,7 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 
 			if (assigned_slave) {
 				rx_hash_table[index].slave = assigned_slave;
-				if (!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-							     mac_bcast) &&
-				    !is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
+				if (is_valid_ether_addr(rx_hash_table[index].mac_dst)) {
 					bond_info->rx_hashtbl[index].ntt = 1;
 					bond_info->rx_ntt = 1;
 					/* A slave has been removed from the
@@ -525,8 +518,7 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		    is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			ntt = 1;
 		}
@@ -567,8 +559,7 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 		if ((client_info->ip_src == src_ip) &&
 		    !ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 					     bond->dev->dev_addr) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		    is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond_info->rx_ntt = 1;
 		}
@@ -596,7 +587,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		if ((client_info->ip_src == arp->ip_src) &&
 		    (client_info->ip_dst == arp->ip_dst)) {
 			/* the entry is already assigned to this client */
-			if (!ether_addr_equal_64bits(arp->mac_dst, mac_bcast)) {
+			if (!is_broadcast_ether_addr(arp->mac_dst)) {
 				/* update mac address from arp */
 				ether_addr_copy(client_info->mac_dst, arp->mac_dst);
 			}
@@ -644,8 +635,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		ether_addr_copy(client_info->mac_src, arp->mac_src);
 		client_info->slave = assigned_slave;
 
-		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		if (is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond->alb_info.rx_ntt = 1;
 		} else {
@@ -1418,9 +1408,9 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	case ETH_P_IP: {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
-		    (iph->daddr == ip_bcast) ||
-		    (iph->protocol == IPPROTO_IGMP)) {
+		if (is_broadcast_ether_addr(eth_data->h_dest) ||
+		    iph->daddr == ip_bcast ||
+		    iph->protocol == IPPROTO_IGMP) {
 			do_tx_balance = false;
 			break;
 		}
@@ -1432,7 +1422,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		/* IPv6 doesn't really use broadcast mac address, but leave
 		 * that here just in case.
 		 */
-		if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast)) {
+		if (is_broadcast_ether_addr(eth_data->h_dest)) {
 			do_tx_balance = false;
 			break;
 		}
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 3/4] bonding: allow use of tx hashing in balance-alb
From: Debabrata Banerjee @ 2018-05-11 19:25 UTC (permalink / raw)
  To: David S . Miller, netdev
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje
In-Reply-To: <20180511192548.8119-1-dbanerje@akamai.com>

The rx load balancing provided by balance-alb is not mutually
exclusive with using hashing for tx selection, and should provide a decent
speed increase because this eliminates spinlocks and cache contention.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
---
 drivers/net/bonding/bond_alb.c     | 20 ++++++++++++++++++--
 drivers/net/bonding/bond_main.c    | 25 +++++++++++++++----------
 drivers/net/bonding/bond_options.c |  2 +-
 include/net/bonding.h              | 10 +++++++++-
 4 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 180e50f7806f..6228635880d5 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1478,8 +1478,24 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	}
 
 	if (do_tx_balance) {
-		hash_index = _simple_hash(hash_start, hash_size);
-		tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+		if (bond->params.tlb_dynamic_lb) {
+			hash_index = _simple_hash(hash_start, hash_size);
+			tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+		} else {
+			/*
+			 * do_tx_balance means we are free to select the tx_slave
+			 * So we do exactly what tlb would do for hash selection
+			 */
+
+			struct bond_up_slave *slaves;
+			unsigned int count;
+
+			slaves = rcu_dereference(bond->slave_arr);
+			count = slaves ? READ_ONCE(slaves->count) : 0;
+			if (likely(count))
+				tx_slave = slaves->arr[bond_xmit_hash(bond, skb) %
+						       count];
+		}
 	}
 
 	return bond_do_alb_xmit(skb, bond, tx_slave);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1f1e97b26f95..f7f8a49cb32b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -159,7 +159,7 @@ module_param(min_links, int, 0);
 MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");
 
 module_param(xmit_hash_policy, charp, 0);
-MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
+MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
 				   "4 for encap layer 3+4");
@@ -1735,7 +1735,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		unblock_netpoll_tx();
 	}
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, NULL);
 
 	bond->nest_level = dev_get_nest_level(bond_dev);
@@ -1870,7 +1870,7 @@ static int __bond_release_one(struct net_device *bond_dev,
 	if (BOND_MODE(bond) == BOND_MODE_8023AD)
 		bond_3ad_unbind_slave(slave);
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, slave);
 
 	netdev_info(bond_dev, "Releasing %s interface %s\n",
@@ -3102,7 +3102,7 @@ static int bond_slave_netdev_event(unsigned long event,
 		 * events. If these (miimon/arpmon) parameters are configured
 		 * then array gets refreshed twice and that should be fine!
 		 */
-		if (bond_mode_uses_xmit_hash(bond))
+		if (bond_mode_can_use_xmit_hash(bond))
 			bond_update_slave_arr(bond, NULL);
 		break;
 	case NETDEV_CHANGEMTU:
@@ -3322,7 +3322,7 @@ static int bond_open(struct net_device *bond_dev)
 		 */
 		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
 			return -ENOMEM;
-		if (bond->params.tlb_dynamic_lb)
+		if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
 			queue_delayed_work(bond->wq, &bond->alb_work, 0);
 	}
 
@@ -3341,7 +3341,7 @@ static int bond_open(struct net_device *bond_dev)
 		bond_3ad_initiate_agg_selection(bond, 1);
 	}
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, NULL);
 
 	return 0;
@@ -3892,7 +3892,7 @@ static void bond_slave_arr_handler(struct work_struct *work)
  * to determine the slave interface -
  * (a) BOND_MODE_8023AD
  * (b) BOND_MODE_XOR
- * (c) BOND_MODE_TLB && tlb_dynamic_lb == 0
+ * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
  *
  * The caller is expected to hold RTNL only and NO other lock!
  */
@@ -3945,6 +3945,11 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
 			continue;
 		if (skipslave == slave)
 			continue;
+
+		netdev_dbg(bond->dev,
+			   "Adding slave dev %s to tx hash array[%d]\n",
+			   slave->dev->name, new_arr->count);
+
 		new_arr->arr[new_arr->count++] = slave;
 	}
 
@@ -4320,9 +4325,9 @@ static int bond_check_params(struct bond_params *params)
 	}
 
 	if (xmit_hash_policy) {
-		if ((bond_mode != BOND_MODE_XOR) &&
-		    (bond_mode != BOND_MODE_8023AD) &&
-		    (bond_mode != BOND_MODE_TLB)) {
+		if (bond_mode == BOND_MODE_ROUNDROBIN ||
+		    bond_mode == BOND_MODE_ACTIVEBACKUP ||
+		    bond_mode == BOND_MODE_BROADCAST) {
 			pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
 				bond_mode_name(bond_mode));
 		} else {
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 58c705f24f96..8a945c9341d6 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -395,7 +395,7 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.id = BOND_OPT_TLB_DYNAMIC_LB,
 		.name = "tlb_dynamic_lb",
 		.desc = "Enable dynamic flow shuffling",
-		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB)),
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)),
 		.values = bond_tlb_dynamic_lb_tbl,
 		.flags = BOND_OPTFLAG_IFDOWN,
 		.set = bond_option_tlb_dynamic_lb_set,
diff --git a/include/net/bonding.h b/include/net/bonding.h
index b52235158836..9a41a50b0bd2 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -285,10 +285,18 @@ static inline bool bond_needs_speed_duplex(const struct bonding *bond)
 
 static inline bool bond_is_nondyn_tlb(const struct bonding *bond)
 {
-	return (BOND_MODE(bond) == BOND_MODE_TLB)  &&
+	return (BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB) &&
 	       (bond->params.tlb_dynamic_lb == 0);
 }
 
+static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond)
+{
+	return (BOND_MODE(bond) == BOND_MODE_8023AD ||
+		BOND_MODE(bond) == BOND_MODE_XOR ||
+		BOND_MODE(bond) == BOND_MODE_TLB ||
+		BOND_MODE(bond) == BOND_MODE_ALB);
+}
+
 static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond)
 {
 	return (BOND_MODE(bond) == BOND_MODE_8023AD ||
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 1/4] bonding: don't queue up extraneous rlb updates
From: Debabrata Banerjee @ 2018-05-11 19:25 UTC (permalink / raw)
  To: David S . Miller, netdev
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje
In-Reply-To: <20180511192548.8119-1-dbanerje@akamai.com>

arps for incomplete entries can't be sent anyway.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
---
 drivers/net/bonding/bond_alb.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 5eb0df2e5464..c2f6c58e4e6a 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -421,7 +421,8 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 			if (assigned_slave) {
 				rx_hash_table[index].slave = assigned_slave;
 				if (!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-							     mac_bcast)) {
+							     mac_bcast) &&
+				    !is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
 					bond_info->rx_hashtbl[index].ntt = 1;
 					bond_info->rx_ntt = 1;
 					/* A slave has been removed from the
@@ -524,7 +525,8 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			ntt = 1;
 		}
@@ -565,7 +567,8 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 		if ((client_info->ip_src == src_ip) &&
 		    !ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 					     bond->dev->dev_addr) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond_info->rx_ntt = 1;
 		}
@@ -641,7 +644,8 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		ether_addr_copy(client_info->mac_src, arp->mac_src);
 		client_info->slave = assigned_slave;
 
-		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond->alb_info.rx_ntt = 1;
 		} else {
@@ -733,8 +737,10 @@ static void rlb_rebalance(struct bonding *bond)
 		assigned_slave = __rlb_next_rx_slave(bond);
 		if (assigned_slave && (client_info->slave != assigned_slave)) {
 			client_info->slave = assigned_slave;
-			client_info->ntt = 1;
-			ntt = 1;
+			if (!is_zero_ether_addr(client_info->mac_dst)) {
+				client_info->ntt = 1;
+				ntt = 1;
+			}
 		}
 	}
 
-- 
2.17.0

^ permalink raw reply related

* [net 1/4] ice: Set rq_last_status when cleaning rq
From: Jeff Kirsher @ 2018-05-11 19:47 UTC (permalink / raw)
  To: davem
  Cc: Jeff Shaw, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180511194722.28325-1-jeffrey.t.kirsher@intel.com>

From: Jeff Shaw <jeffrey.b.shaw@intel.com>

Prior to this commit, the rq_last_status was only set when hardware
responded with an error. This leads to rq_last_status being invalid
in the future when hardware eventually responds without error. This
commit resolves the issue by unconditionally setting rq_last_status
with the value returned in the descriptor.

Fixes: 940b61af02f4 ("ice: Initialize PF and setup miscellaneous
interrupt")

Signed-off-by: Jeff Shaw <jeffrey.b.shaw@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_controlq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c
index 5909a4407e38..7c511f144ed6 100644
--- a/drivers/net/ethernet/intel/ice/ice_controlq.c
+++ b/drivers/net/ethernet/intel/ice/ice_controlq.c
@@ -1014,10 +1014,10 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 	desc = ICE_CTL_Q_DESC(cq->rq, ntc);
 	desc_idx = ntc;
 
+	cq->rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval);
 	flags = le16_to_cpu(desc->flags);
 	if (flags & ICE_AQ_FLAG_ERR) {
 		ret_code = ICE_ERR_AQ_ERROR;
-		cq->rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval);
 		ice_debug(hw, ICE_DBG_AQ_MSG,
 			  "Control Receive Queue Event received with error 0x%x\n",
 			  cq->rq_last_status);
-- 
2.17.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox