Netdev List
 help / color / mirror / Atom feed
* [bpf-next V1 PATCH 3/8] ixgbe: implement flush flag for ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87f088f4af52..4fd77c9067f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10022,6 +10022,15 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static void ixgbe_xdp_ring_update_tail(struct ixgbe_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 			  struct xdp_frame **frames, u32 flags)
 {
@@ -10033,7 +10042,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	/* During program transitions its possible adapter->xdp_prog is assigned
@@ -10054,6 +10063,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		ixgbe_xdp_ring_update_tail(ring);
+
 	return n - drops;
 }
 
@@ -10072,11 +10084,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 	if (unlikely(!ring))
 		return;
 
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.
-	 */
-	wmb();
-	writel(ring->next_to_use, ring->tail);
+	ixgbe_xdp_ring_update_tail(ring);
 
 	return;
 }

^ permalink raw reply related

* [bpf-next V1 PATCH 4/8] tun: implement flush flag for ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/tun.c |   19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b182b8cdd219..d82a05fb0594 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,6 +1285,14 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
+static void __tun_xdp_flush_tfile(struct tun_file *tfile)
+{
+	/* Notify and wake up reader process */
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+}
+
 static int tun_xdp_xmit(struct net_device *dev, int n,
 			struct xdp_frame **frames, u32 flags)
 {
@@ -1295,7 +1303,7 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	int cnt = n;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	rcu_read_lock();
@@ -1325,6 +1333,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	}
 	spin_unlock(&tfile->tx_ring.producer_lock);
 
+	if (flags & XDP_XMIT_FLUSH)
+		__tun_xdp_flush_tfile(tfile);
+
 	rcu_read_unlock();
 	return cnt - drops;
 }
@@ -1353,11 +1364,7 @@ static void tun_xdp_flush(struct net_device *dev)
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Notify and wake up reader process */
-	if (tfile->flags & TUN_FASYNC)
-		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
-	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
-
+	__tun_xdp_flush_tfile(tfile);
 out:
 	rcu_read_unlock();
 }

^ permalink raw reply related

* [bpf-next V1 PATCH 5/8] virtio_net: implement flush flag for ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/virtio_net.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ed823625953..62ba8aadd8e6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -481,7 +481,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
@@ -507,6 +507,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 			drops++;
 		}
 	}
+
+	if (flags & XDP_XMIT_FLUSH)
+		virtqueue_kick(sq->vq);
+
 	return n - drops;
 }
 

^ permalink raw reply related

* [bpf-next V1 PATCH 6/8] xdp: done implementing ndo_xdp_xmit flush flag for all drivers
From: Jesper Dangaard Brouer @ 2018-05-30 18:01 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Removing XDP_XMIT_FLAGS_NONE as all driver now implement
a flush operation in their ndo_xdp_xmit call.  The compiler
will catch if any users of XDP_XMIT_FLAGS_NONE remains.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/xdp.h |    1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 308a4b30b484..0bc304b80cdf 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,7 +41,6 @@ enum xdp_mem_type {
 };
 
 /* XDP flags for ndo_xdp_xmit */
-#define XDP_XMIT_FLAGS_NONE	0U
 #define XDP_XMIT_FLUSH		(1U << 0)
 #define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
 

^ permalink raw reply related

* [bpf-next V1 PATCH 7/8] bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
From: Jesper Dangaard Brouer @ 2018-05-30 18:01 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

This is the first real user of the XDP_XMIT_FLUSH flag.

As pointed out many times, XDP_REDIRECT without using BPF maps is
significant slower than the map variant.  This is primary due to the
lack of bulking, as the ndo_xdp_flush operation is required after each
frame (to avoid frames hanging on the egress device).

It is still possible to optimize this case.  Instead of invoking two
NDO indirect calls, which are very expensive with CONFIG_RETPOLINE,
instead instruct ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/core/filter.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 6a21dbcad350..6981b4608979 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,10 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
 	if (sent <= 0)
 		return sent;
-	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
 }
 

^ permalink raw reply related

* [bpf-next V1 PATCH 8/8] bpf/xdp: devmap can avoid calling ndo_xdp_flush
From: Jesper Dangaard Brouer @ 2018-05-30 18:01 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead
instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in
appropriate places.

Notice after this patch it is possible to remove ndo_xdp_flush
completely, as this is the last user of ndo_xdp_flush. This is left
for later patches, to keep driver changes separate.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |   20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 04fbd75a5274..9c846a7a8cff 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-			 struct xdp_bulk_queue *bq)
+		       struct xdp_bulk_queue *bq, bool flush)
 {
 	struct net_device *dev = obj->dev;
 	int sent = 0, drops = 0, err = 0;
@@ -232,7 +232,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q,
+					     flush ? XDP_XMIT_FLUSH : 0);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
@@ -276,7 +277,6 @@ void __dev_map_flush(struct bpf_map *map)
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
 		struct xdp_bulk_queue *bq;
-		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
 		 * between xdp redirect and flush op.
@@ -287,10 +287,7 @@ void __dev_map_flush(struct bpf_map *map)
 		__clear_bit(bit, bitmap);
 
 		bq = this_cpu_ptr(dev->bulkq);
-		bq_xmit_all(dev, bq);
-		netdev = dev->dev;
-		if (likely(netdev->netdev_ops->ndo_xdp_flush))
-			netdev->netdev_ops->ndo_xdp_flush(netdev);
+		bq_xmit_all(dev, bq, true);
 	}
 }
 
@@ -320,7 +317,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(obj, bq);
+		bq_xmit_all(obj, bq, false);
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -359,8 +356,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-	if (dev->dev->netdev_ops->ndo_xdp_flush) {
-		struct net_device *fl = dev->dev;
+	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
 		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
 
@@ -371,9 +367,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 			__clear_bit(dev->bit, bitmap);
 
 			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(dev, bq);
-
-			fl->netdev_ops->ndo_xdp_flush(dev->dev);
+			bq_xmit_all(dev, bq, true);
 		}
 	}
 }

^ permalink raw reply related

* Re: Missing skb->dst with flow offloading
From: Pablo Neira Ayuso @ 2018-05-30 18:05 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Felix Fietkau, netfilter-devel, Netdev, openwrt-devel,
	Jaap Buurman, WireGuard mailing list
In-Reply-To: <CAHmME9que7BqU9sJcmtSpBoHTs8=kybvDZAT_Ai151qZQOCscA@mail.gmail.com>

Hi Jason,

On Wed, May 30, 2018 at 02:01:05AM +0200, Jason A. Donenfeld wrote:
> Hey Pablo,
> 
> Some OpenWRT people have reported to me that there's a crash when
> enabling flow offloading, because I rely on skb_dst(skb) being
> non-null in ndo_start_xmit. The fix in my code for this is very
> simple:
> 
> - mtu = dst_mtu(skb_dst(skb));
> + dst = skb_dst(skb);
> + mtu = dst ? dst_mtu(dst) : dev->mtu;
> 
> I can make this change, but I wanted to be certain first that omitting
> the dst in the skb is intentional on your part. (If so, there might be
> other drivers to fix as well.) In tracing this, it looks like a packet
> that's forwarded from a flow offloaded interface to a virtual
> interface gets diverted immediately via neigh_xmit, where it is then
> passed to a virtual interface via dev_queue_xmit. I can't see anywhere
> along this path a call to skb_dst_set. Perhaps this is intended, as
> flow offloading is supposed to skip the routing table? Or is there an
> oversight in the new flow offloading code?
> 
> I'd appreciate your input, so that I can make the appropriate change
> -- or not -- to my code.

If there a more drivers in-tree that need this, we may add
skb_dst_set_noref() calls to _hook function in the flowtable codebase.

^ permalink raw reply

* Re: Missing skb->dst with flow offloading
From: Jason A. Donenfeld @ 2018-05-30 18:14 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Netdev, netfilter-devel, Jaap Buurman, openwrt-devel,
	WireGuard mailing list, Felix Fietkau
In-Reply-To: <20180530180547.kn4cikprj7drhlc3@salvia>

Hey Pablo,

On Wed, May 30, 2018 at 8:05 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> If there a more drivers in-tree that need this, we may add
> skb_dst_set_noref() calls to _hook function in the flowtable codebase.

Can I, then, take that as an implicit acknowledgement that this
observed behavior on OpenWRT is to be expected with the current state
of events, and that I should patch my driver accordingly?

As one example of this in tree, take a look at vxlan -- it's using it
for the mtu/pmtu exactly as WireGuard does.

Regards,
Jason

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info
From: Dmitry V. Levin @ 2018-05-30 18:18 UTC (permalink / raw)
  To: Eugene Syromiatnikov
  Cc: netdev, linux-kernel, Martin KaFai Lau, Daniel Borkmann,
	Alexei Starovoitov, David S. Miller, Jiri Olsa, Ingo Molnar,
	Lawrence Brakmo, Andrey Ignatov, Jakub Kicinski, John Fastabend
In-Reply-To: <20180527112842.GA18204@asgard.redhat.com>

[-- Attachment #1: Type: text/plain, Size: 760 bytes --]

On Sun, May 27, 2018 at 01:28:42PM +0200, Eugene Syromiatnikov wrote:
> Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
> has broken compat, as offsets of these fields are different in 32-bit
> and 64-bit ABIs.  One fix (other than implementing compat support in
> syscall in order to handle this discrepancy) is to use __aligned_u64
> instead of __u64 for these fields.
> 
> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> Fixes: 52775b33bb507 ("bpf: offload: report device information about
> offloaded maps")
> Fixes: 675fc275a3a2d ("bpf: offload: report device information for
> offloaded programs")

Reviewed-by: "Dmitry V. Levin" <ldv@altlinux.org>
Cc: <stable@vger.kernel.org> # v4.16+

Thanks,


-- 
ldv

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH v2] ixgbe: check ipsec ip addr against mgmt filters
From: Shannon Nelson @ 2018-05-30 18:20 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: netdev

Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
v2 - added the BMC IP check

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 88 ++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f..e1c9762 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -445,6 +445,89 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs,
 }
 
 /**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+	struct net_device *dev = xs->xso.dev;
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 mfval, manc, reg;
+	int num_filters = 4;
+	bool manc_ipv4;
+	u32 bmcipval;
+	int i, j;
+
+#define MANC_EN_IPV4_FILTER      BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)        (IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+#define IXGBE_BMCIP(_n)          (0x5050 + ((_n) * 4))
+#define IXGBE_BMCIPVAL           0x5060
+#define BMCIP_V4                 0x2
+#define BMCIP_V6                 0x3
+#define BMCIP_MASK               0x3
+
+	manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+	manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+	mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+	bmcipval = IXGBE_READ_REG(hw, IXGBE_BMCIPVAL);
+
+	if (xs->props.family == AF_INET) {
+		/* are there any IPv4 filters to check? */
+		if (manc_ipv4) {
+			/* the 4 ipv4 filters are all in MIPAF(3, i) */
+			for (i = 0; i < num_filters; i++) {
+				if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+					continue;
+
+				reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+				if (reg == xs->id.daddr.a4)
+					return 1;
+			}
+		}
+
+		if ((bmcipval & BMCIP_MASK) == BMCIP_V4) {
+			reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(3));
+			if (reg == xs->id.daddr.a4)
+				return 1;
+		}
+
+	} else {
+		/* if there are ipv4 filters, they are in the last ipv6 slot */
+		if (manc_ipv4)
+			num_filters = 3;
+
+		for (i = 0; i < num_filters; i++) {
+			if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+				continue;
+
+			for (j = 0; j < 4; j++) {
+				reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+				if (reg != xs->id.daddr.a6[j])
+					break;
+			}
+			if (j == 4)   /* did we match all 4 words? */
+				return 1;
+		}
+
+		if ((bmcipval & BMCIP_MASK) == BMCIP_V6) {
+			for (j = 0; j < 4; j++) {
+				reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(j));
+				if (reg != xs->id.daddr.a6[j])
+					break;
+			}
+			if (j == 4)   /* did we match all 4 words? */
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
  **/
@@ -465,6 +548,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+		netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+		return -EINVAL;
+	}
+
 	if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
 		struct rx_sa rsa;
 
-- 
2.7.4

^ permalink raw reply related

* Re: Missing skb->dst with flow offloading
From: Pablo Neira Ayuso @ 2018-05-30 18:24 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Felix Fietkau, netfilter-devel, Netdev, openwrt-devel,
	Jaap Buurman, WireGuard mailing list
In-Reply-To: <CAHmME9p-Wdsd7=BaQJSnPKyPc+xyQkdMLGUz48O5b7yVwAmHzg@mail.gmail.com>

On Wed, May 30, 2018 at 08:14:42PM +0200, Jason A. Donenfeld wrote:
> Hey Pablo,
> 
> On Wed, May 30, 2018 at 8:05 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > If there a more drivers in-tree that need this, we may add
> > skb_dst_set_noref() calls to _hook function in the flowtable codebase.
> 
> Can I, then, take that as an implicit acknowledgement that this
> observed behavior on OpenWRT is to be expected with the current state
> of events, and that I should patch my driver accordingly?
> 
> As one example of this in tree, take a look at vxlan -- it's using it
> for the mtu/pmtu exactly as WireGuard does.

May it crash the kernel because it's assuming is set? If so, then
I'd appreciate if you send us a patch to
netfilter-devel@vger.kernel.org.

Please, use the nf-next.git tree to patch nf_flow_offload_ip_hook()
and nf_flow_offload_ip6_hook(), it's rather late, we'll request a
-stable submission for this if needed.

Thanks.

^ permalink raw reply

* [PATCH V2 mlx5-next 0/2] Mellanox, mlx5 new device events
From: Saeed Mahameed @ 2018-05-30 17:59 UTC (permalink / raw)
  To: netdev, linux-rdma; +Cc: Leon Romanovsky, Jason Gunthorpe, Saeed Mahameed

Hi, 

The following series is for mlx5-next tree [1], it adds the support of two
new device events, from Ilan Tayari:

1. High temperature warnings.
2. FPGA QP error event.

In case of no objection this series will be applied to mlx5-next tree
and will be sent later as a pull request to both rdma and net trees.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-next

v1->v2:
  - improve commit message of the FPGA QP error event patch.

Thanks,
Saeed.

Ilan Tayari (2):
  net/mlx5: Add temperature warning event to log
  net/mlx5: Add FPGA QP error event

 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 28 +++++++++++++++++++-
 include/linux/mlx5/device.h                  |  8 ++++++
 include/linux/mlx5/mlx5_ifc.h                |  3 ++-
 include/linux/mlx5/mlx5_ifc_fpga.h           | 16 +++++++++++
 4 files changed, 53 insertions(+), 2 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH V2 mlx5-next 1/2] net/mlx5: Add temperature warning event to log
From: Saeed Mahameed @ 2018-05-30 17:59 UTC (permalink / raw)
  To: netdev, linux-rdma
  Cc: Leon Romanovsky, Jason Gunthorpe, Ilan Tayari, Adi Nissim,
	Saeed Mahameed
In-Reply-To: <20180530175950.9488-1-saeedm@mellanox.com>

From: Ilan Tayari <ilant@mellanox.com>

Temperature warning event is sent by FW to indicate high temperature
as detected by one of the sensors on the board.
Add handling of this event by writing the numbers of the alert sensors
to the kernel log.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 ++++++++++++++++++++
 include/linux/mlx5/device.h                  |  7 ++++++
 include/linux/mlx5/mlx5_ifc.h                |  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index c1c94974e16b..4bd4f011f0a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -141,6 +141,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_GPIO_EVENT";
 	case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
 		return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
+	case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+		return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT";
 	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
 		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
 	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
@@ -393,6 +395,20 @@ static void general_event_handler(struct mlx5_core_dev *dev,
 	}
 }
 
+static void mlx5_temp_warning_event(struct mlx5_core_dev *dev,
+				    struct mlx5_eqe *eqe)
+{
+	u64 value_lsb;
+	u64 value_msb;
+
+	value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
+
+	mlx5_core_warn(dev,
+		       "High temperature on sensors with bit set %llx %llx",
+		       value_msb, value_lsb);
+}
+
 /* caller must eventually call mlx5_cq_put on the returned cq */
 static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 {
@@ -547,6 +563,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
 			break;
 
+		case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+			mlx5_temp_warning_event(dev, eqe);
+			break;
+
 		case MLX5_EVENT_TYPE_GENERAL_EVENT:
 			general_event_handler(dev, eqe);
 			break;
@@ -824,6 +844,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
 
+	if (MLX5_CAP_GEN(dev, temp_warn_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
+
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2bc27f8c5b87..eddacee5cf61 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -314,6 +314,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
 	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
 	MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
+	MLX5_EVENT_TYPE_TEMP_WARN_EVENT    = 0x17,
 	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
 	MLX5_EVENT_TYPE_GENERAL_EVENT	   = 0x22,
 	MLX5_EVENT_TYPE_PPS_EVENT          = 0x25,
@@ -626,6 +627,11 @@ struct mlx5_eqe_dct {
 	__be32  dctn;
 };
 
+struct mlx5_eqe_temp_warning {
+	__be64 sensor_warning_msb;
+	__be64 sensor_warning_lsb;
+} __packed;
+
 union ev_data {
 	__be32				raw[7];
 	struct mlx5_eqe_cmd		cmd;
@@ -642,6 +648,7 @@ union ev_data {
 	struct mlx5_eqe_port_module	port_module;
 	struct mlx5_eqe_pps		pps;
 	struct mlx5_eqe_dct             dct;
+	struct mlx5_eqe_temp_warning	temp_warning;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 10c1613d9434..ba30c26aa6eb 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -926,7 +926,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_msg[0x5];
 	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1d0[0x1];
+	u8         temp_warn_event[0x1];
 	u8         dcbx[0x1];
 	u8         general_notification_event[0x1];
 	u8         reserved_at_1d3[0x2];
-- 
2.17.0

^ permalink raw reply related

* [PATCH V2 mlx5-next 2/2] net/mlx5: Add FPGA QP error event
From: Saeed Mahameed @ 2018-05-30 17:59 UTC (permalink / raw)
  To: netdev, linux-rdma
  Cc: Leon Romanovsky, Jason Gunthorpe, Ilan Tayari, Adi Nissim,
	Saeed Mahameed
In-Reply-To: <20180530175950.9488-1-saeedm@mellanox.com>

From: Ilan Tayari <ilant@mellanox.com>

The FPGA queue pair (QP) event fires whenever a QP on the FPGA
transitions to the error state.

At this stage, this event is unrecoverable, it may become recoverable
in the future.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c |  7 +++++--
 include/linux/mlx5/device.h                  |  1 +
 include/linux/mlx5/mlx5_ifc.h                |  1 +
 include/linux/mlx5/mlx5_ifc_fpga.h           | 16 ++++++++++++++++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 4bd4f011f0a9..77c685645c66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE";
 	case MLX5_EVENT_TYPE_FPGA_ERROR:
 		return "MLX5_EVENT_TYPE_FPGA_ERROR";
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
 	case MLX5_EVENT_TYPE_GENERAL_EVENT:
 		return "MLX5_EVENT_TYPE_GENERAL_EVENT";
 	default:
@@ -560,6 +562,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			break;
 
 		case MLX5_EVENT_TYPE_FPGA_ERROR:
+		case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
 			mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
 			break;
 
@@ -839,11 +842,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
 
 	if (MLX5_CAP_GEN(dev, fpga))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR);
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) |
+				    (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR);
 	if (MLX5_CAP_GEN_MAX(dev, dct))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
-
 	if (MLX5_CAP_GEN(dev, temp_warn_event))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index eddacee5cf61..71e1dc2523a6 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -331,6 +331,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_DCT_DRAINED        = 0x1c,
 
 	MLX5_EVENT_TYPE_FPGA_ERROR         = 0x20,
+	MLX5_EVENT_TYPE_FPGA_QP_ERROR      = 0x21,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ba30c26aa6eb..3e8845dc85fe 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -60,6 +60,7 @@ enum {
 	MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION        = 0xa,
 	MLX5_EVENT_TYPE_CODING_PAGE_REQUEST                        = 0xb,
 	MLX5_EVENT_TYPE_CODING_FPGA_ERROR                          = 0x20,
+	MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR                       = 0x21
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h
index ec052491ba3d..7ddca31fa05d 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -432,6 +432,22 @@ struct mlx5_ifc_ipsec_counters_bits {
 	u8         dropped_cmd[0x40];
 };
 
+enum {
+	MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED  = 0x1,
+	MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED            = 0x2,
+};
+
+struct mlx5_ifc_fpga_qp_error_event_bits {
+	u8         reserved_at_0[0x40];
+
+	u8         reserved_at_40[0x18];
+	u8         syndrome[0x8];
+
+	u8         reserved_at_60[0x60];
+
+	u8         reserved_at_c0[0x8];
+	u8         fpga_qpn[0x18];
+};
 enum mlx5_ifc_fpga_ipsec_response_syndrome {
 	MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0,
 	MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1,
-- 
2.17.0

^ permalink raw reply related

* Re: Missing skb->dst with flow offloading
From: Jason A. Donenfeld @ 2018-05-30 18:30 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Netdev, netfilter-devel, Jaap Buurman, openwrt-devel,
	WireGuard mailing list, Felix Fietkau
In-Reply-To: <20180530182442.wbgke26kaoau2du2@salvia>

On Wed, May 30, 2018 at 8:24 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> May it crash the kernel because it's assuming is set? If so, then
> I'd appreciate if you send us a patch to

I suspect it won't crash, but the pmtu might wind up wrong / not calculated.

> Please, use the nf-next.git tree to patch nf_flow_offload_ip_hook()
> and nf_flow_offload_ip6_hook(), it's rather late, we'll request a
> -stable submission for this if needed.

Given the above, I'll submit a patch, though I don't suppose it will
be necessary for -stable.

^ permalink raw reply

* Re: [PATCH] netfilter: nfnetlink: Remove VLA usage
From: Kees Cook @ 2018-05-30 19:08 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Jozsef Kadlecsik, Florian Westphal, David S. Miller,
	netfilter-devel, coreteam, Network Development, LKML
In-Reply-To: <20180530095249.3jdhb5gdwt24cnsa@salvia>

On Wed, May 30, 2018 at 2:52 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Tue, May 29, 2018 at 05:35:25PM -0700, Kees Cook wrote:
>> In the quest to remove all stack VLA usage from the kernel[1], this
>> allocates the maximum size expected for all possible attrs and adds
>> a sanity-check to make sure nothing gets out of sync.
>>
>> [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
>>
>> Signed-off-by: Kees Cook <keescook@chromium.org>
>> ---
>>  net/netfilter/nfnetlink.c | 22 ++++++++++++++++++++--
>>  1 file changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
>> index 03ead8a9e90c..0cb395f9627e 100644
>> --- a/net/netfilter/nfnetlink.c
>> +++ b/net/netfilter/nfnetlink.c
>> @@ -28,6 +28,7 @@
>>
>>  #include <net/netlink.h>
>>  #include <linux/netfilter/nfnetlink.h>
>> +#include <linux/netfilter/nf_tables.h>
>>
>>  MODULE_LICENSE("GPL");
>>  MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
>> @@ -37,6 +38,11 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
>>       rcu_dereference_protected(table[(id)].subsys, \
>>                                 lockdep_nfnl_is_held((id)))
>>
>> +#define NFTA_MAX_ATTR        max(max(max(NFTA_CHAIN_MAX, NFTA_FLOWTABLE_MAX),\
>> +                             max(NFTA_OBJ_MAX, NFTA_RULE_MAX)),      \
>> +                         max(NFTA_TABLE_MAX,                         \
>> +                             max(NFTA_SET_ELEM_LIST_MAX, NFTA_SET_MAX)))
>
> This is very specific of nftables, there are other nf subsystems using
> nfnetlink that may go over this maximum attribute value (grep from
> "struct nfnetlink_subsystem").

Oops, yes. I see that now.

> To remove the VLA, I think we need an artificial maximum attribute
> that reasonably large enough.

git grep insanity:

$ for i in $(git grep -A10 'static .*struct nfnetlink_subsystem' | \
                 grep '\.cb\b' | awk '{print $NF}' | cut -d, -f1)
do git grep -A100 'static .*'"$i\b"; done | \
    grep '\.attr_count\b' | awk -F= '{print $NF}' | \
    sed -e 's/ //g' | cut -d, -f1 | sort -u

and manual counting gets me:

CTA_EXPECT_MAX 12
CTA_MAX 25
CTA_TIMEOUT_MAX 6
IPSET_ATTR_CMD_MAX 11
NFACCT_MAX 9
NFCTH_MAX 7
NFQA_CFG_MAX 6
NFQA_MAX 21
NFTA_CHAIN_MAX 10
NFTA_COMPAT_MAX 4
NFTA_OBJ_MAX 8
NFTA_RULE_MAX 10
NFTA_SET_ELEM_LIST_MAX 5
NFTA_SET_MAX 17
NFTA_TABLE_MAX 6
NFULA_CFG_MAX 7
NFULA_MAX 20
OSF_ATTR_MAX 2

How about 32?

As Florian suggested, I'll add the check in
nfnetlink_subsys_register() with a WARN().

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* [PATCH] iproute2: fix 'ip xfrm monitor all' command
From: Nathan Harold @ 2018-05-30 19:11 UTC (permalink / raw)
  To: netdev; +Cc: Nathan Harold

Currently, calling 'ip xfrm monitor all' will
actually invoke the 'all-nsid' command because the
soft-match for 'all-nsid' occurs before the precise
match for 'all'. This patch rearranges the checks
so that the 'all' command, itself an alias for
invoking 'ip xfrm monitor' with no argument, can
be called consistent with the syntax for other ip
commands that accept an 'all'.

Signed-off-by: Nathan Harold <nharold@google.com>
---
 ip/xfrm_monitor.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ip/xfrm_monitor.c b/ip/xfrm_monitor.c
index 2eabece0..5d086768 100644
--- a/ip/xfrm_monitor.c
+++ b/ip/xfrm_monitor.c
@@ -359,6 +359,8 @@ int do_xfrm_monitor(int argc, char **argv)
 		if (matches(*argv, "file") == 0) {
 			NEXT_ARG();
 			file = *argv;
+		} else if (strcmp(*argv, "all") == 0) {
+			/* fall out */
 		} else if (matches(*argv, "all-nsid") == 0) {
 			listen_all_nsid = 1;
 		} else if (matches(*argv, "acquire") == 0) {
@@ -381,7 +383,7 @@ int do_xfrm_monitor(int argc, char **argv)
 			groups = 0;
 		} else if (matches(*argv, "help") == 0) {
 			usage();
-		} else if (strcmp(*argv, "all")) {
+		} else {
 			fprintf(stderr, "Argument \"%s\" is unknown, try \"ip xfrm monitor help\".\n", *argv);
 			exit(-1);
 		}
-- 
2.17.1.1185.g55be947832-goog

^ permalink raw reply related

* [PATCH v2] netfilter: nfnetlink: Remove VLA usage
From: Kees Cook @ 2018-05-30 19:17 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Jozsef Kadlecsik, Florian Westphal, David S. Miller,
	netfilter-devel, coreteam, netdev, linux-kernel

In the quest to remove all stack VLA usage from the kernel[1], this
allocates the maximum size expected for all possible attrs and adds
sanity-checks at both registration and usage to make sure nothing
gets out of sync.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
---
v2: Use a static value and add a check in nfnetlink_subsys_register()
---
 net/netfilter/nfnetlink.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..1a80fdcf008f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -37,6 +37,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 	rcu_dereference_protected(table[(id)].subsys, \
 				  lockdep_nfnl_is_held((id)))
 
+#define NFNL_MAX_ATTR_COUNT	32
+
 static struct {
 	struct mutex				mutex;
 	const struct nfnetlink_subsystem __rcu	*subsys;
@@ -76,6 +78,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
+	u8 cb_id;
+
+	/* Sanity-check attr_count size to avoid stack buffer overflow. */
+	for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+		if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+			return -EINVAL;
+
 	nfnl_lock(n->subsys_id);
 	if (table[n->subsys_id].subsys) {
 		nfnl_unlock(n->subsys_id);
@@ -185,11 +194,17 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	{
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 		u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+		struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
 		__u8 subsys_id = NFNL_SUBSYS_ID(type);
 
+		/* Sanity-check NFNL_MAX_ATTR_COUNT */
+		if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+
 		err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
 				ss->cb[cb_id].policy, extack);
 		if (err < 0) {
@@ -379,10 +394,16 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+			struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
 
+			/* Sanity-check NFTA_MAX_ATTR */
+			if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+				err = -ENOMEM;
+				goto ack;
+			}
+
 			err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
 					attrlen, ss->cb[cb_id].policy, NULL);
 			if (err < 0)
-- 
2.17.0


-- 
Kees Cook
Pixel Security

^ permalink raw reply related

* [PATCH bpf-next] bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families
From: dsahern @ 2018-05-30 19:24 UTC (permalink / raw)
  To: netdev, borkmann, ast; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address
families. Allows userspace to probe for support as more are added
(e.g., AF_MPLS).

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 4cff6d9cd724..a2b96e44b2c1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4285,7 +4285,7 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 					   flags, true);
 #endif
 	}
-	return 0;
+	return -EAFNOSUPPORT;
 }
 
 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
@@ -4302,7 +4302,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
-	int index = 0;
+	int index = -EAFNOSUPPORT;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout
From: Alexei Starovoitov @ 2018-05-30 19:29 UTC (permalink / raw)
  To: Prashant Bhole
  Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	David S . Miller, Shuah Khan, netdev, linux-kselftest
In-Reply-To: <20180530055611.10216-4-bhole_prashant_q7@lab.ntt.co.jp>

On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:
> In order to reduce runtime of tests, recently timout for select() call
> was reduced from 1sec to 10usec. This was causing many tests failures.
> It was caught with failure handling commits in this series.
> 
> Restoring the timeout from 10usec to 1sec
> 
> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> ---
>  tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index 64f9e25c451f..9d01f5c2abe2 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
>  		if (err < 0)
>  			perror("recv start time: ");
>  		while (s->bytes_recvd < total_bytes) {
> -			timeout.tv_sec = 0;
> -			timeout.tv_usec = 10;
> +			timeout.tv_sec = 1;
> +			timeout.tv_usec = 0;

I've applied the set, but had to revert it, since it takes too long.

real	1m40.124s
user	0m0.375s
sys	0m14.521s

Myself and Daniel run the test semi-manually when we apply patches.
Adding 2 extra minutes of wait time is unnecessary.
Especially since most of it is idle time.
Please find a way to fix tests differently.
btw I don't see any failures today. Not sure what is being fixed
by incresing a timeout.

Also please mention [PATCH bpf-next] in the subject when you respin.
Thanks!

^ permalink raw reply

* Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout
From: John Fastabend @ 2018-05-30 19:59 UTC (permalink / raw)
  To: Alexei Starovoitov, Prashant Bhole
  Cc: Alexei Starovoitov, Daniel Borkmann, David S . Miller, Shuah Khan,
	netdev, linux-kselftest
In-Reply-To: <20180530192942.4ahcaxupbhfkopl4@ast-mbp.dhcp.thefacebook.com>

On 05/30/2018 12:29 PM, Alexei Starovoitov wrote:
> On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:
>> In order to reduce runtime of tests, recently timout for select() call
>> was reduced from 1sec to 10usec. This was causing many tests failures.
>> It was caught with failure handling commits in this series.
>>
>> Restoring the timeout from 10usec to 1sec
>>
>> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
>> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
>> ---
>>  tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
>> index 64f9e25c451f..9d01f5c2abe2 100644
>> --- a/tools/testing/selftests/bpf/test_sockmap.c
>> +++ b/tools/testing/selftests/bpf/test_sockmap.c
>> @@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
>>  		if (err < 0)
>>  			perror("recv start time: ");
>>  		while (s->bytes_recvd < total_bytes) {
>> -			timeout.tv_sec = 0;
>> -			timeout.tv_usec = 10;
>> +			timeout.tv_sec = 1;
>> +			timeout.tv_usec = 0;
> 
> I've applied the set, but had to revert it, since it takes too long.
> 
> real	1m40.124s
> user	0m0.375s
> sys	0m14.521s
> 

Dang, I thought it would be a bit longer but not minutes.

> Myself and Daniel run the test semi-manually when we apply patches.> Adding 2 extra minutes of wait time is unnecessary.

Yep.

> Especially since most of it is idle time.
> Please find a way to fix tests differently.
> btw I don't see any failures today. Not sure what is being fixed
> by incresing a timeout.
> 

Calling these fixes is a bit much, they are primarily improvements.

The background is, when I originally wrote the tests my goal was to
exercise the kernel code paths. Because of this I didn't really care if
the tests actually sent/recv all bytes in the test. (I have long
running tests using netperf/wrk/apached/etc. for that) But, the manual
tests do have an option to verify the data if specified. The 'verify'
option is a bit fragile in that with the right tests (e.g. drop)
or the certain options (e.g. cork) it can fail which is expected.

What Prashant added was support to actually verify the data correctly.
And also fix a few cgroup handling and some pretty printing as well.
He noticed the low timeout causing issue in these cases though so
increased it.

@Prashant, how about increasing this less dramatically because now
all cork tests are going to stall for 1s unless perfectly aligned.
How about 100us? Or even better we can conditionally set it based
on if tx_cork is set. If tx_cork is set use 1us otherwise use 200us
or something. (1s is really to high in any cases for lo)

Also capturing some of the above in the cover letter would help
folks understand the context a bit better.

Thanks!
John

^ permalink raw reply

* [PATCH net-next] net: phy: consider PHY_IGNORE_INTERRUPT in state machine PHY_NOLINK handling
From: Heiner Kallweit @ 2018-05-30 20:13 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, David Miller; +Cc: netdev@vger.kernel.org

We can bail out immediately also in case of PHY_IGNORE_INTERRUPT because
phy_mac_interupt() informs us once the link is up.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef..537297d2 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -894,7 +894,7 @@ void phy_state_machine(struct work_struct *work)
 			needs_aneg = true;
 		break;
 	case PHY_NOLINK:
-		if (phy_interrupt_is_valid(phydev))
+		if (phydev->irq != PHY_POLL)
 			break;
 
 		err = phy_read_status(phydev);
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH net-next v2 0/2] net: phy: improve PHY suspend/resume
From: Heiner Kallweit @ 2018-05-30 20:22 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Florian Fainelli, David Miller, netdev@vger.kernel.org
In-Reply-To: <20180523220418.GB5128@lunn.ch>

Am 24.05.2018 um 00:04 schrieb Andrew Lunn:
> On Wed, May 23, 2018 at 10:15:29PM +0200, Heiner Kallweit wrote:
>> I have the issue that suspending the MAC-integrated PHY gives an
>> error during system suspend. The sequence is:
>>
>> 1. unconnected PHY/MAC are runtime-suspended already
>> 2. system suspend commences
>> 3. mdio_bus_phy_suspend is called
>> 4. suspend callback of the network driver is called (implicitly
>>    MAC/PHY are runtime-resumed before)
>> 5. suspend callback suspends MAC/PHY
>>
>> The problem occurs in step 3. phy_suspend() fails because the MDIO
>> bus isn't accessible due to the chip being runtime-suspended.
> 
> I think you are fixing the wrong problem. I've had the same with the
> FEC driver. I fixed it by making the MDIO operations runtime-suspend
> aware:
> 
I checked the fec driver and there it's reletively easy because
runtime suspend/resume is just about disabling/enabling one clock.

In case of r8169 runtime suspend/resume do much more and there would
be quite some potential deadlock issues to take care of.
To provide one example:
pm_runtime_get_sync() would be called with the mdio bus lock being
held (from mdio write op), runtime-resuming however includes PHY
writes what would result in a deadlock.

I think we need a better solution than spending the effort needed
to make the MDIO ops runtime-pm-aware. In general there seems to be
just one network driver using both phylib and runtime pm, so most
drivers aren't affected (yet).

I will spend few more thoughts on a solution ..

Regards, Heiner

> commit 8fff755e9f8d0f70a595e79f248695ce6aef5cc3
> Author: Andrew Lunn <andrew@lunn.ch>
> Date:   Sat Jul 25 22:38:02 2015 +0200
> 
>     net: fec: Ensure clocks are enabled while using mdio bus
>     
>     When a switch is attached to the mdio bus, the mdio bus can be used
>     while the interface is not open. If the IPG clock is not enabled, MDIO
>     reads/writes will simply time out.
>     
>     Add support for runtime PM to control this clock. Enable/disable this
>     clock using runtime PM, with open()/close() and mdio read()/write()
>     function triggering runtime PM operations. Since PM is optional, the
>     IPG clock is enabled at probe and is no longer modified by
>     fec_enet_clk_enable(), thus if PM is not enabled in the kernel, it is
>     guaranteed the clock is running when MDIO operations are performed.
> 
> Don't copy this patch 1:1. I introduced a few bugs which took a while
> to be shaken out :-(
> 
>    Andrew
> 

^ permalink raw reply

* Re: [PATCH net 1/2] ip_tunnel: restore binding to ifaces with a large mtu
From: Ido Schimmel @ 2018-05-30 20:29 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, petrm, idosch, netdev
In-Reply-To: <20180530082843.6076-2-nicolas.dichtel@6wind.com>

On Wed, May 30, 2018 at 10:28:42AM +0200, Nicolas Dichtel wrote:
> After commit f6cc9c054e77, the following conf is broken (note that the
> default loopback mtu is 65536, ie IP_MAX_MTU + 1):
> 
> $ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev lo
> add tunnel "gre0" failed: Invalid argument
> $ ip l a type dummy
> $ ip l s dummy1 up
> $ ip l s dummy1 mtu 65535
> $ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev dummy1
> add tunnel "gre0" failed: Invalid argument
> 
> dev_set_mtu() doesn't allow to set a mtu which is too large.
> First, let's cap the mtu returned by ip_tunnel_bind_dev(). Second, remove
> the magic value 0xFFF8 and use IP_MAX_MTU instead.
> 0xFFF8 seems to be there for ages, I don't know why this value was used.
> 
> With a recent kernel, it's also possible to set a mtu > IP_MAX_MTU:
> $ ip l s dummy1 mtu 66000
> After that patch, it's also possible to bind an ip tunnel on that kind of
> interface.
> 
> CC: Petr Machata <petrm@mellanox.com>
> CC: Ido Schimmel <idosch@mellanox.com>
> Link: https://git.kernel.org/pub/scm/linux/kernel/git/davem/netdev-vger-cvs.git/commit/?id=e5afd356a411a
> Fixes: f6cc9c054e77 ("ip_tunnel: Emit events for post-register MTU changes")
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

Reviewed-by: Ido Schimmel <idosch@mellanox.com>

There is another instance of this magic number in the file, but it's
written in lower case so you might have missed it - see
ip_tunnel_newlink(). Can you please take care of it in v2?

Thanks for the fix, Nicolas!

^ permalink raw reply

* Re: suspicius csum initialization in vmxnet3_rx_csum
From: Paolo Abeni @ 2018-05-30 20:29 UTC (permalink / raw)
  To: Guolin Yang, Ronak Doshi; +Cc: Neil Horman, Boon Ang, Louis Luo, netdev
In-Reply-To: <BLUPR05MB1907699C14C2F900BA57323DD96A0@BLUPR05MB1907.namprd05.prod.outlook.com>

Hi,

On Thu, 2018-05-24 at 21:48 +0000, Guolin Yang wrote:
> Yes, that code  is not correct, we should fix that code

Did you have any chance to address the issue and/or to give a more in-
deepth look to the change proposed in my initial email?

Thanks,

Paolo 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox