Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v3 6/8] net: mscc: improve the frame header parsing readability
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

This cosmetic patch improves the frame header parsing readability by
introducing a new macro to access and mask its fields.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot_board.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 990027f04d1b..5e4f1718dd99 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -16,24 +16,26 @@
 
 #include "ocelot.h"
 
-static int ocelot_parse_ifh(u32 *ifh, struct frame_info *info)
+#define IFH_EXTRACT_BITFIELD64(x, o, w) (((x) >> (o)) & GENMASK_ULL((w) - 1, 0))
+
+static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info)
 {
-	int i;
 	u8 llen, wlen;
+	u64 ifh[2];
+
+	ifh[0] = be64_to_cpu(((__force __be64 *)_ifh)[0]);
+	ifh[1] = be64_to_cpu(((__force __be64 *)_ifh)[1]);
 
-	/* The IFH is in network order, switch to CPU order */
-	for (i = 0; i < IFH_LEN; i++)
-		ifh[i] = ntohl((__force __be32)ifh[i]);
+	wlen = IFH_EXTRACT_BITFIELD64(ifh[0], 7,  8);
+	llen = IFH_EXTRACT_BITFIELD64(ifh[0], 15,  6);
 
-	wlen = (ifh[1] >> 7) & 0xff;
-	llen = (ifh[1] >> 15) & 0x3f;
 	info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80;
 
-	info->port = (ifh[2] & GENMASK(14, 11)) >> 11;
+	info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4);
 
-	info->cpuq = (ifh[3] & GENMASK(27, 20)) >> 20;
-	info->tag_type = (ifh[3] & BIT(16)) >> 16;
-	info->vid = ifh[3] & GENMASK(11, 0);
+	info->cpuq = IFH_EXTRACT_BITFIELD64(ifh[1], 20, 8);
+	info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16,  1);
+	info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0,  12);
 
 	return 0;
 }
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 5/8] net: mscc: describe the PTP register range
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

This patch adds support for using the PTP register range, and adds a
description of its registers. This bank is used when configuring PTP.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot.h       |  9 ++++++
 drivers/net/ethernet/mscc/ocelot_board.c | 10 +++++-
 drivers/net/ethernet/mscc/ocelot_ptp.h   | 41 ++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_regs.c  | 11 +++++++
 4 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mscc/ocelot_ptp.h

diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index f7eeb4806897..e0da8b4eddf2 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -23,6 +23,7 @@
 #include "ocelot_sys.h"
 #include "ocelot_qs.h"
 #include "ocelot_tc.h"
+#include "ocelot_ptp.h"
 
 #define PGID_AGGR    64
 #define PGID_SRC     80
@@ -71,6 +72,7 @@ enum ocelot_target {
 	SYS,
 	S2,
 	HSIO,
+	PTP,
 	TARGET_MAX,
 };
 
@@ -343,6 +345,13 @@ enum ocelot_reg {
 	S2_CACHE_ACTION_DAT,
 	S2_CACHE_CNT_DAT,
 	S2_CACHE_TG_DAT,
+	PTP_PIN_CFG = PTP << TARGET_OFFSET,
+	PTP_PIN_TOD_SEC_MSB,
+	PTP_PIN_TOD_SEC_LSB,
+	PTP_PIN_TOD_NSEC,
+	PTP_CFG_MISC,
+	PTP_CLK_CFG_ADJ_CFG,
+	PTP_CLK_CFG_ADJ_FREQ,
 };
 
 enum ocelot_regfield {
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 2451d4a96490..990027f04d1b 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -182,6 +182,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 	struct {
 		enum ocelot_target id;
 		char *name;
+		u8 optional:1;
 	} res[] = {
 		{ SYS, "sys" },
 		{ REW, "rew" },
@@ -189,6 +190,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 		{ ANA, "ana" },
 		{ QS, "qs" },
 		{ S2, "s2" },
+		{ PTP, "ptp", 1 },
 	};
 
 	if (!np && !pdev->dev.platform_data)
@@ -205,8 +207,14 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 		struct regmap *target;
 
 		target = ocelot_io_platform_init(ocelot, pdev, res[i].name);
-		if (IS_ERR(target))
+		if (IS_ERR(target)) {
+			if (res[i].optional) {
+				ocelot->targets[res[i].id] = NULL;
+				continue;
+			}
+
 			return PTR_ERR(target);
+		}
 
 		ocelot->targets[res[i].id] = target;
 	}
diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.h b/drivers/net/ethernet/mscc/ocelot_ptp.h
new file mode 100644
index 000000000000..9ede14a12573
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_ptp.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * License: Dual MIT/GPL
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_PTP_H_
+#define _MSCC_OCELOT_PTP_H_
+
+#define PTP_PIN_CFG_RSZ			0x20
+#define PTP_PIN_TOD_SEC_MSB_RSZ		PTP_PIN_CFG_RSZ
+#define PTP_PIN_TOD_SEC_LSB_RSZ		PTP_PIN_CFG_RSZ
+#define PTP_PIN_TOD_NSEC_RSZ		PTP_PIN_CFG_RSZ
+
+#define PTP_PIN_CFG_DOM			BIT(0)
+#define PTP_PIN_CFG_SYNC		BIT(2)
+#define PTP_PIN_CFG_ACTION(x)		((x) << 3)
+#define PTP_PIN_CFG_ACTION_MASK		PTP_PIN_CFG_ACTION(0x7)
+
+enum {
+	PTP_PIN_ACTION_IDLE = 0,
+	PTP_PIN_ACTION_LOAD,
+	PTP_PIN_ACTION_SAVE,
+	PTP_PIN_ACTION_CLOCK,
+	PTP_PIN_ACTION_DELTA,
+	PTP_PIN_ACTION_NOSYNC,
+	PTP_PIN_ACTION_SYNC,
+};
+
+#define PTP_CFG_MISC_PTP_EN		BIT(2)
+
+#define PSEC_PER_SEC			1000000000000LL
+
+#define PTP_CFG_CLK_ADJ_CFG_ENA		BIT(0)
+#define PTP_CFG_CLK_ADJ_CFG_DIR		BIT(1)
+
+#define PTP_CFG_CLK_ADJ_FREQ_NS		BIT(30)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_regs.c b/drivers/net/ethernet/mscc/ocelot_regs.c
index 6c387f994ec5..e59977d20400 100644
--- a/drivers/net/ethernet/mscc/ocelot_regs.c
+++ b/drivers/net/ethernet/mscc/ocelot_regs.c
@@ -234,6 +234,16 @@ static const u32 ocelot_s2_regmap[] = {
 	REG(S2_CACHE_TG_DAT,               0x000388),
 };
 
+static const u32 ocelot_ptp_regmap[] = {
+	REG(PTP_PIN_CFG,                   0x000000),
+	REG(PTP_PIN_TOD_SEC_MSB,           0x000004),
+	REG(PTP_PIN_TOD_SEC_LSB,           0x000008),
+	REG(PTP_PIN_TOD_NSEC,              0x00000c),
+	REG(PTP_CFG_MISC,                  0x0000a0),
+	REG(PTP_CLK_CFG_ADJ_CFG,           0x0000a4),
+	REG(PTP_CLK_CFG_ADJ_FREQ,          0x0000a8),
+};
+
 static const u32 *ocelot_regmap[] = {
 	[ANA] = ocelot_ana_regmap,
 	[QS] = ocelot_qs_regmap,
@@ -241,6 +251,7 @@ static const u32 *ocelot_regmap[] = {
 	[REW] = ocelot_rew_regmap,
 	[SYS] = ocelot_sys_regmap,
 	[S2] = ocelot_s2_regmap,
+	[PTP] = ocelot_ptp_regmap,
 };
 
 static const struct reg_field ocelot_regfields[] = {
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 4/8] MIPS: dts: mscc: describe the PTP ready interrupt
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

This patch adds a description of the PTP ready interrupt, which can be
triggered when a PTP timestamp is available on a hardware FIFO.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Acked-by: Paul Burton <paul.burton@mips.com>
---
 arch/mips/boot/dts/mscc/ocelot.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi
index 1e55a778def5..797d336db54d 100644
--- a/arch/mips/boot/dts/mscc/ocelot.dtsi
+++ b/arch/mips/boot/dts/mscc/ocelot.dtsi
@@ -139,8 +139,8 @@
 				    "port2", "port3", "port4", "port5", "port6",
 				    "port7", "port8", "port9", "port10", "qsys",
 				    "ana", "s2";
-			interrupts = <21 22>;
-			interrupt-names = "xtr", "inj";
+			interrupts = <18 21 22>;
+			interrupt-names = "ptp_rdy", "xtr", "inj";
 
 			ethernet-ports {
 				#address-cells = <1>;
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 3/8] Documentation/bindings: net: ocelot: document the PTP ready IRQ
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

One additional interrupt needs to be described within the Ocelot device
tree node: the PTP ready one. This patch documents the binding needed to
do so.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 Documentation/devicetree/bindings/net/mscc-ocelot.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/mscc-ocelot.txt b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
index 4d05a3b0f786..3b6290b45ce5 100644
--- a/Documentation/devicetree/bindings/net/mscc-ocelot.txt
+++ b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
@@ -17,9 +17,10 @@ Required properties:
   - "ana"
   - "portX" with X from 0 to the number of last port index available on that
     switch
-- interrupts: Should contain the switch interrupts for frame extraction and
-  frame injection
-- interrupt-names: should contain the interrupt names: "xtr", "inj"
+- interrupts: Should contain the switch interrupts for frame extraction,
+  frame injection and PTP ready.
+- interrupt-names: should contain the interrupt names: "xtr", "inj". Can contain
+  "ptp_rdy" which is optional due to backward compatibility.
 - ethernet-ports: A container for child nodes representing switch ports.
 
 The ethernet-ports container has the following properties
@@ -63,8 +64,8 @@ Example:
 			    "port2", "port3", "port4", "port5", "port6",
 			    "port7", "port8", "port9", "port10", "qsys",
 			    "ana";
-		interrupts = <21 22>;
-		interrupt-names = "xtr", "inj";
+		interrupts = <18 21 22>;
+		interrupt-names = "ptp_rdy", "xtr", "inj";
 
 		ethernet-ports {
 			#address-cells = <1>;
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 2/8] MIPS: dts: mscc: describe the PTP register range
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

This patch adds one register range within the mscc,vsc7514-switch node,
to describe the PTP registers.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Acked-by: Paul Burton <paul.burton@mips.com>
---
 arch/mips/boot/dts/mscc/ocelot.dtsi | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi
index 33ae74aaa1bb..1e55a778def5 100644
--- a/arch/mips/boot/dts/mscc/ocelot.dtsi
+++ b/arch/mips/boot/dts/mscc/ocelot.dtsi
@@ -120,6 +120,7 @@
 			reg = <0x1010000 0x10000>,
 			      <0x1030000 0x10000>,
 			      <0x1080000 0x100>,
+			      <0x10e0000 0x10000>,
 			      <0x11e0000 0x100>,
 			      <0x11f0000 0x100>,
 			      <0x1200000 0x100>,
@@ -134,7 +135,7 @@
 			      <0x1800000 0x80000>,
 			      <0x1880000 0x10000>,
 			      <0x1060000 0x10000>;
-			reg-names = "sys", "rew", "qs", "port0", "port1",
+			reg-names = "sys", "rew", "qs", "ptp", "port0", "port1",
 				    "port2", "port3", "port4", "port5", "port6",
 				    "port7", "port8", "port9", "port10", "qsys",
 				    "ana", "s2";
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 1/8] Documentation/bindings: net: ocelot: document the PTP bank
From: Antoine Tenart @ 2019-07-24  8:17 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver, ralf,
	paul.burton, jhogan
  Cc: Antoine Tenart, netdev, linux-mips, thomas.petazzoni,
	allan.nielsen
In-Reply-To: <20190724081715.29159-1-antoine.tenart@bootlin.com>

One additional register range needs to be described within the Ocelot
device tree node: the PTP. This patch documents the binding needed to do
so.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 Documentation/devicetree/bindings/net/mscc-ocelot.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/mscc-ocelot.txt b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
index 9e5c17d426ce..4d05a3b0f786 100644
--- a/Documentation/devicetree/bindings/net/mscc-ocelot.txt
+++ b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
@@ -12,6 +12,7 @@ Required properties:
   - "sys"
   - "rew"
   - "qs"
+  - "ptp" (optional due to backward compatibility)
   - "qsys"
   - "ana"
   - "portX" with X from 0 to the number of last port index available on that
@@ -44,6 +45,7 @@ Example:
 		reg = <0x1010000 0x10000>,
 		      <0x1030000 0x10000>,
 		      <0x1080000 0x100>,
+		      <0x10e0000 0x10000>,
 		      <0x11e0000 0x100>,
 		      <0x11f0000 0x100>,
 		      <0x1200000 0x100>,
@@ -57,9 +59,10 @@ Example:
 		      <0x1280000 0x100>,
 		      <0x1800000 0x80000>,
 		      <0x1880000 0x10000>;
-		reg-names = "sys", "rew", "qs", "port0", "port1", "port2",
-			    "port3", "port4", "port5", "port6", "port7",
-			    "port8", "port9", "port10", "qsys", "ana";
+		reg-names = "sys", "rew", "qs", "ptp", "port0", "port1",
+			    "port2", "port3", "port4", "port5", "port6",
+			    "port7", "port8", "port9", "port10", "qsys",
+			    "ana";
 		interrupts = <21 22>;
 		interrupt-names = "xtr", "inj";
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH] iproute2: devlink: use sys/queue.h from libbsd as a fallback
From: Sergei Trofimovich @ 2019-07-24  8:18 UTC (permalink / raw)
  To: netdev; +Cc: Sergei Trofimovich, Stephen Hemminger

sys/queue.h does not exist on linux-musl targets and
the build fails as:

    devlink.c:28:10: fatal error: sys/queue.h: No such file or directory
       28 | #include <sys/queue.h>
          |          ^~~~~~~~~~~~~

The change pulls in 'sys/queue.h' from libbsd in case
system headers don't already provide it.

Tested on linux-musl and linux-glibc.

Bug: https://bugs.gentoo.org/690486
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: netdev@vger.kernel.org
Signed-off-by: Sergei Trofimovich <slyfox@gentoo.org>
---
 configure         | 30 ++++++++++++++++++++++++++++++
 devlink/devlink.c |  9 ++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 45fcffb6..a1ee946f 100755
--- a/configure
+++ b/configure
@@ -323,6 +323,33 @@ check_cap()
 	fi
 }
 
+check_sys_queue()
+{
+    cat >$TMPDIR/queue_test.c <<EOF
+#include <sys/queue.h>
+struct nest_qentry {
+	int attr_type;
+	TAILQ_ENTRY(nest_qentry) nest_entries;
+};
+int main(int argc, char **argv) {
+	return 0;
+}
+EOF
+    if $CC -I$INCLUDE -o $TMPDIR/queue_test $TMPDIR/queue_test.c >/dev/null 2>&1; then
+	echo "no"
+    else
+	if ${PKG_CONFIG} libbsd --exists; then
+		echo 'CFLAGS += -DHAVE_LIBBSD_SYS_QUEUE' `${PKG_CONFIG} libbsd --cflags` >>$CONFIG
+		echo 'LDLIBS +=' `${PKG_CONFIG} libbsd --libs` >> $CONFIG
+		echo "no"
+	else
+		echo 'CFLAGS += -DNEED_SYS_QUEUE' >>$CONFIG
+		echo "yes"
+	fi
+    fi
+    rm -f $TMPDIR/queue_test.c $TMPDIR/queue_test
+}
+
 quiet_config()
 {
 	cat <<EOF
@@ -398,6 +425,9 @@ check_strlcpy
 echo -n "libcap support: "
 check_cap
 
+echo -n "need for sys/queue.h API: "
+check_sys_queue
+
 echo >> $CONFIG
 echo "%.o: %.c" >> $CONFIG
 echo '	$(QUIET_CC)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(CPPFLAGS) -c -o $@ $<' >> $CONFIG
diff --git a/devlink/devlink.c b/devlink/devlink.c
index bb023c0c..fd91198c 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -25,7 +25,14 @@
 #include <linux/devlink.h>
 #include <libmnl/libmnl.h>
 #include <netinet/ether.h>
-#include <sys/queue.h>
+#ifdef HAVE_LIBBSD_SYS_QUEUE
+#    include <bsd/sys/queue.h>
+#else
+#    include <sys/queue.h>
+#endif
+#ifdef NEED_SYS_QUEUE
+#    error "No <sys/queue.h> implementation found."
+#endif
 
 #include "SNAPSHOT.h"
 #include "list.h"
-- 
2.22.0


^ permalink raw reply related

* RE: [PATCH net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames
From: Ioana Ciocoi Radulescu @ 2019-07-24  8:15 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, Ioana Ciornei, Vladimir Oltean,
	Eric Dumazet, Edward Cree
In-Reply-To: <20190723.140255.1785812525450069326.davem@davemloft.net>

> -----Original Message-----
> From: David Miller <davem@davemloft.net>
> Sent: Wednesday, July 24, 2019 12:03 AM
> To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com>
> Cc: netdev@vger.kernel.org; Ioana Ciornei <ioana.ciornei@nxp.com>; Vladimir
> Oltean <vladimir.oltean@nxp.com>
> Subject: Re: [PATCH net-next] dpaa2-eth: Don't use netif_receive_skb_list for
> TCP frames
> 
> From: Ioana Radulescu <ruxandra.radulescu@nxp.com>
> Date: Tue, 23 Jul 2019 20:28:43 +0300
> 
> > Using Rx skb bulking for all frames may negatively impact the
> > performance in some TCP termination scenarios, as it effectively
> > bypasses GRO.
> 
> "may"?
> 
> Please provide numbers so that we know exactly whether it actually
> hurts performance one way or another.

We observed the worst degradation running netperf TCP_STREAM on a
setup with two 16cores LX2160A boards connected back-to-back, one
stream per cpu:
With netif_receive_skb_list() on all packets, we get a total bandwidth
of 41.6Gbps, with rx cpu load of 97%; after applying current patch, bw
increases to 45.8Gbps with rx cpu at 64%, which is similar to what we
got before using netif_receive_skb_list() in the first place.

On other platforms/setups the impact is lower, in some cases there is
no difference in throughput, only higher cpu load when skb batching
is used for TCP packets.

Anyway, based on feedback so far I guess the best path forward is to
withdraw this patch and wait for Edward's GRO batching work to be
accepted. I can help test those patches if needed.

Thanks,
Ioana

^ permalink raw reply

* Re: [PATCH net-next v2 0/8] Use dev_get_drvdata where possible
From: Chuhong Yuan @ 2019-07-24  8:13 UTC (permalink / raw)
  Cc: Steffen Klassert, David S . Miller, Jay Cliburn, Chris Snook,
	Rasesh Mody, Michael Chan, Siva Reddy Kallam, Prashant Sreedharan,
	GR-Linux-NIC-Dev, Jeff Kirsher, Guo-Fu Tseng, intel-wired-lan,
	netdev, linux-kernel
In-Reply-To: <20190724060512.23899-1-hslester96@gmail.com>

On Wed, Jul 24, 2019 at 2:05 PM Chuhong Yuan <hslester96@gmail.com> wrote:
>
> These patches use dev_get_drvdata instead of
> using to_pci_dev + pci_get_drvdata to make
> code simpler where possible.
>
> Changelog:
>
> v1 -> v2:
> - Change pci_set_drvdata to dev_set_drvdata
>   to keep consistency.
>

Hi all,
I checked the cases which mentioned the consistency
of get/set_drvdata usages.
The cases' commit IDs are
488d040e3a3452a0dceef5d3ec4f61942262f57f
b77c98780e682fe780d899b91543769d4cf94585

After checking, I think that the consistency problem
refers to inconsistency between probe and remove.
But the changes of these patches are not related
to probe and remove.

So I think the previously sent and applied v1 patches
which do not change pci_set_drvdata to dev_set_drvdata
are okay.
Therefore there may be no need to use these v2 patches.

Regards,
Chuhong


> Chuhong Yuan (8):
>   net: 3com: 3c59x: Use dev_get_drvdata
>   net: atheros: Use dev_get_drvdata
>   net: broadcom: Use dev_get_drvdata
>   e1000e: Use dev_get_drvdata where possible
>   fm10k: Use dev_get_drvdata
>   i40e: Use dev_get_drvdata
>   igb: Use dev_get_drvdata where possible
>   net: jme: Use dev_get_drvdata
>
>  drivers/net/ethernet/3com/3c59x.c               |  8 +++-----
>  drivers/net/ethernet/atheros/alx/main.c         |  8 +++-----
>  drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 10 ++++------
>  drivers/net/ethernet/atheros/atlx/atl1.c        |  8 +++-----
>  drivers/net/ethernet/broadcom/bnx2.c            |  8 +++-----
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  8 +++-----
>  drivers/net/ethernet/broadcom/tg3.c             |  8 +++-----
>  drivers/net/ethernet/intel/e1000e/netdev.c      |  9 ++++-----
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c    |  6 +++---
>  drivers/net/ethernet/intel/i40e/i40e_main.c     | 10 ++++------
>  drivers/net/ethernet/intel/igb/igb_main.c       |  5 ++---
>  drivers/net/ethernet/jme.c                      |  8 +++-----
>  12 files changed, 38 insertions(+), 58 deletions(-)
>
> --
> 2.20.1
>

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] drop_monitor: Capture dropped packets and metadata
From: Ido Schimmel @ 2019-07-24  8:10 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, davem, nhorman, dsahern, roopa, nikolay, jakub.kicinski,
	andy, f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <87ftmw3f9m.fsf@toke.dk>

On Tue, Jul 23, 2019 at 06:08:21PM +0200, Toke Høiland-Jørgensen wrote:
> Also, presumably the queue will have to change from a struct
> sk_buff_head to something that can hold XDP frames and whatever devlink
> puts there as well, right?

Good point!

For HW drops we get an SKB and relevant metadata from devlink about why
the packet was dropped etc. I plan to store a pointer to this metadata
in the SKB control block.

Let me see how the implementation goes. Even if I use sk_buff_head for
now, I will make sure that converting to a more generalized data
structure is straightforward.

^ permalink raw reply

* Re: [PATCH 07/12] vhost-scsi: convert put_page() to put_user_page*()
From: Michael S. Tsirkin @ 2019-07-24  8:07 UTC (permalink / raw)
  To: john.hubbard
  Cc: Andrew Morton, Alexander Viro, Anna Schumaker, David S . Miller,
	Dominique Martinet, Eric Van Hensbergen, Jason Gunthorpe,
	Jason Wang, Jens Axboe, Latchesar Ionkov, Miklos Szeredi,
	Trond Myklebust, Christoph Hellwig, Matthew Wilcox, linux-mm,
	LKML, ceph-devel, kvm, linux-block, linux-cifs, linux-fsdevel,
	linux-nfs, linux-rdma, netdev, samba-technical, v9fs-developer,
	virtualization, Jérôme Glisse, John Hubbard, Jan Kara,
	Dan Williams, Johannes Thumshirn, Ming Lei, Dave Chinner,
	Boaz Harrosh, Paolo Bonzini, Stefan Hajnoczi
In-Reply-To: <20190724042518.14363-8-jhubbard@nvidia.com>

On Tue, Jul 23, 2019 at 09:25:13PM -0700, john.hubbard@gmail.com wrote:
> From: Jérôme Glisse <jglisse@redhat.com>
> 
> For pages that were retained via get_user_pages*(), release those pages
> via the new put_user_page*() routines, instead of via put_page().
> 
> This is part of a tree-wide conversion, as described in commit fc1d8e7cca2d
> ("mm: introduce put_user_page*(), placeholder versions").
> 
> Changes from Jérôme's original patch:
> 
> * Changed a WARN_ON to a BUG_ON.
> 
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> Cc: virtualization@lists.linux-foundation.org
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-block@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: Jan Kara <jack@suse.cz>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Johannes Thumshirn <jthumshirn@suse.de>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Boaz Harrosh <boaz@plexistor.com>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  drivers/vhost/scsi.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index a9caf1bc3c3e..282565ab5e3f 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -329,11 +329,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
>  
>  	if (tv_cmd->tvc_sgl_count) {
>  		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
> -			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +			put_user_page(sg_page(&tv_cmd->tvc_sgl[i]));
>  	}
>  	if (tv_cmd->tvc_prot_sgl_count) {
>  		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
> -			put_page(sg_page(&tv_cmd->tvc_prot_sgl[i]));
> +			put_user_page(sg_page(&tv_cmd->tvc_prot_sgl[i]));
>  	}
>  
>  	vhost_scsi_put_inflight(tv_cmd->inflight);
> @@ -630,6 +630,13 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
>  	size_t offset;
>  	unsigned int npages = 0;
>  
> +	/*
> +	 * Here in all cases we should have an IOVEC which use GUP. If that is
> +	 * not the case then we will wrongly call put_user_page() and the page
> +	 * refcount will go wrong (this is in vhost_scsi_release_cmd())
> +	 */
> +	WARN_ON(!iov_iter_get_pages_use_gup(iter));
> +
>  	bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
>  				VHOST_SCSI_PREALLOC_UPAGES, &offset);
>  	/* No pages were pinned */
> @@ -681,7 +688,7 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
>  			while (p < sg) {
>  				struct page *page = sg_page(p++);
>  				if (page)
> -					put_page(page);
> +					put_user_page(page);
>  			}
>  			return ret;
>  		}
> -- 
> 2.22.0

^ permalink raw reply

* Re: Reminder: 3 open syzbot bugs in "net/ax25" subsystem
From: Dmitry Vyukov @ 2019-07-24  8:01 UTC (permalink / raw)
  To: Joe; +Cc: linux-hams, netdev, Ralf Baechle, David S. Miller, LKML,
	syzkaller-bugs
In-Reply-To: <623ff230-5883-560c-22d0-3e6b0eeaba39@mcn.org>

On Wed, Jul 24, 2019 at 5:42 AM Joe <joego@mcn.org> wrote:
>
> Hi Eric, How do I get off of this thread? When I try to unsubscribe it
> tells me I'm not a member of the group.

Hi Joe,

If you received it via netdev or linux-hams mailing lists, here are
instructions on how to unsubscribe:
http://vger.kernel.org/vger-lists.html#netdev
http://vger.kernel.org/vger-lists.html#linux-hams



> On 7/23/19 7:40 PM, Eric Biggers wrote:
> > [This email was generated by a script.  Let me know if you have any suggestions
> > to make it better, or if you want it re-generated with the latest status.]
> >
> > Of the currently open syzbot reports against the upstream kernel, I've manually
> > marked 3 of them as possibly being bugs in the "net/ax25" subsystem.  I've
> > listed these reports below, sorted by an algorithm that tries to list first the
> > reports most likely to be still valid, important, and actionable.
> >
> > If you believe a bug is no longer valid, please close the syzbot report by
> > sending a '#syz fix', '#syz dup', or '#syz invalid' command in reply to the
> > original thread, as explained at https://goo.gl/tpsmEJ#status
> >
> > If you believe I misattributed a bug to the "net/ax25" subsystem, please let me
> > know, and if possible forward the report to the correct people or mailing list.
> >
> > Here are the bugs:
> >
> > --------------------------------------------------------------------------------
> > Title:              general protection fault in ax25_send_frame
> > Last occurred:      0 days ago
> > Reported:           204 days ago
> > Branches:           Mainline and others
> > Dashboard link:     https://syzkaller.appspot.com/bug?id=1cdd5b120f129364fc8e9b2b027826cf99fa696e
> > Original thread:    https://lkml.kernel.org/lkml/0000000000009ea37c057e58d787@google.com/T/#u
> >
> > Unfortunately, this bug does not have a reproducer.
> >
> > No one replied to the original thread for this bug.
> >
> > If you fix this bug, please add the following tag to the commit:
> >      Reported-by: syzbot+e0b81535a27b8be39502@syzkaller.appspotmail.com
> >
> > If you send any email or patch for this bug, please consider replying to the
> > original thread.  For the git send-email command to use, or tips on how to reply
> > if the thread isn't in your mailbox, see the "Reply instructions" at
> > https://lkml.kernel.org/r/0000000000009ea37c057e58d787@google.com
> >
> > --------------------------------------------------------------------------------
> > Title:              KASAN: stack-out-of-bounds Write in ax25_getname
> > Last occurred:      90 days ago
> > Reported:           206 days ago
> > Branches:           Mainline and others
> > Dashboard link:     https://syzkaller.appspot.com/bug?id=fb195f91dc044978c1b186f1288b1eff61edcc20
> > Original thread:    https://lkml.kernel.org/lkml/000000000000ed4120057e2df0c6@google.com/T/#u
> >
> > This bug has a syzkaller reproducer only.
> >
> > No one replied to the original thread for this bug.
> >
> > If you fix this bug, please add the following tag to the commit:
> >      Reported-by: syzbot+6a29097222b4d3b8617c@syzkaller.appspotmail.com
> >
> > If you send any email or patch for this bug, please consider replying to the
> > original thread.  For the git send-email command to use, or tips on how to reply
> > if the thread isn't in your mailbox, see the "Reply instructions" at
> > https://lkml.kernel.org/r/000000000000ed4120057e2df0c6@google.com
> >
> > --------------------------------------------------------------------------------
> > Title:              inconsistent lock state in ax25_std_heartbeat_expiry
> > Last occurred:      122 days ago
> > Reported:           120 days ago
> > Branches:           net
> > Dashboard link:     https://syzkaller.appspot.com/bug?id=9086a8eac930890b2730d6441093bd478e32913f
> > Original thread:    https://lkml.kernel.org/lkml/0000000000001b07250584efbee3@google.com/T/#u
> >
> > Unfortunately, this bug does not have a reproducer.
> >
> > The original thread for this bug received 2 replies; the last was 119 days ago.
> >
> > If you fix this bug, please add the following tag to the commit:
> >      Reported-by: syzbot+e350b81e95a6a214da8a@syzkaller.appspotmail.com
> >
> > If you send any email or patch for this bug, please consider replying to the
> > original thread.  For the git send-email command to use, or tips on how to reply
> > if the thread isn't in your mailbox, see the "Reply instructions" at
> > https://lkml.kernel.org/r/0000000000001b07250584efbee3@google.com
> >
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/623ff230-5883-560c-22d0-3e6b0eeaba39%40mcn.org.

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] drop_monitor: Capture dropped packets and metadata
From: Ido Schimmel @ 2019-07-24  7:57 UTC (permalink / raw)
  To: David Ahern
  Cc: Toke Høiland-Jørgensen, netdev, davem, nhorman, roopa,
	nikolay, jakub.kicinski, andy, f.fainelli, andrew, vivien.didelot,
	mlxsw, Ido Schimmel
In-Reply-To: <c02f9b6a-f343-89ee-1047-79c1fb4e3436@gmail.com>

On Tue, Jul 23, 2019 at 08:47:57AM -0700, David Ahern wrote:
> On 7/23/19 8:14 AM, Ido Schimmel wrote:
> > On Tue, Jul 23, 2019 at 02:17:49PM +0200, Toke Høiland-Jørgensen wrote:
> >> Ido Schimmel <idosch@idosch.org> writes:
> >>
> >>> On Mon, Jul 22, 2019 at 09:43:15PM +0200, Toke Høiland-Jørgensen wrote:
> >>>> Is there a mechanism for the user to filter the packets before they are
> >>>> sent to userspace? A bpf filter would be the obvious choice I guess...
> >>>
> >>> Hi Toke,
> >>>
> >>> Yes, it's on my TODO list to write an eBPF program that only lets
> >>> "unique" packets to be enqueued on the netlink socket. Where "unique" is
> >>> defined as {5-tuple, PC}. The rest of the copies will be counted in an
> >>> eBPF map, which is just a hash table keyed by {5-tuple, PC}.
> >>
> >> Yeah, that's a good idea. Or even something simpler like tcpdump-style
> >> filters for the packets returned by drop monitor (say if I'm just trying
> >> to figure out what happens to my HTTP requests).
> > 
> > Yep, that's a good idea. I guess different users will use different
> > programs. Will look into both options.
> 
> Perhaps I am missing something, but the dropmon code only allows a
> single user at the moment (in my attempts to run 2 instances the second
> one failed).

Yes, you're correct. By "different users" I meant users on different
systems with different needs. For example, someone trying to monitor
dropped packets on a laptop versus someone trying to do the same on a
ToR switch.

> If that part stays with the design

This stays the same.

> it affords better options for the design. e.g., attributes that control
> the enqueued packets when the event occurs as opposed to bpf filters
> which run much later when the message is enqueued to the socket.

I'm going to add an attribute that will control the number of packets
we're enqueuing on the per-CPU drop list. I'm not sure, but are you
suggesting to add even more attributes? If so, how do you imagine these
will look like?

> 
> > 
> >>> I think it would be good to have the program as part of the bcc
> >>> repository [1]. What do you think?
> >>
> >> Sure. We could also add it to the XDP tutorial[2]; it could go into a
> >> section on introspection and debugging (just added a TODO about that[3]).
> > 
> > Great!
> > 
> >>>> For integrating with XDP the trick would be to find a way to do it that
> >>>> doesn't incur any overhead when it's not enabled. Are you envisioning
> >>>> that this would be enabled separately for the different "modes" (kernel,
> >>>> hardware, XDP, etc)?
> >>>
> >>> Yes. Drop monitor has commands to enable and disable tracing, but they
> >>> don't carry any attributes at the moment. My plan is to add an attribute
> >>> (e.g., 'NET_DM_ATTR_DROP_TYPE') that will specify the type of drops
> >>> you're interested in - SW/HW/XDP. If the attribute is not specified,
> >>> then current behavior is maintained and all the drop types are traced.
> >>> But if you're only interested in SW drops, then overhead for the rest
> >>> should be zero.
> >>
> >> Makes sense (although "should be" is the key here ;)).
> 
> static_key is used in other parts of the packet fast path.
> 
> Toke/Jesper: Any reason to believe it is too much overhead for this path?
> 
> >>
> >> I'm also worried about the drop monitor getting overwhelmed; if you turn
> >> it on for XDP and you're running a filtering program there, you'll
> >> suddenly get *a lot* of drops.
> >>
> >> As I read your patch, the current code can basically queue up an
> >> unbounded number of packets waiting to go out over netlink, can't it?
> > 
> > That's a very good point. Each CPU holds a drop list. It probably makes
> > sense to limit it by default (to 1000?) and allow user to change it
> > later, if needed. I can expose a counter that shows how many packets
> > were dropped because of this limit. It can be used as an indication to
> > adjust the queue length (or flip to 'summary' mode).
> > 
> 
> And then with a single user limit, you can have an attribute that
> controls the backlog.

Yep, already on my list of changes for v1 :)

Thanks, David.

^ permalink raw reply

* Re: [PATCH] net: sctp: fix memory leak in sctp_send_reset_streams
From: Xin Long @ 2019-07-24  7:56 UTC (permalink / raw)
  To: Neil Horman
  Cc: Hillf Danton, linux-sctp, network dev, syzkaller,
	David S . Miller, LKML, syzkaller-bugs, syzbot,
	Marcelo Ricardo Leitner, Vlad Yasevich, Eric Dumazet
In-Reply-To: <CADvbK_dUDjK3UAF49uo+DZv+QiuEsaMmZeqDwBJ0suRwu4yXJw@mail.gmail.com>

On Sun, Jun 2, 2019 at 9:36 PM Xin Long <lucien.xin@gmail.com> wrote:
>
> On Sun, Jun 2, 2019 at 6:52 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Sun, Jun 02, 2019 at 11:44:29AM +0800, Hillf Danton wrote:
> > >
> > > syzbot found the following crash on:
> > >
> > > HEAD commit:    036e3431 Merge git://git.kernel.org/pub/scm/linux/kernel/g..
> > > git tree:       upstream
> > > console output: https://syzkaller.appspot.com/x/log.txt?x=153cff12a00000
> > > kernel config:  https://syzkaller.appspot.com/x/.config?x=8f0f63a62bb5b13c
> > > dashboard link: https://syzkaller.appspot.com/bug?extid=6ad9c3bd0a218a2ab41d
> > > compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
> > > syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=12561c86a00000
> > > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15b76fd8a00000
> > >
> > > executing program
> > > executing program
> > > executing program
> > > executing program
> > > executing program
> > > BUG: memory leak
> > > unreferenced object 0xffff888123894820 (size 32):
> > >   comm "syz-executor045", pid 7267, jiffies 4294943559 (age 13.660s)
> > >   hex dump (first 32 bytes):
> > >     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
> > >     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
> > >   backtrace:
> > >     [<00000000c7e71c69>] kmemleak_alloc_recursive
> > > include/linux/kmemleak.h:55 [inline]
> > >     [<00000000c7e71c69>] slab_post_alloc_hook mm/slab.h:439 [inline]
> > >     [<00000000c7e71c69>] slab_alloc mm/slab.c:3326 [inline]
> > >     [<00000000c7e71c69>] __do_kmalloc mm/slab.c:3658 [inline]
> > >     [<00000000c7e71c69>] __kmalloc+0x161/0x2c0 mm/slab.c:3669
> > >     [<000000003250ed8e>] kmalloc_array include/linux/slab.h:670 [inline]
> > >     [<000000003250ed8e>] kcalloc include/linux/slab.h:681 [inline]
> > >     [<000000003250ed8e>] sctp_send_reset_streams+0x1ab/0x5a0 net/sctp/stream.c:302
> > >     [<00000000cd899c6e>] sctp_setsockopt_reset_streams net/sctp/socket.c:4314 [inline]
> > >     [<00000000cd899c6e>] sctp_setsockopt net/sctp/socket.c:4765 [inline]
> > >     [<00000000cd899c6e>] sctp_setsockopt+0xc23/0x2bf0 net/sctp/socket.c:4608
> > >     [<00000000ff3a21a2>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130
> > >     [<000000009eb87ae7>] __sys_setsockopt+0x98/0x120 net/socket.c:2078
> > >     [<00000000e0ede6ca>] __do_sys_setsockopt net/socket.c:2089 [inline]
> > >     [<00000000e0ede6ca>] __se_sys_setsockopt net/socket.c:2086 [inline]
> > >     [<00000000e0ede6ca>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086
> > >     [<00000000c61155f5>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
> > >     [<00000000e540958c>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > >
> > >
> > > It was introduced in commit d570a59c5b5f ("sctp: only allow the out stream
> > > > reset when the stream outq is empty"), in order to check stream outqs before
> > > sending SCTP_STRRESET_IN_PROGRESS back to the peer of the stream. EAGAIN is
> > > returned, however, without the nstr_list slab released, if any outq is found
> > > to be non empty.
> > >
> > > Freeing the slab in question before bailing out fixes it.
> > >
> > > Fixes: d570a59c5b5f ("sctp: only allow the out stream reset when the stream outq is empty")
> > > Reported-by: syzbot <syzbot+6ad9c3bd0a218a2ab41d@syzkaller.appspotmail.com>
> > > Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > Tested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > Cc: Xin Long <lucien.xin@gmail.com>
> > > Cc: Neil Horman <nhorman@tuxdriver.com>
> > > Cc: Vlad Yasevich <vyasevich@gmail.com>
> > > Cc: Eric Dumazet <edumazet@google.com>
> > > Signed-off-by: Hillf Danton <hdanton@sina.com>
> > > ---
> > > net/sctp/stream.c | 1 +
> > > 1 file changed, 1 insertion(+)
> > >
> > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > index 93ed078..d3e2f03 100644
> > > --- a/net/sctp/stream.c
> > > +++ b/net/sctp/stream.c
> > > @@ -310,6 +310,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
> > >
> > >       if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) {
> > >               retval = -EAGAIN;
> > > +             kfree(nstr_list);
> > >               goto out;
> > >       }
> > >
> > > --
> > >
> > >
> > Acked-by: Neil Horman <nhorman@tuxdriver.com>
> Reviewed-by: Xin Long <lucien.xin@gmail.com>
This fix is not applied, pls resend it with:
to = network dev <netdev@vger.kernel.org>
cc = davem@davemloft.net
to = linux-sctp@vger.kernel.org
cc = Neil Horman <nhorman@tuxdriver.com>
cc = Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

^ permalink raw reply

* Re: [RFC PATCH 1/2] gianfar: convert to phylink
From: Arseny Solokha @ 2019-07-24  7:36 UTC (permalink / raw)
  To: Claudiu Manoil
  Cc: Ioana Ciornei, Russell King, Andrew Lunn, netdev@vger.kernel.org
In-Reply-To: <VI1PR04MB48809AFBB9DF01001AA5E2CA96C70@VI1PR04MB4880.eurprd04.prod.outlook.com>

>>-----Original Message-----
>>From: Arseny Solokha <asolokha@kb.kras.ru>
>>Sent: Tuesday, July 23, 2019 6:17 PM
>>To: Claudiu Manoil <claudiu.manoil@nxp.com>; Ioana Ciornei
>><ioana.ciornei@nxp.com>; Russell King <linux@armlinux.org.uk>; Andrew Lunn
>><andrew@lunn.ch>
>>Cc: netdev@vger.kernel.org; Arseny Solokha <asolokha@kb.kras.ru>
>>Subject: [RFC PATCH 1/2] gianfar: convert to phylink
>>
>>Convert gianfar to use the phylink API for better SFP modules support.
>>
>>The driver still uses phylib for serdes configuration over the TBI
>>interface, as there seems to be no functionally equivalent API present
>>in phylink (yet). phylib usage is basically confined in two functions.
>>
>
> Thanks for your patch.  Phylink in gianfar... that would be something!
> At first glance a lot of code has changed with this patch or got relocated.
> To make it easier to swallow, I think a few cleanup patches could be
> separated before migrating to phylink.  Like for instance getting rid of the
> old* link state variables, which I think are an artifact from early phylib usage.
> Nonetheless good to see this implemented, I'll have a closer look asap.

Hi,

meanwhile I'll have to post v2 of this patch because it has some issues which
initially escaped my attention. For now I'm pasting the diff against v1 here for
reference:

--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1666,7 +1666,7 @@ static int gfar_suspend(struct device *dev)
 		gfar_start_wol_filer(priv);
 
 	} else {
-		phylink_stop(phy->phylink);
+		phylink_stop(priv->phylink);
 	}
 
 	priv->speed = SPEED_UNKNOWN;
@@ -3699,9 +3699,6 @@ static void gfar_mac_config(struct phylink_config *config, unsigned int mode,
 	if (unlikely(test_bit(GFAR_RESETTING, &priv->state)))
 		return;
 
-	if (unlikely(phylink_autoneg_inband(mode)))
-		return;
-
 	maccfg1 = gfar_read(&regs->maccfg1);
 	maccfg2 = gfar_read(&regs->maccfg2);
 	ecntrl = gfar_read(&regs->ecntrl);

The first hunk here fixes a typo which broke build with PM enabled. The second
one removes an early return from gfar_mac_config() which I believe is really
bogus and also breaks coalesce parameters calculation for SGMII and 1000Base-X
attached PHYs.

I'd like to submit a real v2 after the patches get actual review, though.

Thanks,
Arseny

^ permalink raw reply

* Re: [PATCH net-next 1/4] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx
From: Xin Long @ 2019-07-24  7:21 UTC (permalink / raw)
  To: Neil Horman; +Cc: network dev, linux-sctp, Marcelo Ricardo Leitner, davem
In-Reply-To: <20190723152449.GB8419@localhost.localdomain>

On Tue, Jul 23, 2019 at 11:25 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Tue, Jul 23, 2019 at 01:37:57AM +0800, Xin Long wrote:
> > Now __sctp_connect() is called by __sctp_setsockopt_connectx() and
> > sctp_inet_connect(), the latter has done addr_size check with size
> > of sa_family_t.
> >
> > In the next patch to clean up __sctp_connect(), we will remove
> > addr_size check with size of sa_family_t from __sctp_connect()
> > for the 1st address.
> >
> > So before doing that, __sctp_setsockopt_connectx() should do
> > this check first, as sctp_inet_connect() does.
> >
> > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > ---
> >  net/sctp/socket.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index aa80cda..5f92e4a 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -1311,7 +1311,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
> >       pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
> >                __func__, sk, addrs, addrs_size);
> >
> > -     if (unlikely(addrs_size <= 0))
> > +     if (unlikely(addrs_size < sizeof(sa_family_t)))
> I don't think this is what you want to check for here.  sa_family_t is
> an unsigned short, and addrs_size is the number of bytes in the addrs
> array.  The addrs array should be at least the size of one struct
> sockaddr (16 bytes iirc), and, if larger, should be a multiple of
> sizeof(struct sockaddr)
sizeof(struct sockaddr) is not the right value to check either.

The proper check will be done later in __sctp_connect():

        af = sctp_get_af_specific(daddr->sa.sa_family);
        if (!af || af->sockaddr_len > addrs_size)
                return -EINVAL;

So the check 'addrs_size < sizeof(sa_family_t)' in this patch is
just to make sure daddr->sa.sa_family is accessible. the same
check is also done in sctp_inet_connect().

>
> Neil
>
> >               return -EINVAL;
> >
> >       kaddrs = memdup_user(addrs, addrs_size);
> > --
> > 2.1.0
> >
> >

^ permalink raw reply

* Re: [PATCH net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames
From: Eric Dumazet @ 2019-07-24  7:10 UTC (permalink / raw)
  To: Ioana Radulescu, netdev, davem; +Cc: ioana.ciornei, vladimir.oltean
In-Reply-To: <1563902923-26178-1-git-send-email-ruxandra.radulescu@nxp.com>



On 7/23/19 7:28 PM, Ioana Radulescu wrote:
> Using Rx skb bulking for all frames may negatively impact the
> performance in some TCP termination scenarios, as it effectively
> bypasses GRO.

>  
> -	list_add_tail(&skb->list, ch->rx_list);
> +	if (frame_is_tcp(fd, fas))
> +		napi_gro_receive(&ch->napi, skb);
> +	else
> +		list_add_tail(&skb->list, ch->rx_list);
>  
>  	return;
>  


This is really bad.

This is exactly why I suggested to add the batching capability to GRO,
instead of having to change all drivers.

Edward Cree is working on this.



^ permalink raw reply

* Re: [RFC PATCH net-next 10/12] drop_monitor: Add packet alert mode
From: Ido Schimmel @ 2019-07-24  7:10 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, davem, dsahern, roopa, nikolay, jakub.kicinski, toke,
	andy, f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190723151431.GA8419@localhost.localdomain>

On Tue, Jul 23, 2019 at 11:14:31AM -0400, Neil Horman wrote:
> On Tue, Jul 23, 2019 at 05:16:25PM +0300, Ido Schimmel wrote:
> > On Tue, Jul 23, 2019 at 08:43:40AM -0400, Neil Horman wrote:
> > > On Mon, Jul 22, 2019 at 09:31:32PM +0300, Ido Schimmel wrote:
> > > > +static void net_dm_packet_work(struct work_struct *work)
> > > > +{
> > > > +	struct per_cpu_dm_data *data;
> > > > +	struct sk_buff_head list;
> > > > +	struct sk_buff *skb;
> > > > +	unsigned long flags;
> > > > +
> > > > +	data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
> > > > +
> > > > +	__skb_queue_head_init(&list);
> > > > +
> > > > +	spin_lock_irqsave(&data->drop_queue.lock, flags);
> > > > +	skb_queue_splice_tail_init(&data->drop_queue, &list);
> > > > +	spin_unlock_irqrestore(&data->drop_queue.lock, flags);
> > > > +
> > > > These functions are all executed in a per-cpu context.  While there's nothing
> > > wrong with using a spinlock here, I think you can get away with just doing
> > > local_irqsave and local_irq_restore.
> > 
> > Hi Neil,
> > 
> > Thanks a lot for reviewing. I might be missing something, but please
> > note that this function is executed from a workqueue and therefore the
> > CPU it is running on does not have to be the same CPU to which 'data'
> > belongs to. If so, I'm not sure how I can avoid taking the spinlock, as
> > otherwise two different CPUs can modify the list concurrently.
> > 
> Ah, my bad, I was under the impression that the schedule_work call for
> that particular work queue was actually a call to schedule_work_on,
> which would have affined it to a specific cpu.  That said, looking at
> it, I think using schedule_work_on was my initial intent, as the work
> queue is registered per cpu.  And converting it to schedule_work_on
> would allow you to reduce the spin_lock to a faster local_irqsave

Yes, this can work, but I'm not sure we can justify it. The CPU that is
dropping packets is potentially very busy processing all incoming
packets and with schedule_work_on() we force the same CPU to be used to
allocate and prepare the netlink messages. With schedule_work() the
system can choose an idle CPU and better utilize system resources. Also,
the scope of the lock is very limited and it is only ever contended by
at most two CPUs: The CPU the list belongs to and the CPU executing the
work item.

I will limit the number of skbs we can enqueue, add a counter to see how
many packets we tail drop and benchmark both approaches.

Thanks!

^ permalink raw reply

* Re: [PATCH v12 1/5] can: m_can: Create a m_can platform framework
From: Greg KH @ 2019-07-24  6:47 UTC (permalink / raw)
  To: Dan Murphy; +Cc: wg, mkl, davem, linux-can, netdev, linux-kernel
In-Reply-To: <437b6371-8488-a0ff-fa68-d1fb5a81bb8b@ti.com>

On Tue, Jul 23, 2019 at 10:14:14AM -0500, Dan Murphy wrote:
> Hello
> 
> On 7/10/19 7:08 AM, Dan Murphy wrote:
> > Hello
> > 
> > On 6/17/19 10:09 AM, Dan Murphy wrote:
> > > Marc
> > > 
> > > On 6/10/19 11:35 AM, Dan Murphy wrote:
> > > > Bump
> > > > 
> > > > On 6/6/19 8:16 AM, Dan Murphy wrote:
> > > > > Marc
> > > > > 
> > > > > Bump
> > > > > 
> > > > > On 5/31/19 6:51 AM, Dan Murphy wrote:
> > > > > > Marc
> > > > > > 
> > > > > > On 5/15/19 3:54 PM, Dan Murphy wrote:
> > > > > > > Marc
> > > > > > > 
> > > > > > > On 5/9/19 11:11 AM, Dan Murphy wrote:
> > > > > > > > Create a m_can platform framework that peripheral
> > > > > > > > devices can register to and use common code and register sets.
> > > > > > > > The peripheral devices may provide read/write and configuration
> > > > > > > > support of the IP.
> > > > > > > > 
> > > > > > > > Acked-by: Wolfgang Grandegger <wg@grandegger.com>
> > > > > > > > Signed-off-by: Dan Murphy <dmurphy@ti.com>
> > > > > > > > ---
> > > > > > > > 
> > > > > > > > v12 - Update the m_can_read/write functions to
> > > > > > > > create a backtrace if the callback
> > > > > > > > pointer is NULL. - https://lore.kernel.org/patchwork/patch/1052302/
> > > > > > > > 
> > > > > > > Is this able to be merged now?
> > > > > > 
> > > > > > ping
> > > 
> > > Wondering if there is anything else we need to do?
> > > 
> > > The part has officially shipped and we had hoped to have driver
> > > support in Linux as part of the announcement.
> > > 
> > Is this being sent in a PR for 5.3?
> > 
> > Dan
> > 
> Adding Greg to this thread as I have no idea what is going on with this. 

Why me?  What am I supposed to do here?  I see no patches at all to do
anything with :(

thanks,

greg "not a miracle worker" k-h

^ permalink raw reply

* Re: Reminder: 99 open syzbot bugs in net subsystem
From: Eric Dumazet @ 2019-07-24  6:39 UTC (permalink / raw)
  To: netdev, David S. Miller, Florian Westphal, Ilya Maximets,
	Eric Dumazet, David Ahern, linux-kernel, syzkaller-bugs
In-Reply-To: <20190724013813.GB643@sol.localdomain>



On 7/24/19 3:38 AM, Eric Biggers wrote:
> [This email was generated by a script.  Let me know if you have any suggestions
> to make it better, or if you want it re-generated with the latest status.]
> 
> Of the currently open syzbot reports against the upstream kernel, I've manually
> marked 99 of them as possibly being bugs in the net subsystem.  This category
> only includes the networking bugs that I couldn't assign to a more specific
> component (bpf, xfrm, bluetooth, tls, tipc, sctp, wireless, etc.).  I've listed
> these reports below, sorted by an algorithm that tries to list first the reports
> most likely to be still valid, important, and actionable.
> 
> Of these 99 bugs, 17 were seen in mainline in the last week.
> 
> Of these 99 bugs, 4 were bisected to commits from the following people:
> 
> 	Florian Westphal <fw@strlen.de>
> 	Ilya Maximets <i.maximets@samsung.com>
> 	Eric Dumazet <edumazet@google.com>
> 	David Ahern <dsahern@gmail.com>
> 
> If you believe a bug is no longer valid, please close the syzbot report by
> sending a '#syz fix', '#syz dup', or '#syz invalid' command in reply to the
> original thread, as explained at https://goo.gl/tpsmEJ#status
> 
> If you believe I misattributed a bug to the net subsystem, please let me know,
> and if possible forward the report to the correct people or mailing list.
>

Some of the bugs have been fixed already, before syzbot found them.

Why force humans to be gentle to bots and actually reply to them?

I usually simply wait until syzbot finds that the bug does not repro anymore,
but now if you send these emails, we will have even more pressure on us.



^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Song Liu @ 2019-07-24  6:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Kees Cook, linux-security@vger.kernel.org,
	Networking, bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API
In-Reply-To: <1DE886F3-3982-45DE-B545-67AD6A4871AB@amacapital.net>



> On Jul 23, 2019, at 6:40 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> 
> 
> 
>> On Jul 23, 2019, at 3:56 PM, Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jul 23, 2019, at 8:11 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>> On Mon, Jul 22, 2019 at 1:54 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> Hi Andy, Lorenz, and all,
>>>> 
>>>>> On Jul 2, 2019, at 2:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> 
>>>>> On Tue, Jul 2, 2019 at 2:04 PM Kees Cook <keescook@chromium.org> wrote:
>>>>>> 
>>>>>>> On Mon, Jul 01, 2019 at 06:59:13PM -0700, Andy Lutomirski wrote:
>>>>>>> I think I'm understanding your motivation.  You're not trying to make
>>>>>>> bpf() generically usable without privilege -- you're trying to create
>>>>>>> a way to allow certain users to access dangerous bpf functionality
>>>>>>> within some limits.
>>>>>>> 
>>>>>>> That's a perfectly fine goal, but I think you're reinventing the
>>>>>>> wheel, and the wheel you're reinventing is quite complicated and
>>>>>>> already exists.  I think you should teach bpftool to be secure when
>>>>>>> installed setuid root or with fscaps enabled and put your policy in
>>>>>>> bpftool.  If you want to harden this a little bit, it would seem
>>>>>>> entirely reasonable to add a new CAP_BPF_ADMIN and change some, but
>>>>>>> not all, of the capable() checks to check CAP_BPF_ADMIN instead of the
>>>>>>> capabilities that they currently check.
>>>>>> 
>>>>>> If finer grained controls are wanted, it does seem like the /dev/bpf
>>>>>> path makes the most sense. open, request abilities, use fd. The open can
>>>>>> be mediated by DAC and LSM. The request can be mediated by LSM. This
>>>>>> provides a way to add policy at the LSM level and at the tool level.
>>>>>> (i.e. For tool-level controls: leave LSM wide open, make /dev/bpf owned
>>>>>> by "bpfadmin" and bpftool becomes setuid "bpfadmin". For fine-grained
>>>>>> controls, leave /dev/bpf wide open and add policy to SELinux, etc.)
>>>>>> 
>>>>>> With only a new CAP, you don't get the fine-grained controls. (The
>>>>>> "request abilities" part is the key there.)
>>>>> 
>>>>> Sure you do: the effective set.  It has somewhat bizarre defaults, but
>>>>> I don't think that's a real problem.  Also, this wouldn't be like
>>>>> CAP_DAC_READ_SEARCH -- you can't accidentally use your BPF caps.
>>>>> 
>>>>> I think that a /dev capability-like object isn't totally nuts, but I
>>>>> think we should do it well, and this patch doesn't really achieve
>>>>> that.  But I don't think bpf wants fine-grained controls like this at
>>>>> all -- as I pointed upthread, a fine-grained solution really wants
>>>>> different treatment for the different capable() checks, and a bunch of
>>>>> them won't resemble capabilities or /dev/bpf at all.
>>>> 
>>>> With 5.3-rc1 out, I am back on this. :)
>>>> 
>>>> How about we modify the set as:
>>>> 1. Introduce sys_bpf_with_cap() that takes fd of /dev/bpf.
>>> 
>>> I'm fine with this in principle, but:
>>> 
>>>> 2. Better handling of capable() calls through bpf code. I guess the
>>>>   biggest problem here is is_priv in verifier.c:bpf_check().
>>> 
>>> I think it would be good to understand exactly what /dev/bpf will
>>> enable one to do.  Without some care, it would just become the next
>>> CAP_SYS_ADMIN: if you can open it, sure, you're not root, but you can
>>> intercept network traffic, modify cgroup behavior, and do plenty of
>>> other things, any of which can probably be used to completely take
>>> over the system.
>> 
>> Well, yes. sys_bpf() is pretty powerful. 
>> 
>> The goal of /dev/bpf is to enable special users to call sys_bpf(). In 
>> the meanwhile, such users should not take down the whole system easily
>> by accident, e.g., with rm -rf /.
> 
> That’s easy, though — bpftool could learn to read /etc/bpfusers before allowing ruid != 0.

This is a great idea! fscaps + /etc/bpfusers should do the trick. 

> 
>> 
>> It is similar to CAP_BPF_ADMIN, without really adding the CAP_.  
>> 
>> I think adding new CAP_ requires much more effort. 
>> 
> 
> A new CAP_ is straightforward — add the definition and change the max cap.
> 
>>> 
>>> It would also be nice to understand why you can't do what you need to
>>> do entirely in user code using setuid or fscaps.
>> 
>> It is not very easy to achieve the same control: only certain users can
>> run certain tools (bpftool, etc.). 
>> 
>> The closest approach I can find is:
>> 1. use libcap (pam_cap) to give CAP_SETUID to certain users;
>> 2. add setuid(0) to bpftool.
>> 
>> The difference between this approach and /dev/bpf is that certain users
>> would be able to run other tools that call setuid(). Though I am not 
>> sure how many tools call setuid(), and how risky they are. 
> 
> I think you’re misunderstanding me. Install bpftool with either the setuid (S_ISUID) mode or with an appropriate fscap bit — see the setcap(8) manpage.
> 
> The downside of this approach is that it won’t work well in a container, and containers are cool these days :)
> 
>> 
>>> 
>>> Finally, at risk of rehashing some old arguments, I'll point out that
>>> the bpf() syscall is an unusual design to begin with.  As an example,
>>> consider bpf_prog_attach().  Outside of bpf(), if I want to change the
>>> behavior of a cgroup, I would write to a file in
>>> /sys/kernel/cgroup/unified/whatever/, and normal DAC and MAC rules
>>> apply.  With bpf(), however, I just call bpf() to attach a program to
>>> the cgroup.  bpf() says "oh, you are capable(CAP_NET_ADMIN) -- go for
>>> it!".  Unless I missed something major, and I just re-read the code,
>>> there is no check that the caller has write or LSM permission to
>>> anything at all in cgroupfs, and the existing API would make it very
>>> awkward to impose any kind of DAC rules here.
>>> 
>>> So I think it might actually be time to repay some technical debt and
>>> come up with a real fix.  As a less intrusive approach, you could see
>>> about requiring ownership of the cgroup directory instead of
>>> CAP_NET_ADMIN.  As a more intrusive but perhaps better approach, you
>>> could invert the logic to make it work like everything outside of
>>> cgroup: add pseudo-files like bpf.inet_ingress to the cgroup
>>> directories, and require a writable fd to *that* to a new improved
>>> attach API.  If a user could do:
>>> 
>>> int fd = open("/sys/fs/cgroup/.../bpf.inet_attach", O_RDWR);  /* usual
>>> DAC and MAC policy applies */
>>> int bpf_fd = setup the bpf stuff;  /* no privilege required, unless
>>> the program is huge or needs is_priv */
>>> bpf(BPF_IMPROVED_ATTACH, target = fd, program = bpf_fd);
>>> 
>>> there would be no capabilities or global privilege at all required for
>>> this.  It would just work with cgroup delegation, containers, etc.
>>> 
>>> I think you could even pull off this type of API change with only
>>> libbpf changes.  In particular, there's this code:
>>> 
>>> int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
>>>                  unsigned int flags)
>>> {
>>>      union bpf_attr attr;
>>> 
>>>      memset(&attr, 0, sizeof(attr));
>>>      attr.target_fd     = target_fd;
>>>      attr.attach_bpf_fd = prog_fd;
>>>      attr.attach_type   = type;
>>>      attr.attach_flags  = flags;
>>> 
>>>      return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
>>> }
>>> 
>>> This would instead do something like:
>>> 
>>> int specific_target_fd = openat(target_fd, bpf_type_to_target[type], O_RDWR);
>>> attr.target_fd = specific_target_fd;
>>> ...
>>> 
>>> return sys_bpf(BPF_PROG_IMPROVED_ATTACH, &attr, sizeof(attr));
>>> 
>>> Would this solve your problem without needing /dev/bpf at all?
>> 
>> This gives fine grain access control. I think it solves the problem. 
>> But it also requires a lot of rework to sys_bpf(). And it may also 
>> break backward/forward compatibility?
>> 
> 
> I think the compatibility issue is manageable. The current bpf() interface would be supported for at least several years, and libbpf could detect that the new interface isn’t supported and fall back to the old interface.

You are right. New BPF_PROG_IMPROVED_ATTACH helps compatibility. 
I missed that part. 

> 
>> Personally, I think it is an overkill for the original motivation: 
>> call sys_bpf() with special user instead of root. 
> 
> It’s overkill for your specific use case, but I’m trying to encourage you to either solve your problem entirely in userspace or to solve a more general problem in the kernel :)

I do like both proposals. Thanks for these invaluable suggestions. 

> 
> In furtherance of bpf’s goal of world domination, I think it would be great if it Just Worked in a container. My proposal does this.

Let me think more about this and discuss with the team. 

Thanks again, 
Song

^ permalink raw reply

* KASAN: slab-out-of-bounds Read in bpf_int_jit_compile
From: syzbot @ 2019-07-24  6:28 UTC (permalink / raw)
  To: andriin, ast, bpf, daniel, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, yhs

Hello,

syzbot found the following crash on:

HEAD commit:    c6dd78fc Merge branch 'x86-urgent-for-linus' of git://git...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1760ff84600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=7937b718ddac333b
dashboard link: https://syzkaller.appspot.com/bug?extid=35101610ff3e83119b1b
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=13c017a4600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=173f8278600000

The bug was bisected to:

commit 2589726d12a1b12eaaa93c7f1ea64287e383c7a5
Author: Alexei Starovoitov <ast@kernel.org>
Date:   Sat Jun 15 19:12:20 2019 +0000

     bpf: introduce bounded loops

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=13df66afa00000
final crash:    https://syzkaller.appspot.com/x/report.txt?x=103f66afa00000
console output: https://syzkaller.appspot.com/x/log.txt?x=17df66afa00000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+35101610ff3e83119b1b@syzkaller.appspotmail.com
Fixes: 2589726d12a1 ("bpf: introduce bounded loops")

==================================================================
BUG: KASAN: slab-out-of-bounds in do_jit /arch/x86/net/bpf_jit_comp.c:966  
[inline]
BUG: KASAN: slab-out-of-bounds in bpf_int_jit_compile+0x4d19/0x7530  
/arch/x86/net/bpf_jit_comp.c:1132
Read of size 4 at addr ffff8880960cc2bc by task syz-executor607/7822

CPU: 0 PID: 7822 Comm: syz-executor607 Not tainted 5.2.0+ #37
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack /lib/dump_stack.c:77 [inline]
  dump_stack+0x1d8/0x2f8 /lib/dump_stack.c:113
  print_address_description+0x75/0x5b0 /mm/kasan/report.c:351
  __kasan_report+0x14b/0x1c0 /mm/kasan/report.c:482
  kasan_report+0x26/0x50 /mm/kasan/common.c:612
  __asan_report_load4_noabort+0x14/0x20 /mm/kasan/generic_report.c:131
  do_jit /arch/x86/net/bpf_jit_comp.c:966 [inline]
  bpf_int_jit_compile+0x4d19/0x7530 /arch/x86/net/bpf_jit_comp.c:1132
  bpf_prog_select_runtime+0x756/0xa50 /kernel/bpf/core.c:1725
  bpf_prog_load /kernel/bpf/syscall.c:1702 [inline]
  __do_sys_bpf+0x7d4e/0xc0e0 /kernel/bpf/syscall.c:2849
  __se_sys_bpf /kernel/bpf/syscall.c:2808 [inline]
  __x64_sys_bpf+0x7a/0x90 /kernel/bpf/syscall.c:2808
  do_syscall_64+0xfe/0x140 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4402c9
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffeb7a02c18 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004402c9
RDX: 0000000000000046 RSI: 0000000020000180 RDI: 0000000000000005
RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000
R10: 00000000ffffffff R11: 0000000000000246 R12: 0000000000401b50
R13: 0000000000401be0 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 7822:
  save_stack /mm/kasan/common.c:69 [inline]
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_kmalloc+0x11c/0x1b0 /mm/kasan/common.c:487
  kasan_kmalloc+0x9/0x10 /mm/kasan/common.c:501
  kmem_cache_alloc_trace+0x215/0x2f0 /mm/slab.c:3550
  kmalloc /./include/linux/slab.h:552 [inline]
  kzalloc /./include/linux/slab.h:748 [inline]
  bpf_int_jit_compile+0x1b2/0x7530 /arch/x86/net/bpf_jit_comp.c:1092
  bpf_prog_select_runtime+0x756/0xa50 /kernel/bpf/core.c:1725
  bpf_prog_load /kernel/bpf/syscall.c:1702 [inline]
  __do_sys_bpf+0x7d4e/0xc0e0 /kernel/bpf/syscall.c:2849
  __se_sys_bpf /kernel/bpf/syscall.c:2808 [inline]
  __x64_sys_bpf+0x7a/0x90 /kernel/bpf/syscall.c:2808
  do_syscall_64+0xfe/0x140 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 7329:
  save_stack /mm/kasan/common.c:69 [inline]
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_slab_free+0x12a/0x1e0 /mm/kasan/common.c:449
  kasan_slab_free+0xe/0x10 /mm/kasan/common.c:457
  __cache_free /mm/slab.c:3425 [inline]
  kfree+0x115/0x200 /mm/slab.c:3756
  tomoyo_path_perm+0x6cc/0x8b0 /security/tomoyo/file.c:842
  tomoyo_inode_getattr+0x1c/0x20 /security/tomoyo/tomoyo.c:129
  security_inode_getattr+0xd5/0x150 /security/security.c:1182
  vfs_getattr+0x2a/0x6d0 /fs/stat.c:115
  vfs_statx /fs/stat.c:191 [inline]
  vfs_stat /./include/linux/fs.h:3182 [inline]
  __do_sys_newstat /fs/stat.c:341 [inline]
  __se_sys_newstat+0x10c/0x210 /fs/stat.c:337
  __x64_sys_newstat+0x5b/0x70 /fs/stat.c:337
  do_syscall_64+0xfe/0x140 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8880960cc280
  which belongs to the cache kmalloc-32 of size 32
The buggy address is located 28 bytes to the right of
  32-byte region [ffff8880960cc280, ffff8880960cc2a0)
The buggy address belongs to the page:
page:ffffea0002583300 refcount:1 mapcount:0 mapping:ffff8880aa4001c0  
index:0xffff8880960ccfc1
flags: 0x1fffc0000000200(slab)
raw: 01fffc0000000200 ffffea00027f47c8 ffffea0002a53a48 ffff8880aa4001c0
raw: ffff8880960ccfc1 ffff8880960cc000 000000010000003f 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8880960cc180: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
  ffff8880960cc200: fb fb fb fb fc fc fc fc 00 00 01 fc fc fc fc fc
> ffff8880960cc280: 00 00 00 00 fc fc fc fc 00 00 fc fc fc fc fc fc
                                         ^
  ffff8880960cc300: fb fb fb fb fc fc fc fc 00 00 01 fc fc fc fc fc
  ffff8880960cc380: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [bpf-next 2/6] tcp: add skb-less helpers to retrieve SYN cookie
From: kbuild test robot @ 2019-07-24  6:19 UTC (permalink / raw)
  To: Petar Penkov
  Cc: kbuild-all, netdev, bpf, davem, ast, daniel, edumazet, lmb, sdf,
	Petar Penkov
In-Reply-To: <20190723002042.105927-3-ppenkov.kernel@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1413 bytes --]

Hi Petar,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Petar-Penkov/Introduce-a-BPF-helper-to-generate-SYN-cookies/20190723-235628
base:   https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: parisc-c8000_defconfig (attached as .config)
compiler: hppa64-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=parisc 

If you fix the issue, kindly add the following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   hppa64-linux-ld: arch/parisc/kernel/entry.o(.text.hot+0x22b8): cannot reach preempt_schedule_irq
   arch/parisc/kernel/entry.o: In function `intr_do_preempt':
   (.text.hot+0x22b8): relocation truncated to fit: R_PARISC_PCREL22F against symbol `preempt_schedule_irq' defined in .sched.text section in kernel/sched/core.o
   net/ipv4/tcp_input.o: In function `.LC240':
>> (.data.rel.ro+0x718): undefined reference to `__cookie_v6_init_sequence'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 18429 bytes --]

^ permalink raw reply

* Re: [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()
From: Christoph Hellwig @ 2019-07-24  6:17 UTC (permalink / raw)
  To: john.hubbard
  Cc: Andrew Morton, Alexander Viro, Anna Schumaker, David S . Miller,
	Dominique Martinet, Eric Van Hensbergen, Jason Gunthorpe,
	Jason Wang, Jens Axboe, Latchesar Ionkov, Michael S . Tsirkin,
	Miklos Szeredi, Trond Myklebust, Christoph Hellwig,
	Matthew Wilcox, linux-mm, LKML, ceph-devel, kvm, linux-block,
	linux-cifs, linux-fsdevel, linux-nfs, linux-rdma, netdev,
	samba-technical, v9fs-developer, virtualization, John Hubbard
In-Reply-To: <20190724042518.14363-1-jhubbard@nvidia.com>

On Tue, Jul 23, 2019 at 09:25:06PM -0700, john.hubbard@gmail.com wrote:
> * Store, in the iov_iter, a "came from gup (get_user_pages)" parameter.
>   Then, use the new iov_iter_get_pages_use_gup() to retrieve it when
>   it is time to release the pages. That allows choosing between put_page()
>   and put_user_page*().
> 
> * Pass in one more piece of information to bio_release_pages: a "from_gup"
>   parameter. Similar use as above.
> 
> * Change the block layer, and several file systems, to use
>   put_user_page*().

I think we can do this in a simpler and better way.  We have 5 ITER_*
types.  Of those ITER_DISCARD as the name suggests never uses pages, so
we can skip handling it.  ITER_PIPE is rejected in the direct I/O path,
which leaves us with three.

Out of those ITER_IOVEC needs a user page reference, so we want to call
put_user_page* on it.  ITER_BVEC always already has a page reference,
which means in the block direct I/O path we already don't take
a page reference.  We should extend that handling to all other calls
of iov_iter_get_pages / iov_iter_get_pages_alloc.  I think we should
just reject ITER_KVEC for direct I/O as well as we have no users and
it is rather pointless.  Alternatively if we see a use for it the
callers should always have a live page reference anyway (or might
be on kmalloc memory), so we really should not take a reference either.

In other words:  the only time we should ever have to put a page in
this patch is when they are user pages.  We'll need to clean up
various bits of code for that, but that can be done gradually before
even getting to the actual put_user_pages conversion.

^ permalink raw reply

* Re: [PATCH v4 1/2] rtw88: pci: Rearrange the memory usage for skb in RX ISR
From: Jian-Hong Pan @ 2019-07-24  6:13 UTC (permalink / raw)
  To: Yan-Hsuan Chuang, Kalle Valo, David S . Miller, Larry Finger,
	David Laight, Christoph Hellwig
  Cc: linux-wireless, Linux Netdev List, Linux Kernel,
	Linux Upstreaming Team, Daniel Drake, stable
In-Reply-To: <CAPpJ_edQRMiBcdB-dTxhti8nK0eX4GPRUOgimzWW1JC3ZZjRHw@mail.gmail.com>

Jian-Hong Pan <jian-hong@endlessm.com> 於 2019年7月11日 週四 下午1:28寫道：
>
> Jian-Hong Pan <jian-hong@endlessm.com> 於 2019年7月11日 週四 下午1:25寫道：
> >
> > Testing with RTL8822BE hardware, when available memory is low, we
> > frequently see a kernel panic and system freeze.
> >
> > First, rtw_pci_rx_isr encounters a memory allocation failure (trimmed):
> >
> > rx routine starvation
> > WARNING: CPU: 7 PID: 9871 at drivers/net/wireless/realtek/rtw88/pci.c:822 rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci]
> > [ 2356.580313] RIP: 0010:rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci]
> >
> > Then we see a variety of different error conditions and kernel panics,
> > such as this one (trimmed):
> >
> > rtw_pci 0000:02:00.0: pci bus timeout, check dma status
> > skbuff: skb_over_panic: text:00000000091b6e66 len:415 put:415 head:00000000d2880c6f data:000000007a02b1ea tail:0x1df end:0xc0 dev:<NULL>
> > ------------[ cut here ]------------
> > kernel BUG at net/core/skbuff.c:105!
> > invalid opcode: 0000 [#1] SMP NOPTI
> > RIP: 0010:skb_panic+0x43/0x45
> >
> > When skb allocation fails and the "rx routine starvation" is hit, the
> > function returns immediately without updating the RX ring. At this
> > point, the RX ring may continue referencing an old skb which was already
> > handed off to ieee80211_rx_irqsafe(). When it comes to be used again,
> > bad things happen.
> >
> > This patch allocates a new, data-sized skb first in RX ISR. After
> > copying the data in, we pass it to the upper layers. However, if skb
> > allocation fails, we effectively drop the frame. In both cases, the
> > original, full size ring skb is reused.
> >
> > In addition to fixing the kernel crash, the RX routine should now
> > generally behave better under low memory conditions.
> >
> > Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=204053
> > Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> > Cc: <stable@vger.kernel.org>
> > ---
>
> Sorry, I forgot to place the version difference here.
>
> v2:
>  - Allocate new data-sized skb and put data into it, then pass it to
>    mac80211. Reuse the original skb in RX ring by DMA sync.
>  - Modify the commit message.
>  - Introduce following [PATCH v3 2/2] rtw88: pci: Use DMA sync instead
>    of remapping in RX ISR.
>
> v3:
>  - Same as v2.
>
> v4:
>  - Fix comment: allocate a new skb for this frame, discard the frame
> if none available
>
> >  drivers/net/wireless/realtek/rtw88/pci.c | 49 +++++++++++-------------
> >  1 file changed, 22 insertions(+), 27 deletions(-)
> >
> > diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
> > index cfe05ba7280d..c415f5e94fed 100644
> > --- a/drivers/net/wireless/realtek/rtw88/pci.c
> > +++ b/drivers/net/wireless/realtek/rtw88/pci.c
> > @@ -763,6 +763,7 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
> >         u32 pkt_offset;
> >         u32 pkt_desc_sz = chip->rx_pkt_desc_sz;
> >         u32 buf_desc_sz = chip->rx_buf_desc_sz;
> > +       u32 new_len;
> >         u8 *rx_desc;
> >         dma_addr_t dma;
> >
> > @@ -790,40 +791,34 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
> >                 pkt_offset = pkt_desc_sz + pkt_stat.drv_info_sz +
> >                              pkt_stat.shift;
> >
> > -               if (pkt_stat.is_c2h) {
> > -                       /* keep rx_desc, halmac needs it */
> > -                       skb_put(skb, pkt_stat.pkt_len + pkt_offset);
> > +               /* allocate a new skb for this frame,
> > +                * discard the frame if none available
> > +                */
> > +               new_len = pkt_stat.pkt_len + pkt_offset;
> > +               new = dev_alloc_skb(new_len);
> > +               if (WARN_ONCE(!new, "rx routine starvation\n"))
> > +                       goto next_rp;
> > +
> > +               /* put the DMA data including rx_desc from phy to new skb */
> > +               skb_put_data(new, skb->data, new_len);
> >
> > -                       /* pass offset for further operation */
> > -                       *((u32 *)skb->cb) = pkt_offset;
> > -                       skb_queue_tail(&rtwdev->c2h_queue, skb);
> > +               if (pkt_stat.is_c2h) {
> > +                        /* pass rx_desc & offset for further operation */
> > +                       *((u32 *)new->cb) = pkt_offset;
> > +                       skb_queue_tail(&rtwdev->c2h_queue, new);
> >                         ieee80211_queue_work(rtwdev->hw, &rtwdev->c2h_work);
> >                 } else {
> > -                       /* remove rx_desc, maybe use skb_pull? */
> > -                       skb_put(skb, pkt_stat.pkt_len);
> > -                       skb_reserve(skb, pkt_offset);
> > -
> > -                       /* alloc a smaller skb to mac80211 */
> > -                       new = dev_alloc_skb(pkt_stat.pkt_len);
> > -                       if (!new) {
> > -                               new = skb;
> > -                       } else {
> > -                               skb_put_data(new, skb->data, skb->len);
> > -                               dev_kfree_skb_any(skb);
> > -                       }
> > -                       /* TODO: merge into rx.c */
> > -                       rtw_rx_stats(rtwdev, pkt_stat.vif, skb);
> > +                       /* remove rx_desc */
> > +                       skb_pull(new, pkt_offset);
> > +
> > +                       rtw_rx_stats(rtwdev, pkt_stat.vif, new);
> >                         memcpy(new->cb, &rx_status, sizeof(rx_status));
> >                         ieee80211_rx_irqsafe(rtwdev->hw, new);
> >                 }
> >
> > -               /* skb delivered to mac80211, alloc a new one in rx ring */
> > -               new = dev_alloc_skb(RTK_PCI_RX_BUF_SIZE);
> > -               if (WARN(!new, "rx routine starvation\n"))
> > -                       return;
> > -
> > -               ring->buf[cur_rp] = new;
> > -               rtw_pci_reset_rx_desc(rtwdev, new, ring, cur_rp, buf_desc_sz);
> > +next_rp:
> > +               /* new skb delivered to mac80211, re-enable original skb DMA */
> > +               rtw_pci_reset_rx_desc(rtwdev, skb, ring, cur_rp, buf_desc_sz);
> >
> >                 /* host read next element in ring */
> >                 if (++cur_rp >= ring->r.len)
> > --
> > 2.22.0
> >

Gentle ping!  Any comment for this patch set (v4) will be appreciated.

Jian-Hong Pan

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox