Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v4 2/6] net: dsa: mv88e6xxx: update code operating on hidden registers
From: Marek Behún @ 2019-08-26 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, Vladimir Oltean,
	Marek Behún
In-Reply-To: <20190826122109.20660-1-marek.behun@nic.cz>

This patch moves the functions operating on the hidden debug registers
into it's own file, port_hidden.c. The functions prefix is renamed from
mv88e6390_hidden_ to mv88e6xxx_port_hidden_, to be consistent with the
rest of this driver. The macros are prefixed with MV88E6XXX_ prefix, and
are changed not to use the BIT() macro nor bit shifts, since the rest of
the port.h file does not use it.

We also add the support for setting the Block Address field when
operating hidden registers. Marvell's mdio examples for SERDES settings
on Topaz use Block Address 0x7 when reading/writing hidden registers,
and although the specification says that block must be set to 0xf, those
settings are reachable only with Block Address 0x7.

Signed-off-by: Marek Behún <marek.behun@nic.cz>
---
 drivers/net/dsa/mv88e6xxx/Makefile      |  1 +
 drivers/net/dsa/mv88e6xxx/chip.c        | 58 +-------------------
 drivers/net/dsa/mv88e6xxx/port.h        | 22 +++++---
 drivers/net/dsa/mv88e6xxx/port_hidden.c | 70 +++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 64 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/port_hidden.c

diff --git a/drivers/net/dsa/mv88e6xxx/Makefile b/drivers/net/dsa/mv88e6xxx/Makefile
index e85755dde90b..aa645ff86f64 100644
--- a/drivers/net/dsa/mv88e6xxx/Makefile
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -10,6 +10,7 @@ mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_GLOBAL2) += global2_scratch.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += hwtstamp.o
 mv88e6xxx-objs += phy.o
 mv88e6xxx-objs += port.o
+mv88e6xxx-objs += port_hidden.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o
 mv88e6xxx-objs += serdes.o
 mv88e6xxx-objs += smi.o
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d0bf98c10b2b..ec4274d71145 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2317,60 +2317,6 @@ static int mv88e6xxx_stats_setup(struct mv88e6xxx_chip *chip)
 	return mv88e6xxx_g1_stats_clear(chip);
 }
 
-/* The mv88e6390 has some hidden registers used for debug and
- * development. The errata also makes use of them.
- */
-static int mv88e6390_hidden_write(struct mv88e6xxx_chip *chip, int port,
-				  int reg, u16 val)
-{
-	u16 ctrl;
-	int err;
-
-	err = mv88e6xxx_port_write(chip, PORT_RESERVED_1A_DATA_PORT,
-				   PORT_RESERVED_1A, val);
-	if (err)
-		return err;
-
-	ctrl = PORT_RESERVED_1A_BUSY | PORT_RESERVED_1A_WRITE |
-	       PORT_RESERVED_1A_BLOCK | port << PORT_RESERVED_1A_PORT_SHIFT |
-	       reg;
-
-	return mv88e6xxx_port_write(chip, PORT_RESERVED_1A_CTRL_PORT,
-				    PORT_RESERVED_1A, ctrl);
-}
-
-static int mv88e6390_hidden_wait(struct mv88e6xxx_chip *chip)
-{
-	int bit = __bf_shf(PORT_RESERVED_1A_BUSY);
-
-	return mv88e6xxx_wait_bit(chip, PORT_RESERVED_1A_CTRL_PORT,
-				  PORT_RESERVED_1A, bit, 0);
-}
-
-
-static int mv88e6390_hidden_read(struct mv88e6xxx_chip *chip, int port,
-				  int reg, u16 *val)
-{
-	u16 ctrl;
-	int err;
-
-	ctrl = PORT_RESERVED_1A_BUSY | PORT_RESERVED_1A_READ |
-	       PORT_RESERVED_1A_BLOCK | port << PORT_RESERVED_1A_PORT_SHIFT |
-	       reg;
-
-	err = mv88e6xxx_port_write(chip, PORT_RESERVED_1A_CTRL_PORT,
-				   PORT_RESERVED_1A, ctrl);
-	if (err)
-		return err;
-
-	err = mv88e6390_hidden_wait(chip);
-	if (err)
-		return err;
-
-	return 	mv88e6xxx_port_read(chip, PORT_RESERVED_1A_DATA_PORT,
-				    PORT_RESERVED_1A, val);
-}
-
 /* Check if the errata has already been applied. */
 static bool mv88e6390_setup_errata_applied(struct mv88e6xxx_chip *chip)
 {
@@ -2379,7 +2325,7 @@ static bool mv88e6390_setup_errata_applied(struct mv88e6xxx_chip *chip)
 	u16 val;
 
 	for (port = 0; port < mv88e6xxx_num_ports(chip); port++) {
-		err = mv88e6390_hidden_read(chip, port, 0, &val);
+		err = mv88e6xxx_port_hidden_read(chip, 0xf, port, 0, &val);
 		if (err) {
 			dev_err(chip->dev,
 				"Error reading hidden register: %d\n", err);
@@ -2412,7 +2358,7 @@ static int mv88e6390_setup_errata(struct mv88e6xxx_chip *chip)
 	}
 
 	for (port = 0; port < mv88e6xxx_num_ports(chip); port++) {
-		err = mv88e6390_hidden_write(chip, port, 0, 0x01c0);
+		err = mv88e6xxx_port_hidden_write(chip, 0xf, port, 0, 0x01c0);
 		if (err)
 			return err;
 	}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index 1abf5ea033e2..21d2d8f7c8f9 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -261,14 +261,14 @@
 #define MV88E6095_PORT_IEEE_PRIO_REMAP_4567	0x19
 
 /* Offset 0x1a: Magic undocumented errata register */
-#define PORT_RESERVED_1A			0x1a
-#define PORT_RESERVED_1A_BUSY			BIT(15)
-#define PORT_RESERVED_1A_WRITE			BIT(14)
-#define PORT_RESERVED_1A_READ			0
-#define PORT_RESERVED_1A_PORT_SHIFT		5
-#define PORT_RESERVED_1A_BLOCK			(0xf << 10)
-#define PORT_RESERVED_1A_CTRL_PORT		4
-#define PORT_RESERVED_1A_DATA_PORT		5
+#define MV88E6XXX_PORT_RESERVED_1A		0x1a
+#define MV88E6XXX_PORT_RESERVED_1A_BUSY		0x8000
+#define MV88E6XXX_PORT_RESERVED_1A_WRITE	0x4000
+#define MV88E6XXX_PORT_RESERVED_1A_READ		0x0000
+#define MV88E6XXX_PORT_RESERVED_1A_PORT_SHIFT	5
+#define MV88E6XXX_PORT_RESERVED_1A_BLOCK_SHIFT	10
+#define MV88E6XXX_PORT_RESERVED_1A_CTRL_PORT	0x04
+#define MV88E6XXX_PORT_RESERVED_1A_DATA_PORT	0x05
 
 int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
 			u16 *val);
@@ -353,4 +353,10 @@ int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port,
 int mv88e6xxx_port_disable_learn_limit(struct mv88e6xxx_chip *chip, int port);
 int mv88e6xxx_port_disable_pri_override(struct mv88e6xxx_chip *chip, int port);
 
+int mv88e6xxx_port_hidden_write(struct mv88e6xxx_chip *chip, int block,
+				int port, int reg, u16 val);
+int mv88e6xxx_port_hidden_wait(struct mv88e6xxx_chip *chip);
+int mv88e6xxx_port_hidden_read(struct mv88e6xxx_chip *chip, int block, int port,
+			       int reg, u16 *val);
+
 #endif /* _MV88E6XXX_PORT_H */
diff --git a/drivers/net/dsa/mv88e6xxx/port_hidden.c b/drivers/net/dsa/mv88e6xxx/port_hidden.c
new file mode 100644
index 000000000000..b49d05f0e117
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/port_hidden.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Marvell 88E6xxx Switch Hidden Registers support
+ *
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * Copyright (c) 2019 Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/bitfield.h>
+
+#include "chip.h"
+#include "port.h"
+
+/* The mv88e6390 and mv88e6341 have some hidden registers used for debug and
+ * development. The errata also makes use of them.
+ */
+int mv88e6xxx_port_hidden_write(struct mv88e6xxx_chip *chip, int block,
+				int port, int reg, u16 val)
+{
+	u16 ctrl;
+	int err;
+
+	err = mv88e6xxx_port_write(chip, MV88E6XXX_PORT_RESERVED_1A_DATA_PORT,
+				   MV88E6XXX_PORT_RESERVED_1A, val);
+	if (err)
+		return err;
+
+	ctrl = MV88E6XXX_PORT_RESERVED_1A_BUSY |
+	       MV88E6XXX_PORT_RESERVED_1A_WRITE |
+	       block << MV88E6XXX_PORT_RESERVED_1A_BLOCK_SHIFT |
+	       port << MV88E6XXX_PORT_RESERVED_1A_PORT_SHIFT |
+	       reg;
+
+	return mv88e6xxx_port_write(chip, MV88E6XXX_PORT_RESERVED_1A_CTRL_PORT,
+				    MV88E6XXX_PORT_RESERVED_1A, ctrl);
+}
+
+int mv88e6xxx_port_hidden_wait(struct mv88e6xxx_chip *chip)
+{
+	int bit = __bf_shf(MV88E6XXX_PORT_RESERVED_1A_BUSY);
+
+	return mv88e6xxx_wait_bit(chip, MV88E6XXX_PORT_RESERVED_1A_CTRL_PORT,
+				  MV88E6XXX_PORT_RESERVED_1A, bit, 0);
+}
+
+int mv88e6xxx_port_hidden_read(struct mv88e6xxx_chip *chip, int block, int port,
+			       int reg, u16 *val)
+{
+	u16 ctrl;
+	int err;
+
+	ctrl = MV88E6XXX_PORT_RESERVED_1A_BUSY |
+	       MV88E6XXX_PORT_RESERVED_1A_READ |
+	       block << MV88E6XXX_PORT_RESERVED_1A_BLOCK_SHIFT |
+	       port << MV88E6XXX_PORT_RESERVED_1A_PORT_SHIFT |
+	       reg;
+
+	err = mv88e6xxx_port_write(chip, MV88E6XXX_PORT_RESERVED_1A_CTRL_PORT,
+				   MV88E6XXX_PORT_RESERVED_1A, ctrl);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_port_hidden_wait(chip);
+	if (err)
+		return err;
+
+	return mv88e6xxx_port_read(chip, MV88E6XXX_PORT_RESERVED_1A_DATA_PORT,
+				   MV88E6XXX_PORT_RESERVED_1A, val);
+}
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v4 3/6] net: dsa: mv88e6xxx: create serdes_get_lane chip operation
From: Marek Behún @ 2019-08-26 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, Vladimir Oltean,
	Marek Behún
In-Reply-To: <20190826122109.20660-1-marek.behun@nic.cz>

Create a serdes_get_lane() method in the mv88e6xxx operations structure.
Use it instead of calling the different implementations.
Also change the methods so that their return value is used only for
error. The lane number is put into a place referred to by a pointer
given as argument. If the port does not have a lane, put -1 there.
Lanes are phy addresses, so use s8 as their type.

Signed-off-by: Marek Behún <marek.behun@nic.cz>
---
 drivers/net/dsa/mv88e6xxx/chip.c   |   6 ++
 drivers/net/dsa/mv88e6xxx/chip.h   |   3 +
 drivers/net/dsa/mv88e6xxx/port.c   |  14 ++--
 drivers/net/dsa/mv88e6xxx/serdes.c | 130 +++++++++++++++--------------
 drivers/net/dsa/mv88e6xxx/serdes.h |  20 ++++-
 5 files changed, 99 insertions(+), 74 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ec4274d71145..5a3fff1971b9 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3255,6 +3255,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.serdes_get_lane = mv88e6390_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390_serdes_irq_free,
 	.gpio_ops = &mv88e6352_gpio_ops,
@@ -3301,6 +3302,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390x_serdes_power,
+	.serdes_get_lane = mv88e6390x_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390x_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390x_serdes_irq_free,
 	.gpio_ops = &mv88e6352_gpio_ops,
@@ -3347,6 +3349,7 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.serdes_get_lane = mv88e6390_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390_serdes_irq_free,
 	.avb_ops = &mv88e6390_avb_ops,
@@ -3483,6 +3486,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.serdes_get_lane = mv88e6390_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390_serdes_irq_free,
 	.gpio_ops = &mv88e6352_gpio_ops,
@@ -3800,6 +3804,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.serdes_get_lane = mv88e6390_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390_serdes_irq_free,
 	.gpio_ops = &mv88e6352_gpio_ops,
@@ -3850,6 +3855,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390x_serdes_power,
+	.serdes_get_lane = mv88e6390x_serdes_get_lane,
 	.serdes_irq_setup = mv88e6390x_serdes_irq_setup,
 	.serdes_irq_free = mv88e6390x_serdes_irq_free,
 	.gpio_ops = &mv88e6352_gpio_ops,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index a406be2f5652..15d0c9f00f54 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -443,6 +443,9 @@ struct mv88e6xxx_ops {
 	/* Power on/off a SERDES interface */
 	int (*serdes_power)(struct mv88e6xxx_chip *chip, int port, bool on);
 
+	/* SERDES lane mapping */
+	int (*serdes_get_lane)(struct mv88e6xxx_chip *chip, int port, s8 *lane);
+
 	/* SERDES interrupt handling */
 	int (*serdes_irq_setup)(struct mv88e6xxx_chip *chip, int port);
 	void (*serdes_irq_free)(struct mv88e6xxx_chip *chip, int port);
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index c95cdb73e5a2..6a1fa5c72fdb 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -395,7 +395,7 @@ phy_interface_t mv88e6390x_port_max_speed_mode(int port)
 int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port,
 			      phy_interface_t mode)
 {
-	int lane;
+	s8 lane;
 	u16 cmode;
 	u16 reg;
 	int err;
@@ -434,9 +434,9 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port,
 	if (cmode == chip->ports[port].cmode)
 		return 0;
 
-	lane = mv88e6390x_serdes_get_lane(chip, port);
-	if (lane < 0 && lane != -ENODEV)
-		return lane;
+	err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+	if (err)
+		return err;
 
 	if (lane >= 0) {
 		if (chip->ports[port].serdes_irq) {
@@ -466,9 +466,9 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port,
 
 		chip->ports[port].cmode = cmode;
 
-		lane = mv88e6390x_serdes_get_lane(chip, port);
-		if (lane < 0)
-			return lane;
+		err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+		if (err)
+			return err;
 
 		err = mv88e6390x_serdes_power(chip, port, true);
 		if (err)
diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c
index 678aaba3d019..61ef7bf85829 100644
--- a/drivers/net/dsa/mv88e6xxx/serdes.c
+++ b/drivers/net/dsa/mv88e6xxx/serdes.c
@@ -286,36 +286,33 @@ void mv88e6352_serdes_irq_free(struct mv88e6xxx_chip *chip, int port)
 	chip->ports[port].serdes_irq = 0;
 }
 
-/* Return the SERDES lane address a port is using. Only Ports 9 and 10
- * have SERDES lanes. Returns -ENODEV if a port does not have a lane.
- */
-static int mv88e6390_serdes_get_lane(struct mv88e6xxx_chip *chip, int port)
+int mv88e6390_serdes_get_lane(struct mv88e6xxx_chip *chip, int port, s8 *lane)
 {
 	u8 cmode = chip->ports[port].cmode;
 
+	*lane = -1;
+
 	switch (port) {
 	case 9:
 		if (cmode == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode == MV88E6XXX_PORT_STS_CMODE_2500BASEX)
-			return MV88E6390_PORT9_LANE0;
-		return -ENODEV;
+			*lane = MV88E6390_PORT9_LANE0;
+		break;
 	case 10:
 		if (cmode == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode == MV88E6XXX_PORT_STS_CMODE_2500BASEX)
-			return MV88E6390_PORT10_LANE0;
-		return -ENODEV;
+			*lane = MV88E6390_PORT10_LANE0;
+		break;
 	default:
-		return -ENODEV;
+		break;
 	}
+
+	return 0;
 }
 
-/* Return the SERDES lane address a port is using. Ports 9 and 10 can
- * use multiple lanes. If so, return the first lane the port uses.
- * Returns -ENODEV if a port does not have a lane.
- */
-int mv88e6390x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port)
+int mv88e6390x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port, s8 *lane)
 {
 	u8 cmode_port9, cmode_port10, cmode_port;
 
@@ -323,76 +320,80 @@ int mv88e6390x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port)
 	cmode_port10 = chip->ports[10].cmode;
 	cmode_port = chip->ports[port].cmode;
 
+	*lane = -1;
+
 	switch (port) {
 	case 2:
 		if (cmode_port9 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_2500BASEX)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT9_LANE1;
-		return -ENODEV;
+				*lane = MV88E6390_PORT9_LANE1;
+		break;
 	case 3:
 		if (cmode_port9 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT9_LANE2;
-		return -ENODEV;
+				*lane = MV88E6390_PORT9_LANE2;
+		break;
 	case 4:
 		if (cmode_port9 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT9_LANE3;
-		return -ENODEV;
+				*lane = MV88E6390_PORT9_LANE3;
+		break;
 	case 5:
 		if (cmode_port10 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_2500BASEX)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT10_LANE1;
-		return -ENODEV;
+				*lane = MV88E6390_PORT10_LANE1;
+		break;
 	case 6:
 		if (cmode_port10 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT10_LANE2;
-		return -ENODEV;
+				*lane = MV88E6390_PORT10_LANE2;
+		break;
 	case 7:
 		if (cmode_port10 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
 			if (cmode_port == MV88E6XXX_PORT_STS_CMODE_1000BASE_X)
-				return MV88E6390_PORT10_LANE3;
-		return -ENODEV;
+				*lane = MV88E6390_PORT10_LANE3;
+		break;
 	case 9:
 		if (cmode_port9 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_XAUI ||
 		    cmode_port9 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
-			return MV88E6390_PORT9_LANE0;
-		return -ENODEV;
+			*lane = MV88E6390_PORT9_LANE0;
+		break;
 	case 10:
 		if (cmode_port10 == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_SGMII ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_XAUI ||
 		    cmode_port10 == MV88E6XXX_PORT_STS_CMODE_RXAUI)
-			return MV88E6390_PORT10_LANE0;
-		return -ENODEV;
+			*lane = MV88E6390_PORT10_LANE0;
+		break;
 	default:
-		return -ENODEV;
+		break;
 	}
+
+	return 0;
 }
 
 /* Set the power on/off for 10GBASE-R and 10GBASE-X4/X2 */
-static int mv88e6390_serdes_power_10g(struct mv88e6xxx_chip *chip, int lane,
+static int mv88e6390_serdes_power_10g(struct mv88e6xxx_chip *chip, s8 lane,
 				      bool on)
 {
 	u16 val, new_val;
@@ -419,7 +420,7 @@ static int mv88e6390_serdes_power_10g(struct mv88e6xxx_chip *chip, int lane,
 }
 
 /* Set the power on/off for SGMII and 1000Base-X */
-static int mv88e6390_serdes_power_sgmii(struct mv88e6xxx_chip *chip, int lane,
+static int mv88e6390_serdes_power_sgmii(struct mv88e6xxx_chip *chip, s8 lane,
 					bool on)
 {
 	u16 val, new_val;
@@ -445,7 +446,7 @@ static int mv88e6390_serdes_power_sgmii(struct mv88e6xxx_chip *chip, int lane,
 }
 
 static int mv88e6390_serdes_power_lane(struct mv88e6xxx_chip *chip, int port,
-				       int lane, bool on)
+				       s8 lane, bool on)
 {
 	u8 cmode = chip->ports[port].cmode;
 
@@ -464,14 +465,14 @@ static int mv88e6390_serdes_power_lane(struct mv88e6xxx_chip *chip, int port,
 
 int mv88e6390_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
 {
-	int lane;
-
-	lane = mv88e6390_serdes_get_lane(chip, port);
-	if (lane == -ENODEV)
-		return 0;
+	s8 lane;
+	int err;
 
+	err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+	if (err)
+		return err;
 	if (lane < 0)
-		return lane;
+		return 0;
 
 	switch (port) {
 	case 9 ... 10:
@@ -483,14 +484,14 @@ int mv88e6390_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
 
 int mv88e6390x_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
 {
-	int lane;
-
-	lane = mv88e6390x_serdes_get_lane(chip, port);
-	if (lane == -ENODEV)
-		return 0;
+	s8 lane;
+	int err;
 
+	err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+	if (err)
+		return err;
 	if (lane < 0)
-		return lane;
+		return 0;
 
 	switch (port) {
 	case 2 ... 4:
@@ -503,7 +504,7 @@ int mv88e6390x_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
 }
 
 static void mv88e6390_serdes_irq_link_sgmii(struct mv88e6xxx_chip *chip,
-					    int port, int lane)
+					    int port, s8 lane)
 {
 	u8 cmode = chip->ports[port].cmode;
 	struct dsa_switch *ds = chip->ds;
@@ -570,7 +571,7 @@ static void mv88e6390_serdes_irq_link_sgmii(struct mv88e6xxx_chip *chip,
 }
 
 static int mv88e6390_serdes_irq_enable_sgmii(struct mv88e6xxx_chip *chip,
-					     int lane)
+					     s8 lane)
 {
 	return mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS,
 				      MV88E6390_SGMII_INT_ENABLE,
@@ -579,14 +580,14 @@ static int mv88e6390_serdes_irq_enable_sgmii(struct mv88e6xxx_chip *chip,
 }
 
 static int mv88e6390_serdes_irq_disable_sgmii(struct mv88e6xxx_chip *chip,
-					      int lane)
+					      s8 lane)
 {
 	return mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS,
 				      MV88E6390_SGMII_INT_ENABLE, 0);
 }
 
 int mv88e6390_serdes_irq_enable(struct mv88e6xxx_chip *chip, int port,
-				int lane)
+				s8 lane)
 {
 	u8 cmode = chip->ports[port].cmode;
 	int err = 0;
@@ -602,7 +603,7 @@ int mv88e6390_serdes_irq_enable(struct mv88e6xxx_chip *chip, int port,
 }
 
 int mv88e6390_serdes_irq_disable(struct mv88e6xxx_chip *chip, int port,
-				 int lane)
+				 s8 lane)
 {
 	u8 cmode = chip->ports[port].cmode;
 	int err = 0;
@@ -618,7 +619,7 @@ int mv88e6390_serdes_irq_disable(struct mv88e6xxx_chip *chip, int port,
 }
 
 static int mv88e6390_serdes_irq_status_sgmii(struct mv88e6xxx_chip *chip,
-					     int lane, u16 *status)
+					     s8 lane, u16 *status)
 {
 	int err;
 
@@ -635,10 +636,10 @@ static irqreturn_t mv88e6390_serdes_thread_fn(int irq, void *dev_id)
 	irqreturn_t ret = IRQ_NONE;
 	u8 cmode = port->cmode;
 	u16 status;
-	int lane;
 	int err;
+	s8 lane;
 
-	lane = mv88e6390x_serdes_get_lane(chip, port->port);
+	mv88e6xxx_serdes_get_lane(chip, port->port, &lane);
 
 	mv88e6xxx_reg_lock(chip);
 
@@ -663,16 +664,14 @@ static irqreturn_t mv88e6390_serdes_thread_fn(int irq, void *dev_id)
 
 int mv88e6390x_serdes_irq_setup(struct mv88e6xxx_chip *chip, int port)
 {
-	int lane;
 	int err;
+	s8 lane;
 
-	lane = mv88e6390x_serdes_get_lane(chip, port);
-
-	if (lane == -ENODEV)
-		return 0;
-
+	err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+	if (err)
+		return err;
 	if (lane < 0)
-		return lane;
+		return 0;
 
 	chip->ports[port].serdes_irq = irq_find_mapping(chip->g2_irq.domain,
 							port);
@@ -711,11 +710,14 @@ int mv88e6390_serdes_irq_setup(struct mv88e6xxx_chip *chip, int port)
 
 void mv88e6390x_serdes_irq_free(struct mv88e6xxx_chip *chip, int port)
 {
-	int lane = mv88e6390x_serdes_get_lane(chip, port);
+	int err;
+	s8 lane;
 
-	if (lane == -ENODEV)
+	err = mv88e6xxx_serdes_get_lane(chip, port, &lane);
+	if (err) {
+		dev_err(chip->dev, "Unable to free SERDES irq: %d\n", err);
 		return;
-
+	}
 	if (lane < 0)
 		return;
 
diff --git a/drivers/net/dsa/mv88e6xxx/serdes.h b/drivers/net/dsa/mv88e6xxx/serdes.h
index ff5b94439335..1ddb8fb3aab9 100644
--- a/drivers/net/dsa/mv88e6xxx/serdes.h
+++ b/drivers/net/dsa/mv88e6xxx/serdes.h
@@ -74,7 +74,21 @@
 #define MV88E6390_SGMII_PHY_STATUS_SPD_DPL_VALID BIT(11)
 #define MV88E6390_SGMII_PHY_STATUS_LINK		BIT(10)
 
-int mv88e6390x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port);
+/* Put the SERDES lane address a port is using into *lane. If a port has
+ * multiple lanes, should put the first lane the port is using. If a port does
+ * not have a lane, put -1 into *lane.
+ */
+static inline int mv88e6xxx_serdes_get_lane(struct mv88e6xxx_chip *chip,
+					    int port, s8 *lane)
+{
+	if (!chip->info->ops->serdes_get_lane)
+		return -EOPNOTSUPP;
+
+	return chip->info->ops->serdes_get_lane(chip, port, lane);
+}
+
+int mv88e6390_serdes_get_lane(struct mv88e6xxx_chip *chip, int port, s8 *lane);
+int mv88e6390x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port, s8 *lane);
 int mv88e6341_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
 int mv88e6352_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
 int mv88e6390_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
@@ -89,9 +103,9 @@ int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip,
 int mv88e6352_serdes_get_stats(struct mv88e6xxx_chip *chip, int port,
 			       uint64_t *data);
 int mv88e6390_serdes_irq_enable(struct mv88e6xxx_chip *chip, int port,
-				int lane);
+				s8 lane);
 int mv88e6390_serdes_irq_disable(struct mv88e6xxx_chip *chip, int port,
-				 int lane);
+				 s8 lane);
 int mv88e6352_serdes_irq_setup(struct mv88e6xxx_chip *chip, int port);
 void mv88e6352_serdes_irq_free(struct mv88e6xxx_chip *chip, int port);
 
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2 0/3] Add NETIF_F_HW_BR_CAP feature
From: Andrew Lunn @ 2019-08-26 12:38 UTC (permalink / raw)
  To: Horatiu Vultur
  Cc: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, f.fainelli, netdev, linux-kernel, bridge
In-Reply-To: <1566807075-775-1-git-send-email-horatiu.vultur@microchip.com>

On Mon, Aug 26, 2019 at 10:11:12AM +0200, Horatiu Vultur wrote:
> When a network port is added to a bridge then the port is added in
> promisc mode. Some HW that has bridge capabilities(can learn, forward,
> flood etc the frames) they are disabling promisc mode in the network
> driver when the port is added to the SW bridge.
> 
> This patch adds the feature NETIF_F_HW_BR_CAP so that the network ports
> that have this feature will not be set in promisc mode when they are
> added to a SW bridge.
> 
> In this way the HW that has bridge capabilities don't need to send all the
> traffic to the CPU and can also implement the promisc mode and toggle it
> using the command 'ip link set dev swp promisc on'

Hi Horatiu

I'm still not convinced this is needed. The model is, the hardware is
there to accelerate what Linux can do in software. Any peculiarities
of the accelerator should be hidden in the driver.  If the accelerator
can do its job without needing promisc mode, do that in the driver.

So you are trying to differentiate between promisc mode because the
interface is a member of a bridge, and promisc mode because some
application, like pcap, has asked for promisc mode.

dev->promiscuity is a counter. So what you can do it look at its
value, and how the interface is being used. If the interface is not a
member of a bridge, and the count > 0, enable promisc mode in the
accelerator. If the interface is a member of a bridge, and the count >
1, enable promisc mode in the accelerator.

   Andrew

^ permalink raw reply

* Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space
From: Leo Yan @ 2019-08-26 12:51 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, linux-kernel, netdev,
	bpf, clang-built-linux, Mathieu Poirier, Peter Zijlstra,
	Suzuki Poulouse, coresight, linux-arm-kernel
In-Reply-To: <363577f1-097e-eddd-a6ca-b23f644dd8ce@intel.com>

Hi Adrian,

On Fri, Aug 16, 2019 at 04:00:02PM +0300, Adrian Hunter wrote:
> On 16/08/19 4:45 AM, Leo Yan wrote:
> > Hi Adrian,
> > 
> > On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:
> > 
> > [...]
> > 
> >>>> How come you cannot use kallsyms to get the information?
> >>>
> >>> Thanks for pointing out this.  Sorry I skipped your comment "I don't
> >>> know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
> >>> the patch v3, I should use that chance to elaborate the detailed idea
> >>> and so can get more feedback/guidance before procceed.
> >>>
> >>> Actually, I have considered to use kallsyms when worked on the previous
> >>> patch set.
> >>>
> >>> As mentioned in patch set v4's cover letter, I tried to implement
> >>> machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
> >>> parse kallsyms so can find more kernel maps and thus also can fixup
> >>> the kernel start address.  But I found the 'perf script' tool directly
> >>> calls machine__get_kernel_start() instead of running into the flow for
> >>> machine__create_extra_kernel_maps();
> >>
> >> Doesn't it just need to loop through each kernel map to find the lowest
> >> start address?
> > 
> > Based on your suggestion, I worked out below change and verified it
> > can work well on arm64 for fixing up start address; please let me know
> > if the change works for you?
> 
> How does that work if take a perf.data file to a machine with a different
> architecture?

Sorry I delayed so long to respond to your question; I didn't have
confidence to give out very reasonale answer and this is the main reason
for delaying.

For your question for taking a perf.data file to a machine with a
different architecture, we can firstly use command 'perf buildid-list'
to print out the buildid for kallsyms, based on the dumped buildid we
can find out the location for the saved kallsyms file; then we can use
option '--kallsyms' to specify the offline kallsyms file and use the
offline kallsyms to fixup kernel start address.  The detailed commands
are listed as below:

root@debian:~# perf buildid-list
7b36dfca8317ef74974ebd7ee5ec0a8b35c97640 [kernel.kallsyms]
56b84aa88a1bcfe222a97a53698b92723a3977ca /usr/lib/systemd/systemd
0956b952e9cd673d48ff2cfeb1a9dbd0c853e686 /usr/lib/aarch64-linux-gnu/libm-2.28.so
[...]

root@debian:~# perf script --kallsyms ~/.debug/\[kernel.kallsyms\]/7b36dfca8317ef74974ebd7ee5ec0a8b35c97640/kallsyms

The amended patch is as below, please review and always welcome
any suggestions or comments!

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 5734460fc89e..593f05cc453f 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2672,9 +2672,26 @@ int machine__nr_cpus_avail(struct machine *machine)
 	return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
 }
 
+static int machine__fixup_kernel_start(void *arg,
+				       const char *name __maybe_unused,
+				       char type,
+				       u64 start)
+{
+	struct machine *machine = arg;
+
+	type = toupper(type);
+
+	/* Fixup for text, weak, data and bss sections. */
+	if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
+		machine->kernel_start = min(machine->kernel_start, start);
+
+	return 0;
+}
+
 int machine__get_kernel_start(struct machine *machine)
 {
 	struct map *map = machine__kernel_map(machine);
+	char filename[PATH_MAX];
 	int err = 0;
 
 	/*
@@ -2696,6 +2713,22 @@ int machine__get_kernel_start(struct machine *machine)
 		if (!err && !machine__is(machine, "x86_64"))
 			machine->kernel_start = map->start;
 	}
+
+	if (symbol_conf.kallsyms_name != NULL) {
+		strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
+	} else {
+		machine__get_kallsyms_filename(machine, filename, PATH_MAX);
+
+		if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+			goto out;
+	}
+
+	if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
+		pr_warning("Fail to fixup kernel start address. skipping...\n");
+
+out:
 	return err;
 }
 

Thanks,
Leo Yan

^ permalink raw reply related

* RE: [PATCH net-next] dpaa2-eth: Add pause frame support
From: Ioana Ciocoi Radulescu @ 2019-08-26 13:00 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev@vger.kernel.org, davem@davemloft.net, Ioana Ciornei
In-Reply-To: <20190823163037.GA19727@lunn.ch>

> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Friday, August 23, 2019 7:31 PM
> To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com>
> Cc: netdev@vger.kernel.org; davem@davemloft.net; Ioana Ciornei
> <ioana.ciornei@nxp.com>
> Subject: Re: [PATCH net-next] dpaa2-eth: Add pause frame support
> 
> > --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
> > +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
> > @@ -78,29 +78,20 @@ static int
> >  dpaa2_eth_get_link_ksettings(struct net_device *net_dev,
> >  			     struct ethtool_link_ksettings *link_settings)
> >  {
> > -	struct dpni_link_state state = {0};
> > -	int err = 0;
> >  	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
> >
> > -	err = dpni_get_link_state(priv->mc_io, 0, priv->mc_token, &state);
> > -	if (err) {
> > -		netdev_err(net_dev, "ERROR %d getting link state\n", err);
> > -		goto out;
> > -	}
> > -
> >  	/* At the moment, we have no way of interrogating the DPMAC
> >  	 * from the DPNI side - and for that matter there may exist
> >  	 * no DPMAC at all. So for now we just don't report anything
> >  	 * beyond the DPNI attributes.
> >  	 */
> > -	if (state.options & DPNI_LINK_OPT_AUTONEG)
> > +	if (priv->link_state.options & DPNI_LINK_OPT_AUTONEG)
> >  		link_settings->base.autoneg = AUTONEG_ENABLE;
> > -	if (!(state.options & DPNI_LINK_OPT_HALF_DUPLEX))
> > +	if (!(priv->link_state.options & DPNI_LINK_OPT_HALF_DUPLEX))
> >  		link_settings->base.duplex = DUPLEX_FULL;
> > -	link_settings->base.speed = state.rate;
> > +	link_settings->base.speed = priv->link_state.rate;
> >
> > -out:
> > -	return err;
> > +	return 0;
> 
> Hi Ioana
> 
> I think this patch can be broken up a bit, to help review.
> 
> It looks like this change to report state via priv->link_state should
> be a separate patch. I think this change can be made without the pause
> changes. That then makes the pause changes themselves simpler.

Agree, will change in v2

> 
> > +static void dpaa2_eth_get_pauseparam(struct net_device *net_dev,
> > +				     struct ethtool_pauseparam *pause)
> > +{
> > +	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
> > +	u64 link_options = priv->link_state.options;
> > +
> > +	pause->rx_pause = !!(link_options & DPNI_LINK_OPT_PAUSE);
> > +	pause->tx_pause = pause->rx_pause ^
> > +			  !!(link_options & DPNI_LINK_OPT_ASYM_PAUSE);
> 
> Since you don't support auto-neg, you should set pause->autoneg to
> false. It probably already is set to false, by a memset, but setting
> it explicitly is a form of documentation, this hardware only supports
> forced pause configuration.
> 

Ok.

> > +}
> > +
> > +static int dpaa2_eth_set_pauseparam(struct net_device *net_dev,
> > +				    struct ethtool_pauseparam *pause)
> > +{
> > +	struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
> > +	struct dpni_link_cfg cfg = {0};
> > +	int err;
> > +
> > +	if (!dpaa2_eth_has_pause_support(priv)) {
> > +		netdev_info(net_dev, "No pause frame support for DPNI
> version < %d.%d\n",
> > +			    DPNI_PAUSE_VER_MAJOR,
> DPNI_PAUSE_VER_MINOR);
> > +		return -EOPNOTSUPP;
> > +	}
> > +
> > +	if (pause->autoneg)
> > +		netdev_err(net_dev, "Pause frame autoneg not supported\n");
> 
> And here you should return -EOPNOTSUPP. No need for an netdev_err. It
> is not an error, you simply don't support it.

Ok

> 
> There is also the issue of what is the PHY doing? It would not be good
> if the MAC is configured one way, but the PHY is advertising something
> else. So it appears you have no control over the PHY. But i assume you
> know what the PHY is actually doing? Does it advertise pause/asym
> pause?
> 
> It might be, to keep things consistent, we only accept pause
> configuration when auto-neg in general is disabled.

Ah, this is an area in our driver that's a bit messy and complicated.
Like you said, we don't control the PHY, actually we only support
fixed-link PHYs for now. General autoneg should really be reported as
always off.

We're working to add proper support in this area, but until we manage
to do it I think it's best we just remove the possibility of users
changing the link settings via ethtool - it's code that shouldn't be
there (and firmware doesn't allow it anyway). I'll include this cleanup
in the next version.

Thanks a lot for the feedback,
Ioana

^ permalink raw reply

* Re: [PATCH v2 5/5] ARM: dts: ls1021a-tsn: Use the DSPI controller in poll mode
From: Vladimir Oltean @ 2019-08-26 13:10 UTC (permalink / raw)
  To: Mark Brown; +Cc: linux-spi, lkml, devicetree, netdev, Rob Herring, Shawn Guo
In-Reply-To: <20190822211514.19288-6-olteanv@gmail.com>

Hi Mark,

On Fri, 23 Aug 2019 at 00:15, Vladimir Oltean <olteanv@gmail.com> wrote:
>
> Connected to the LS1021A DSPI is the SJA1105 DSA switch. This
> constitutes 4 of the 6 Ethernet ports on this board.
>
> As the SJA1105 is a PTP switch, constant disciplining of its PTP clock
> is necessary, and that translates into a lot of SPI I/O even when
> otherwise idle.
>
> Switching to using the DSPI in poll mode has several distinct
> benefits:
>
> - With interrupts, the DSPI driver in TCFQ mode raises an IRQ after each
>   transmitted byte. There is more time wasted for the "waitq" event than
>   for actual I/O. And the DSPI IRQ count is by far the largest in
>   /proc/interrupts on this board (larger than Ethernet). I should
>   mention that due to various LS1021A errata, other operating modes than
>   TCFQ are not available.
>
> - The SPI I/O time is both lower, and more consistently so. For a TSN
>   switch it is important that all SPI transfers take a deterministic
>   time to complete.
>   Reading the PTP clock is an important example.
>   Egressing through the switch requires some setup in advance (an SPI
>   write command). Without this patch, that operation required a
>   --tx_timestamp_timeout 50 (ms), now it can be done with
>   --tx_timestamp_timeout 10.
>   Yet another example is reconstructing timestamps, which has a hard
>   deadline because the PTP timestamping counter wraps around in 0.135
>   seconds. Combined with other I/O needed for that to happen, there is
>   a real risk that the deadline is not always met.
>
> See drivers/net/dsa/sja1105/ for more info about the above.
>
> Cc: Rob Herring <robh@kernel.org>
> Cc: Shawn Guo <shawnguo@kernel.org>
> Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
> ---
>  arch/arm/boot/dts/ls1021a-tsn.dts | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm/boot/dts/ls1021a-tsn.dts b/arch/arm/boot/dts/ls1021a-tsn.dts
> index 5b7689094b70..1c09cfc766af 100644
> --- a/arch/arm/boot/dts/ls1021a-tsn.dts
> +++ b/arch/arm/boot/dts/ls1021a-tsn.dts
> @@ -33,6 +33,7 @@
>  };
>
>  &dspi0 {
> +       /delete-property/ interrupts;
>         bus-num = <0>;
>         status = "okay";
>
> --
> 2.17.1
>

I noticed you skipped applying this patch, and I'm not sure that Shawn
will review it/take it.
Do you have a better suggestion how I can achieve putting the DSPI
driver in poll mode for this board? A Kconfig option maybe?

Regards,
-Vladimir

^ permalink raw reply

* Re: [PATCH net v3] ixgbe: fix double clean of tx descriptors with xdp
From: Maciej Fijalkowski @ 2019-08-26 13:40 UTC (permalink / raw)
  To: Ilya Maximets
  Cc: netdev, linux-kernel, bpf, David S. Miller, Björn Töpel,
	Magnus Karlsson, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jeff Kirsher, intel-wired-lan, Eelco Chaudron,
	William Tu, Alexander Duyck
In-Reply-To: <20190822171237.20798-1-i.maximets@samsung.com>

On Thu, 22 Aug 2019 20:12:37 +0300
Ilya Maximets <i.maximets@samsung.com> wrote:

> Tx code doesn't clear the descriptors' status after cleaning.
> So, if the budget is larger than number of used elems in a ring, some
> descriptors will be accounted twice and xsk_umem_complete_tx will move
> prod_tail far beyond the prod_head breaking the completion queue ring.
> 
> Fix that by limiting the number of descriptors to clean by the number
> of used descriptors in the tx ring.
> 
> 'ixgbe_clean_xdp_tx_irq()' function refactored to look more like
> 'ixgbe_xsk_clean_tx_ring()' since we're allowed to directly use
> 'next_to_clean' and 'next_to_use' indexes.
> 
> Fixes: 8221c5eba8c1 ("ixgbe: add AF_XDP zero-copy Tx support")
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
> 
> Version 3:
>   * Reverted some refactoring made for v2.
>   * Eliminated 'budget' for tx clean.
>   * prefetch returned.
> 
> Version 2:
>   * 'ixgbe_clean_xdp_tx_irq()' refactored to look more like
>     'ixgbe_xsk_clean_tx_ring()'.
> 
>  drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 29 ++++++++------------
>  1 file changed, 11 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> index 6b609553329f..a3b6d8c89127 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
> @@ -633,19 +633,17 @@ static void ixgbe_clean_xdp_tx_buffer(struct ixgbe_ring *tx_ring,
>  bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
>  			    struct ixgbe_ring *tx_ring, int napi_budget)

While you're at it, can you please as well remove the 'napi_budget' argument?
It wasn't used at all even before your patch.

I'm jumping late in, but I was really wondering and hesitated with taking
part in discussion since the v1 of this patch - can you elaborate why simply
clearing the DD bit wasn't sufficient?

Maciej

>  {
> +	u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use;
>  	unsigned int total_packets = 0, total_bytes = 0;
> -	u32 i = tx_ring->next_to_clean, xsk_frames = 0;
> -	unsigned int budget = q_vector->tx.work_limit;
>  	struct xdp_umem *umem = tx_ring->xsk_umem;
>  	union ixgbe_adv_tx_desc *tx_desc;
>  	struct ixgbe_tx_buffer *tx_bi;
> -	bool xmit_done;
> +	u32 xsk_frames = 0;
>  
> -	tx_bi = &tx_ring->tx_buffer_info[i];
> -	tx_desc = IXGBE_TX_DESC(tx_ring, i);
> -	i -= tx_ring->count;
> +	tx_bi = &tx_ring->tx_buffer_info[ntc];
> +	tx_desc = IXGBE_TX_DESC(tx_ring, ntc);
>  
> -	do {
> +	while (ntc != ntu) {
>  		if (!(tx_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)))
>  			break;
>  
> @@ -661,22 +659,18 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
>  
>  		tx_bi++;
>  		tx_desc++;
> -		i++;
> -		if (unlikely(!i)) {
> -			i -= tx_ring->count;
> +		ntc++;
> +		if (unlikely(ntc == tx_ring->count)) {
> +			ntc = 0;
>  			tx_bi = tx_ring->tx_buffer_info;
>  			tx_desc = IXGBE_TX_DESC(tx_ring, 0);
>  		}
>  
>  		/* issue prefetch for next Tx descriptor */
>  		prefetch(tx_desc);
> +	}
>  
> -		/* update budget accounting */
> -		budget--;
> -	} while (likely(budget));
> -
> -	i += tx_ring->count;
> -	tx_ring->next_to_clean = i;
> +	tx_ring->next_to_clean = ntc;
>  
>  	u64_stats_update_begin(&tx_ring->syncp);
>  	tx_ring->stats.bytes += total_bytes;
> @@ -688,8 +682,7 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
>  	if (xsk_frames)
>  		xsk_umem_complete_tx(umem, xsk_frames);
>  
> -	xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
> -	return budget > 0 && xmit_done;
> +	return ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
>  }
>  
>  int ixgbe_xsk_async_xmit(struct net_device *dev, u32 qid)


^ permalink raw reply

* [PATCH net-next v3 05/10] net: sched: add API for registering unlocked offload block callbacks
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

Extend struct flow_block_offload with "unlocked_driver_cb" flag to allow
registering and unregistering block hardware offload callbacks that do not
require caller to hold rtnl lock. Extend tcf_block with additional
lockeddevcnt counter that is incremented for each non-unlocked driver
callback attached to device. This counter is necessary to conditionally
obtain rtnl lock before calling hardware callbacks in following patches.

Register mlx5 tc block offload callbacks as "unlocked".

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 3 +++
 include/net/flow_offload.h                        | 1 +
 include/net/sch_generic.h                         | 1 +
 net/sched/cls_api.c                               | 6 ++++++
 5 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index fa4bf2d4bcd4..8592b98d0e70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3470,10 +3470,12 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct flow_block_offload *f = type_data;
 
 	switch (type) {
 #ifdef CONFIG_MLX5_ESWITCH
 	case TC_SETUP_BLOCK:
+		f->unlocked_driver_cb = true;
 		return flow_block_cb_setup_simple(type_data,
 						  &mlx5e_block_cb_list,
 						  mlx5e_setup_tc_block_cb,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3c0d36b2b91c..e7ac6233037d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -763,6 +763,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
+	f->unlocked_driver_cb = true;
 	f->driver_block_list = &mlx5e_block_cb_list;
 
 	switch (f->command) {
@@ -1245,9 +1246,11 @@ static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			      void *type_data)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct flow_block_offload *f = type_data;
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
+		f->unlocked_driver_cb = true;
 		return flow_block_cb_setup_simple(type_data,
 						  &mlx5e_rep_block_cb_list,
 						  mlx5e_rep_setup_tc_cb,
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 757fa84de654..fc881875f856 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -284,6 +284,7 @@ struct flow_block_offload {
 	enum flow_block_command command;
 	enum flow_block_binder_type binder_type;
 	bool block_shared;
+	bool unlocked_driver_cb;
 	struct net *net;
 	struct flow_block *block;
 	struct list_head cb_list;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c4fbbaff30a2..43f5b7ed02bd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -408,6 +408,7 @@ struct tcf_block {
 	bool keep_dst;
 	atomic_t offloadcnt; /* Number of oddloaded filters */
 	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
+	unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
 	struct {
 		struct tcf_chain *chain;
 		struct list_head filter_chain_list;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8b807e75fae2..1a39779bdbad 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1418,6 +1418,8 @@ static int tcf_block_bind(struct tcf_block *block,
 						  bo->extack);
 		if (err)
 			goto err_unroll;
+		if (!bo->unlocked_driver_cb)
+			block->lockeddevcnt++;
 
 		i++;
 	}
@@ -1433,6 +1435,8 @@ static int tcf_block_bind(struct tcf_block *block,
 						    block_cb->cb_priv, false,
 						    tcf_block_offload_in_use(block),
 						    NULL);
+			if (!bo->unlocked_driver_cb)
+				block->lockeddevcnt--;
 		}
 		flow_block_cb_free(block_cb);
 	}
@@ -1454,6 +1458,8 @@ static void tcf_block_unbind(struct tcf_block *block,
 					    NULL);
 		list_del(&block_cb->list);
 		flow_block_cb_free(block_cb);
+		if (!bo->unlocked_driver_cb)
+			block->lockeddevcnt--;
 	}
 }
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 06/10] net: sched: conditionally obtain rtnl lock in cls hw offloads API
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

In order to remove dependency on rtnl lock from offloads code of
classifiers, take rtnl lock conditionally before executing driver
callbacks. Only obtain rtnl lock if block is bound to devices that require
it.

Block bind/unbind code is rtnl-locked and obtains block->cb_lock while
holding rtnl lock. Obtain locks in same order in tc_setup_cb_*() functions
to prevent deadlock.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---

Notes:
    Changes from V2 to V3:
      - Don't speculatively take rtnl lock when caller already has it.
    
    Changes from V1 to V2:
      - Speculatively read block->lockeddevcnt in tc_setup_cb_*() to obtain rtnl
        mutex without retry when block is bound to locked device.

 net/sched/cls_api.c | 65 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1a39779bdbad..3c103cf9fd0d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3076,11 +3076,28 @@ __tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop, bool rtnl_held)
 {
+	bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
 	int ok_count;
 
+retry:
+	if (take_rtnl)
+		rtnl_lock();
 	down_read(&block->cb_lock);
+	/* Need to obtain rtnl lock if block is bound to devs that require it.
+	 * In block bind code cb_lock is obtained while holding rtnl, so we must
+	 * obtain the locks in same order here.
+	 */
+	if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+		up_read(&block->cb_lock);
+		take_rtnl = true;
+		goto retry;
+	}
+
 	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
 	up_read(&block->cb_lock);
+	if (take_rtnl)
+		rtnl_unlock();
 	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
@@ -3095,9 +3112,23 @@ int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
 		    enum tc_setup_type type, void *type_data, bool err_stop,
 		    u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
 {
+	bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
 	int ok_count;
 
+retry:
+	if (take_rtnl)
+		rtnl_lock();
 	down_read(&block->cb_lock);
+	/* Need to obtain rtnl lock if block is bound to devs that require it.
+	 * In block bind code cb_lock is obtained while holding rtnl, so we must
+	 * obtain the locks in same order here.
+	 */
+	if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+		up_read(&block->cb_lock);
+		take_rtnl = true;
+		goto retry;
+	}
+
 	/* Make sure all netdevs sharing this block are offload-capable. */
 	if (block->nooffloaddevcnt && err_stop) {
 		ok_count = -EOPNOTSUPP;
@@ -3115,6 +3146,8 @@ int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
 					  ok_count, true);
 err_unlock:
 	up_read(&block->cb_lock);
+	if (take_rtnl)
+		rtnl_unlock();
 	return ok_count < 0 ? ok_count : 0;
 }
 EXPORT_SYMBOL(tc_setup_cb_add);
@@ -3131,9 +3164,23 @@ int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
 			u32 *new_flags, unsigned int *new_in_hw_count,
 			bool rtnl_held)
 {
+	bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
 	int ok_count;
 
+retry:
+	if (take_rtnl)
+		rtnl_lock();
 	down_read(&block->cb_lock);
+	/* Need to obtain rtnl lock if block is bound to devs that require it.
+	 * In block bind code cb_lock is obtained while holding rtnl, so we must
+	 * obtain the locks in same order here.
+	 */
+	if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+		up_read(&block->cb_lock);
+		take_rtnl = true;
+		goto retry;
+	}
+
 	/* Make sure all netdevs sharing this block are offload-capable. */
 	if (block->nooffloaddevcnt && err_stop) {
 		ok_count = -EOPNOTSUPP;
@@ -3155,6 +3202,8 @@ int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
 					  new_flags, ok_count, true);
 err_unlock:
 	up_read(&block->cb_lock);
+	if (take_rtnl)
+		rtnl_unlock();
 	return ok_count < 0 ? ok_count : 0;
 }
 EXPORT_SYMBOL(tc_setup_cb_replace);
@@ -3167,9 +3216,23 @@ int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
 			enum tc_setup_type type, void *type_data, bool err_stop,
 			u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
 {
+	bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
 	int ok_count;
 
+retry:
+	if (take_rtnl)
+		rtnl_lock();
 	down_read(&block->cb_lock);
+	/* Need to obtain rtnl lock if block is bound to devs that require it.
+	 * In block bind code cb_lock is obtained while holding rtnl, so we must
+	 * obtain the locks in same order here.
+	 */
+	if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+		up_read(&block->cb_lock);
+		take_rtnl = true;
+		goto retry;
+	}
+
 	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
 
 	tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
@@ -3177,6 +3240,8 @@ int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
 		tp->ops->hw_del(tp, type_data);
 
 	up_read(&block->cb_lock);
+	if (take_rtnl)
+		rtnl_unlock();
 	return ok_count < 0 ? ok_count : 0;
 }
 EXPORT_SYMBOL(tc_setup_cb_destroy);
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 08/10] net: sched: take reference to action dev before calling offloads
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

In order to remove dependency on rtnl lock when calling hardware offload
API, take reference to action mirred dev when initializing flow_action
structure in tc_setup_flow_action(). Implement function
tc_cleanup_flow_action(), use it to release the device after hardware
offload API is done using it.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/pkt_cls.h  |  2 ++
 net/sched/cls_api.c    | 32 ++++++++++++++++++++++++++++++++
 net/sched/cls_flower.c |  2 ++
 3 files changed, 36 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a48824bc1489..e553fc80eb23 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -505,6 +505,8 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts, bool rtnl_held);
+void tc_cleanup_flow_action(struct flow_action *flow_action);
+
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop, bool rtnl_held);
 int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8751bb8a682f..d988737693e4 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3265,6 +3265,27 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
 }
 EXPORT_SYMBOL(tc_setup_cb_reoffload);
 
+void tc_cleanup_flow_action(struct flow_action *flow_action)
+{
+	struct flow_action_entry *entry;
+	int i;
+
+	flow_action_for_each(i, entry, flow_action) {
+		switch (entry->id) {
+		case FLOW_ACTION_REDIRECT:
+		case FLOW_ACTION_MIRRED:
+		case FLOW_ACTION_REDIRECT_INGRESS:
+		case FLOW_ACTION_MIRRED_INGRESS:
+			if (entry->dev)
+				dev_put(entry->dev);
+			break;
+		default:
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(tc_cleanup_flow_action);
+
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts, bool rtnl_held)
 {
@@ -3294,15 +3315,23 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 		} else if (is_tcf_mirred_egress_redirect(act)) {
 			entry->id = FLOW_ACTION_REDIRECT;
 			entry->dev = tcf_mirred_dev(act);
+			if (entry->dev)
+				dev_hold(entry->dev);
 		} else if (is_tcf_mirred_egress_mirror(act)) {
 			entry->id = FLOW_ACTION_MIRRED;
 			entry->dev = tcf_mirred_dev(act);
+			if (entry->dev)
+				dev_hold(entry->dev);
 		} else if (is_tcf_mirred_ingress_redirect(act)) {
 			entry->id = FLOW_ACTION_REDIRECT_INGRESS;
 			entry->dev = tcf_mirred_dev(act);
+			if (entry->dev)
+				dev_hold(entry->dev);
 		} else if (is_tcf_mirred_ingress_mirror(act)) {
 			entry->id = FLOW_ACTION_MIRRED_INGRESS;
 			entry->dev = tcf_mirred_dev(act);
+			if (entry->dev)
+				dev_hold(entry->dev);
 		} else if (is_tcf_vlan(act)) {
 			switch (tcf_vlan_action(act)) {
 			case TCA_VLAN_ACT_PUSH:
@@ -3410,6 +3439,9 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 	if (!rtnl_held)
 		rtnl_unlock();
 
+	if (err)
+		tc_cleanup_flow_action(flow_action);
+
 	return err;
 }
 EXPORT_SYMBOL(tc_setup_flow_action);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index fb305bd45d93..2852fe6f50d2 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -465,6 +465,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 
 	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
 			      skip_sw, &f->flags, &f->in_hw_count, true);
+	tc_cleanup_flow_action(&cls_flower.rule->action);
 	kfree(cls_flower.rule);
 
 	if (err) {
@@ -1838,6 +1839,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 					    TC_SETUP_CLSFLOWER, &cls_flower,
 					    cb_priv, &f->flags,
 					    &f->in_hw_count);
+		tc_cleanup_flow_action(&cls_flower.rule->action);
 		kfree(cls_flower.rule);
 
 		if (err) {
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 10/10] net: sched: flower: don't take rtnl lock for cls hw offloads API
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

Don't manually take rtnl lock in flower classifier before calling cls
hardware offloads API. Instead, pass rtnl lock status via 'rtnl_held'
parameter.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/cls_flower.c | 53 +++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2852fe6f50d2..74221e3351c3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -412,18 +412,13 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 	struct tcf_block *block = tp->chain->block;
 	struct flow_cls_offload cls_flower = {};
 
-	if (!rtnl_held)
-		rtnl_lock();
-
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
 	cls_flower.command = FLOW_CLS_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
 	tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
-			    &f->flags, &f->in_hw_count, true);
+			    &f->flags, &f->in_hw_count, rtnl_held);
 
-	if (!rtnl_held)
-		rtnl_unlock();
 }
 
 static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -435,14 +430,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	bool skip_sw = tc_skip_sw(f->flags);
 	int err = 0;
 
-	if (!rtnl_held)
-		rtnl_lock();
-
 	cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
-	if (!cls_flower.rule) {
-		err = -ENOMEM;
-		goto errout;
-	}
+	if (!cls_flower.rule)
+		return -ENOMEM;
 
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
 	cls_flower.command = FLOW_CLS_REPLACE;
@@ -453,36 +443,30 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	cls_flower.classid = f->res.classid;
 
 	err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
-				   true);
+				   rtnl_held);
 	if (err) {
 		kfree(cls_flower.rule);
-		if (skip_sw)
+		if (skip_sw) {
 			NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
-		else
-			err = 0;
-		goto errout;
+			return err;
+		}
+		return 0;
 	}
 
 	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
-			      skip_sw, &f->flags, &f->in_hw_count, true);
+			      skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
 	tc_cleanup_flow_action(&cls_flower.rule->action);
 	kfree(cls_flower.rule);
 
 	if (err) {
-		fl_hw_destroy_filter(tp, f, true, NULL);
-		goto errout;
-	}
-
-	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) {
-		err = -EINVAL;
-		goto errout;
+		fl_hw_destroy_filter(tp, f, rtnl_held, NULL);
+		return err;
 	}
 
-errout:
-	if (!rtnl_held)
-		rtnl_unlock();
+	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
+		return -EINVAL;
 
-	return err;
+	return 0;
 }
 
 static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
@@ -491,22 +475,17 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
 	struct tcf_block *block = tp->chain->block;
 	struct flow_cls_offload cls_flower = {};
 
-	if (!rtnl_held)
-		rtnl_lock();
-
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
 	cls_flower.command = FLOW_CLS_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.classid = f->res.classid;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
+			 rtnl_held);
 
 	tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
 			      cls_flower.stats.pkts,
 			      cls_flower.stats.lastused);
-
-	if (!rtnl_held)
-		rtnl_unlock();
 }
 
 static void __fl_put(struct cls_fl_filter *f)
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 04/10] net: sched: notify classifier on successful offload add/delete
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

To remove dependency on rtnl lock, extend classifier ops with new
ops->hw_add() and ops->hw_del() callbacks. Call them from cls API while
holding cb_lock every time filter if successfully added to or deleted from
hardware.

Implement the new API in flower classifier. Use it to manage hw_filters
list under cb_lock protection, instead of relying on rtnl lock to
synchronize with concurrent fl_reoffload() call.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---

Notes:
    Changes from V1 to V2:
      - Keep success flow unindented in tc_setup_cb_{add|replace}().

 include/net/sch_generic.h |  4 ++++
 net/sched/cls_api.c       | 19 +++++++++++++++++--
 net/sched/cls_flower.c    | 33 ++++++++++++++++++++++++++-------
 3 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f90e3b2a3065..c4fbbaff30a2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -312,6 +312,10 @@ struct tcf_proto_ops {
 	int			(*reoffload)(struct tcf_proto *tp, bool add,
 					     flow_setup_cb_t *cb, void *cb_priv,
 					     struct netlink_ext_ack *extack);
+	void			(*hw_add)(struct tcf_proto *tp,
+					  void *type_data);
+	void			(*hw_del)(struct tcf_proto *tp,
+					  void *type_data);
 	void			(*bind_class)(void *, u32, unsigned long);
 	void *			(*tmplt_create)(struct net *net,
 						struct tcf_chain *chain,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 6e612984e4a6..8b807e75fae2 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3099,6 +3099,11 @@ int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
 	}
 
 	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+	if (ok_count < 0)
+		goto err_unlock;
+
+	if (tp->ops->hw_add)
+		tp->ops->hw_add(tp, type_data);
 	if (ok_count > 0)
 		tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
 					  ok_count, true);
@@ -3130,11 +3135,18 @@ int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
 	}
 
 	tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+	if (tp->ops->hw_del)
+		tp->ops->hw_del(tp, type_data);
 
 	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+	if (ok_count < 0)
+		goto err_unlock;
+
+	if (tp->ops->hw_add)
+		tp->ops->hw_add(tp, type_data);
 	if (ok_count > 0)
-		tc_cls_offload_cnt_update(block, tp, new_in_hw_count, new_flags,
-					  ok_count, true);
+		tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
+					  new_flags, ok_count, true);
 err_unlock:
 	up_read(&block->cb_lock);
 	return ok_count < 0 ? ok_count : 0;
@@ -3155,6 +3167,9 @@ int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
 	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
 
 	tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+	if (tp->ops->hw_del)
+		tp->ops->hw_del(tp, type_data);
+
 	up_read(&block->cb_lock);
 	return ok_count < 0 ? ok_count : 0;
 }
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index cb816bbbd376..5cb694469b51 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -421,9 +421,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 
 	tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
 			    &f->flags, &f->in_hw_count, true);
-	spin_lock(&tp->lock);
-	list_del_init(&f->hw_list);
-	spin_unlock(&tp->lock);
 
 	if (!rtnl_held)
 		rtnl_unlock();
@@ -433,7 +430,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 				struct cls_fl_filter *f, bool rtnl_held,
 				struct netlink_ext_ack *extack)
 {
-	struct cls_fl_head *head = fl_head_dereference(tp);
 	struct tcf_block *block = tp->chain->block;
 	struct flow_cls_offload cls_flower = {};
 	bool skip_sw = tc_skip_sw(f->flags);
@@ -480,9 +476,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 		goto errout;
 	}
 
-	spin_lock(&tp->lock);
-	list_add(&f->hw_list, &head->hw_filters);
-	spin_unlock(&tp->lock);
 errout:
 	if (!rtnl_held)
 		rtnl_unlock();
@@ -1856,6 +1849,30 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 	return 0;
 }
 
+static void fl_hw_add(struct tcf_proto *tp, void *type_data)
+{
+	struct flow_cls_offload *cls_flower = type_data;
+	struct cls_fl_filter *f =
+		(struct cls_fl_filter *) cls_flower->cookie;
+	struct cls_fl_head *head = fl_head_dereference(tp);
+
+	spin_lock(&tp->lock);
+	list_add(&f->hw_list, &head->hw_filters);
+	spin_unlock(&tp->lock);
+}
+
+static void fl_hw_del(struct tcf_proto *tp, void *type_data)
+{
+	struct flow_cls_offload *cls_flower = type_data;
+	struct cls_fl_filter *f =
+		(struct cls_fl_filter *) cls_flower->cookie;
+
+	spin_lock(&tp->lock);
+	if (!list_empty(&f->hw_list))
+		list_del_init(&f->hw_list);
+	spin_unlock(&tp->lock);
+}
+
 static int fl_hw_create_tmplt(struct tcf_chain *chain,
 			      struct fl_flow_tmplt *tmplt)
 {
@@ -2516,6 +2533,8 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
 	.delete		= fl_delete,
 	.walk		= fl_walk,
 	.reoffload	= fl_reoffload,
+	.hw_add		= fl_hw_add,
+	.hw_del		= fl_hw_del,
 	.dump		= fl_dump,
 	.bind_class	= fl_bind_class,
 	.tmplt_create	= fl_tmplt_create,
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 01/10] net: sched: protect block offload-related fields with rw_semaphore
From: Vlad Buslov @ 2019-08-26 13:44 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

In order to remove dependency on rtnl lock, extend tcf_block with 'cb_lock'
rwsem and use it to protect flow_block->cb_list and related counters from
concurrent modification. The lock is taken in read mode for read-only
traversal of cb_list in tc_setup_cb_call() and write mode in all other
cases. This approach ensures that:

- cb_list is not changed concurrently while filters is being offloaded on
  block.

- block->nooffloaddevcnt is checked while holding the lock in read mode,
  but is only changed by bind/unbind code when holding the cb_lock in write
  mode to prevent concurrent modification.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---

Notes:
    Changes from V1 to V2:
      - Rename 'errout' label to 'err_unlock'.

 include/net/sch_generic.h |  2 ++
 net/sched/cls_api.c       | 45 +++++++++++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d9f359af0b93..a3eaf5f9d28f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -13,6 +13,7 @@
 #include <linux/refcount.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
@@ -396,6 +397,7 @@ struct tcf_block {
 	refcount_t refcnt;
 	struct net *net;
 	struct Qdisc *q;
+	struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
 	struct flow_block flow_block;
 	struct list_head owner_list;
 	bool keep_dst;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e0d8b456e9f5..959b7ca1ca02 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -568,9 +568,11 @@ static void tc_indr_block_ing_cmd(struct net_device *dev,
 
 	bo.block = &block->flow_block;
 
+	down_write(&block->cb_lock);
 	cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
 
 	tcf_block_setup(block, &bo);
+	up_write(&block->cb_lock);
 }
 
 static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
@@ -661,6 +663,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
 	struct net_device *dev = q->dev_queue->dev;
 	int err;
 
+	down_write(&block->cb_lock);
 	if (!dev->netdev_ops->ndo_setup_tc)
 		goto no_offload_dev_inc;
 
@@ -669,24 +672,31 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
 	 */
 	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
 		NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		goto err_unlock;
 	}
 
 	err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_inc;
 	if (err)
-		return err;
+		goto err_unlock;
 
 	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
+	up_write(&block->cb_lock);
 	return 0;
 
 no_offload_dev_inc:
-	if (tcf_block_offload_in_use(block))
-		return -EOPNOTSUPP;
+	if (tcf_block_offload_in_use(block)) {
+		err = -EOPNOTSUPP;
+		goto err_unlock;
+	}
+	err = 0;
 	block->nooffloaddevcnt++;
 	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
-	return 0;
+err_unlock:
+	up_write(&block->cb_lock);
+	return err;
 }
 
 static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
@@ -695,6 +705,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	struct net_device *dev = q->dev_queue->dev;
 	int err;
 
+	down_write(&block->cb_lock);
 	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
 
 	if (!dev->netdev_ops->ndo_setup_tc)
@@ -702,10 +713,12 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_dec;
+	up_write(&block->cb_lock);
 	return;
 
 no_offload_dev_dec:
 	WARN_ON(block->nooffloaddevcnt-- == 0);
+	up_write(&block->cb_lock);
 }
 
 static int
@@ -820,6 +833,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
 		return ERR_PTR(-ENOMEM);
 	}
 	mutex_init(&block->lock);
+	init_rwsem(&block->cb_lock);
 	flow_block_init(&block->flow_block);
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->owner_list);
@@ -1355,6 +1369,8 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
 	struct tcf_proto *tp, *tp_prev;
 	int err;
 
+	lockdep_assert_held(&block->cb_lock);
+
 	for (chain = __tcf_get_next_chain(block, NULL);
 	     chain;
 	     chain_prev = chain,
@@ -1393,6 +1409,8 @@ static int tcf_block_bind(struct tcf_block *block,
 	struct flow_block_cb *block_cb, *next;
 	int err, i = 0;
 
+	lockdep_assert_held(&block->cb_lock);
+
 	list_for_each_entry(block_cb, &bo->cb_list, list) {
 		err = tcf_block_playback_offloads(block, block_cb->cb,
 						  block_cb->cb_priv, true,
@@ -1427,6 +1445,8 @@ static void tcf_block_unbind(struct tcf_block *block,
 {
 	struct flow_block_cb *block_cb, *next;
 
+	lockdep_assert_held(&block->cb_lock);
+
 	list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
 		tcf_block_playback_offloads(block, block_cb->cb,
 					    block_cb->cb_priv, false,
@@ -2987,19 +3007,26 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 	int ok_count = 0;
 	int err;
 
+	down_read(&block->cb_lock);
 	/* Make sure all netdevs sharing this block are offload-capable. */
-	if (block->nooffloaddevcnt && err_stop)
-		return -EOPNOTSUPP;
+	if (block->nooffloaddevcnt && err_stop) {
+		ok_count = -EOPNOTSUPP;
+		goto err_unlock;
+	}
 
 	list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
-			if (err_stop)
-				return err;
+			if (err_stop) {
+				ok_count = err;
+				goto err_unlock;
+			}
 		} else {
 			ok_count++;
 		}
 	}
+err_unlock:
+	up_read(&block->cb_lock);
 	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 02/10] net: sched: change tcf block offload counter type to atomic_t
From: Vlad Buslov @ 2019-08-26 13:44 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

As a preparation for running proto ops functions without rtnl lock, change
offload counter type to atomic. This is necessary to allow updating the
counter by multiple concurrent users when offloading filters to hardware
from unlocked classifiers.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/sch_generic.h | 7 ++++---
 net/sched/cls_api.c       | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a3eaf5f9d28f..d778c502decd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
+#include <linux/atomic.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
@@ -401,7 +402,7 @@ struct tcf_block {
 	struct flow_block flow_block;
 	struct list_head owner_list;
 	bool keep_dst;
-	unsigned int offloadcnt; /* Number of oddloaded filters */
+	atomic_t offloadcnt; /* Number of oddloaded filters */
 	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
 	struct {
 		struct tcf_chain *chain;
@@ -443,7 +444,7 @@ static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
 	if (*flags & TCA_CLS_FLAGS_IN_HW)
 		return;
 	*flags |= TCA_CLS_FLAGS_IN_HW;
-	block->offloadcnt++;
+	atomic_inc(&block->offloadcnt);
 }
 
 static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
@@ -451,7 +452,7 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
 	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
 		return;
 	*flags &= ~TCA_CLS_FLAGS_IN_HW;
-	block->offloadcnt--;
+	atomic_dec(&block->offloadcnt);
 }
 
 static inline void
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 959b7ca1ca02..f2c2f8159e35 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -629,7 +629,7 @@ static void tc_indr_block_call(struct tcf_block *block,
 
 static bool tcf_block_offload_in_use(struct tcf_block *block)
 {
-	return block->offloadcnt;
+	return atomic_read(&block->offloadcnt);
 }
 
 static int tcf_block_offload_cmd(struct tcf_block *block,
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 03/10] net: sched: refactor block offloads counter usage
From: Vlad Buslov @ 2019-08-26 13:44 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

Without rtnl lock protection filters can no longer safely manage block
offloads counter themselves. Refactor cls API to protect block offloadcnt
with tcf_block->cb_lock that is already used to protect driver callback
list and nooffloaddevcnt counter. The counter can be modified by concurrent
tasks by new functions that execute block callbacks (which is safe with
previous patch that changed its type to atomic_t), however, block
bind/unbind code that checks the counter value takes cb_lock in write mode
to exclude any concurrent modifications. This approach prevents race
conditions between bind/unbind and callback execution code but allows for
concurrency for tc rule update path.

Move block offload counter, filter in hardware counter and filter flags
management from classifiers into cls hardware offloads API. Make functions
tcf_block_offload_{inc|dec}() and tc_cls_offload_cnt_update() to be cls API
private. Implement following new cls API to be used instead:

  tc_setup_cb_add() - non-destructive filter add. If filter that wasn't
  already in hardware is successfully offloaded, increment block offloads
  counter, set filter in hardware counter and flag. On failure, previously
  offloaded filter is considered to be intact and offloads counter is not
  decremented.

  tc_setup_cb_replace() - destructive filter replace. Release existing
  filter block offload counter and reset its in hardware counter and flag.
  Set new filter in hardware counter and flag. On failure, previously
  offloaded filter is considered to be destroyed and offload counter is
  decremented.

  tc_setup_cb_destroy() - filter destroy. Unconditionally decrement block
  offloads counter.

  tc_setup_cb_reoffload() - reoffload filter to single cb. Execute cb() and
  call tc_cls_offload_cnt_update() if cb() didn't return an error.

Refactor all offload-capable classifiers to atomically offload filters to
hardware, change block offload counter, and set filter in hardware counter
and flag by means of the new cls API functions.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---

Notes:
    Changes from V2 to V3:
      - Only return error from tc_setup_cb_reoffload() when adding new filter
      and when skip_sw flag is set.
    
    Changes from V1 to V2:
      - Remove redundant brackets around cnt.
      - Rename 'errout' label to 'err_unlock'.
      - Revert changing oldprog to cls_bpf.oldprog in conditional.
      - Add new helper tc_setup_cb_reoffload().
      - Make tc_cls_offload_cnt_update() static and remove its export.
      - Change tc_setup_cb_*() helpers to only return zero or error code.

 include/net/pkt_cls.h     |  17 +++-
 include/net/sch_generic.h |  31 -------
 net/sched/cls_api.c       | 176 +++++++++++++++++++++++++++++++++++---
 net/sched/cls_bpf.c       |  38 ++++----
 net/sched/cls_flower.c    |  38 +++-----
 net/sched/cls_matchall.c  |  27 +++---
 net/sched/cls_u32.c       |  29 +++----
 7 files changed, 233 insertions(+), 123 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 64999ffcb486..612232492f67 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -506,7 +506,22 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts);
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
-		     void *type_data, bool err_stop);
+		     void *type_data, bool err_stop, bool rtnl_held);
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+		    enum tc_setup_type type, void *type_data, bool err_stop,
+		    u32 *flags, unsigned int *in_hw_count, bool rtnl_held);
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+			enum tc_setup_type type, void *type_data, bool err_stop,
+			u32 *old_flags, unsigned int *old_in_hw_count,
+			u32 *new_flags, unsigned int *new_in_hw_count,
+			bool rtnl_held);
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+			enum tc_setup_type type, void *type_data, bool err_stop,
+			u32 *flags, unsigned int *in_hw_count, bool rtnl_held);
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+			  bool add, flow_setup_cb_t *cb,
+			  enum tc_setup_type type, void *type_data,
+			  void *cb_priv, u32 *flags, unsigned int *in_hw_count);
 unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
 
 struct tc_cls_u32_knode {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d778c502decd..f90e3b2a3065 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -439,37 +439,6 @@ static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
 #define tcf_proto_dereference(p, tp)					\
 	rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))
 
-static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
-{
-	if (*flags & TCA_CLS_FLAGS_IN_HW)
-		return;
-	*flags |= TCA_CLS_FLAGS_IN_HW;
-	atomic_inc(&block->offloadcnt);
-}
-
-static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
-{
-	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
-		return;
-	*flags &= ~TCA_CLS_FLAGS_IN_HW;
-	atomic_dec(&block->offloadcnt);
-}
-
-static inline void
-tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
-			  u32 *flags, bool add)
-{
-	if (add) {
-		if (!*cnt)
-			tcf_block_offload_inc(block, flags);
-		(*cnt)++;
-	} else {
-		(*cnt)--;
-		if (!*cnt)
-			tcf_block_offload_dec(block, flags);
-	}
-}
-
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 {
 	struct qdisc_skb_cb *qcb;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f2c2f8159e35..6e612984e4a6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3000,37 +3000,185 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_dump_stats);
 
-int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
-		     void *type_data, bool err_stop)
+static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
+{
+	if (*flags & TCA_CLS_FLAGS_IN_HW)
+		return;
+	*flags |= TCA_CLS_FLAGS_IN_HW;
+	atomic_inc(&block->offloadcnt);
+}
+
+static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+		return;
+	*flags &= ~TCA_CLS_FLAGS_IN_HW;
+	atomic_dec(&block->offloadcnt);
+}
+
+static void tc_cls_offload_cnt_update(struct tcf_block *block,
+				      struct tcf_proto *tp, u32 *cnt,
+				      u32 *flags, u32 diff, bool add)
+{
+	lockdep_assert_held(&block->cb_lock);
+
+	spin_lock(&tp->lock);
+	if (add) {
+		if (!*cnt)
+			tcf_block_offload_inc(block, flags);
+		*cnt += diff;
+	} else {
+		*cnt -= diff;
+		if (!*cnt)
+			tcf_block_offload_dec(block, flags);
+	}
+	spin_unlock(&tp->lock);
+}
+
+static void
+tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
+			 u32 *cnt, u32 *flags)
+{
+	lockdep_assert_held(&block->cb_lock);
+
+	spin_lock(&tp->lock);
+	tcf_block_offload_dec(block, flags);
+	*cnt = 0;
+	spin_unlock(&tp->lock);
+}
+
+static int
+__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+		   void *type_data, bool err_stop)
 {
 	struct flow_block_cb *block_cb;
 	int ok_count = 0;
 	int err;
 
-	down_read(&block->cb_lock);
-	/* Make sure all netdevs sharing this block are offload-capable. */
-	if (block->nooffloaddevcnt && err_stop) {
-		ok_count = -EOPNOTSUPP;
-		goto err_unlock;
-	}
-
 	list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
-			if (err_stop) {
-				ok_count = err;
-				goto err_unlock;
-			}
+			if (err_stop)
+				return err;
 		} else {
 			ok_count++;
 		}
 	}
-err_unlock:
+	return ok_count;
+}
+
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+		     void *type_data, bool err_stop, bool rtnl_held)
+{
+	int ok_count;
+
+	down_read(&block->cb_lock);
+	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
 	up_read(&block->cb_lock);
 	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
+/* Non-destructive filter add. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offloads counter. On failure,
+ * previously offloaded filter is considered to be intact and offloads counter
+ * is not decremented.
+ */
+
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+		    enum tc_setup_type type, void *type_data, bool err_stop,
+		    u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+	int ok_count;
+
+	down_read(&block->cb_lock);
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop) {
+		ok_count = -EOPNOTSUPP;
+		goto err_unlock;
+	}
+
+	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+	if (ok_count > 0)
+		tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
+					  ok_count, true);
+err_unlock:
+	up_read(&block->cb_lock);
+	return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_add);
+
+/* Destructive filter replace. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offload counter. On failure,
+ * previously offloaded filter is considered to be destroyed and offload counter
+ * is decremented.
+ */
+
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+			enum tc_setup_type type, void *type_data, bool err_stop,
+			u32 *old_flags, unsigned int *old_in_hw_count,
+			u32 *new_flags, unsigned int *new_in_hw_count,
+			bool rtnl_held)
+{
+	int ok_count;
+
+	down_read(&block->cb_lock);
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop) {
+		ok_count = -EOPNOTSUPP;
+		goto err_unlock;
+	}
+
+	tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+
+	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+	if (ok_count > 0)
+		tc_cls_offload_cnt_update(block, tp, new_in_hw_count, new_flags,
+					  ok_count, true);
+err_unlock:
+	up_read(&block->cb_lock);
+	return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_replace);
+
+/* Destroy filter and decrement block offload counter, if filter was previously
+ * offloaded.
+ */
+
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+			enum tc_setup_type type, void *type_data, bool err_stop,
+			u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+	int ok_count;
+
+	down_read(&block->cb_lock);
+	ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+	tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+	up_read(&block->cb_lock);
+	return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_destroy);
+
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+			  bool add, flow_setup_cb_t *cb,
+			  enum tc_setup_type type, void *type_data,
+			  void *cb_priv, u32 *flags, unsigned int *in_hw_count)
+{
+	int err = cb(type, type_data, cb_priv);
+
+	if (err) {
+		if (add && tc_skip_sw(*flags))
+			return err;
+	} else {
+		tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
+					  add);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_reoffload);
+
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts)
 {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 3f7a9c02b70c..bf10bdaf5012 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -163,17 +163,19 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	cls_bpf.exts_integrated = obj->exts_integrated;
 
 	if (oldprog)
-		tcf_block_offload_dec(block, &oldprog->gen_flags);
+		err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+					  skip_sw, &oldprog->gen_flags,
+					  &oldprog->in_hw_count,
+					  &prog->gen_flags, &prog->in_hw_count,
+					  true);
+	else
+		err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+				      skip_sw, &prog->gen_flags,
+				      &prog->in_hw_count, true);
 
-	err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
-	if (prog) {
-		if (err < 0) {
-			cls_bpf_offload_cmd(tp, oldprog, prog, extack);
-			return err;
-		} else if (err > 0) {
-			prog->in_hw_count = err;
-			tcf_block_offload_inc(block, &prog->gen_flags);
-		}
+	if (prog && err) {
+		cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+		return err;
 	}
 
 	if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
@@ -230,7 +232,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	cls_bpf.name = prog->bpf_name;
 	cls_bpf.exts_integrated = prog->exts_integrated;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true);
 }
 
 static int cls_bpf_init(struct tcf_proto *tp)
@@ -673,15 +675,11 @@ static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb
 		cls_bpf.name = prog->bpf_name;
 		cls_bpf.exts_integrated = prog->exts_integrated;
 
-		err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
-		if (err) {
-			if (add && tc_skip_sw(prog->gen_flags))
-				return err;
-			continue;
-		}
-
-		tc_cls_offload_cnt_update(block, &prog->in_hw_count,
-					  &prog->gen_flags, add);
+		err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF,
+					    &cls_bpf, cb_priv, &prog->gen_flags,
+					    &prog->in_hw_count);
+		if (err)
+			return err;
 	}
 
 	return 0;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 054123742e32..cb816bbbd376 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -419,10 +419,10 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 	cls_flower.command = FLOW_CLS_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+	tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
+			    &f->flags, &f->in_hw_count, true);
 	spin_lock(&tp->lock);
 	list_del_init(&f->hw_list);
-	tcf_block_offload_dec(block, &f->flags);
 	spin_unlock(&tp->lock);
 
 	if (!rtnl_held)
@@ -466,18 +466,13 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 		goto errout;
 	}
 
-	err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
+	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
+			      skip_sw, &f->flags, &f->in_hw_count, true);
 	kfree(cls_flower.rule);
 
-	if (err < 0) {
+	if (err) {
 		fl_hw_destroy_filter(tp, f, true, NULL);
 		goto errout;
-	} else if (err > 0) {
-		f->in_hw_count = err;
-		err = 0;
-		spin_lock(&tp->lock);
-		tcf_block_offload_inc(block, &f->flags);
-		spin_unlock(&tp->lock);
 	}
 
 	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) {
@@ -509,7 +504,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.classid = f->res.classid;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
 
 	tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
 			      cls_flower.stats.pkts,
@@ -1844,21 +1839,16 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 
 		cls_flower.classid = f->res.classid;
 
-		err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+		err = tc_setup_cb_reoffload(block, tp, add, cb,
+					    TC_SETUP_CLSFLOWER, &cls_flower,
+					    cb_priv, &f->flags,
+					    &f->in_hw_count);
 		kfree(cls_flower.rule);
 
 		if (err) {
-			if (add && tc_skip_sw(f->flags)) {
-				__fl_put(f);
-				return err;
-			}
-			goto next_flow;
+			__fl_put(f);
+			return err;
 		}
-
-		spin_lock(&tp->lock);
-		tc_cls_offload_cnt_update(block, &f->in_hw_count, &f->flags,
-					  add);
-		spin_unlock(&tp->lock);
 next_flow:
 		__fl_put(f);
 	}
@@ -1886,7 +1876,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
 	/* We don't care if driver (any of them) fails to handle this
 	 * call. It serves just as a hint for it.
 	 */
-	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
 	kfree(cls_flower.rule);
 
 	return 0;
@@ -1902,7 +1892,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
 	cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
 	cls_flower.cookie = (unsigned long) tmplt;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
 }
 
 static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 455ea2793f9b..911d1ea28bb2 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -75,8 +75,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
-	tcf_block_offload_dec(block, &head->flags);
+	tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false,
+			    &head->flags, &head->in_hw_count, true);
 }
 
 static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -109,15 +109,13 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 		return err;
 	}
 
-	err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
+	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall,
+			      skip_sw, &head->flags, &head->in_hw_count, true);
 	kfree(cls_mall.rule);
 
-	if (err < 0) {
+	if (err) {
 		mall_destroy_hw_filter(tp, head, cookie, NULL);
 		return err;
-	} else if (err > 0) {
-		head->in_hw_count = err;
-		tcf_block_offload_inc(block, &head->flags);
 	}
 
 	if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
@@ -312,16 +310,13 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 		return 0;
 	}
 
-	err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+	err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL,
+				    &cls_mall, cb_priv, &head->flags,
+				    &head->in_hw_count);
 	kfree(cls_mall.rule);
 
-	if (err) {
-		if (add && tc_skip_sw(head->flags))
-			return err;
-		return 0;
-	}
-
-	tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -337,7 +332,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
 	cls_mall.command = TC_CLSMATCHALL_STATS;
 	cls_mall.cookie = cookie;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
 
 	tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
 			      cls_mall.stats.pkts, cls_mall.stats.lastused);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 8614088edd1b..a0e6fac613de 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -480,7 +480,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	cls_u32.hnode.handle = h->handle;
 	cls_u32.hnode.prio = h->prio;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true);
 }
 
 static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -498,7 +498,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	cls_u32.hnode.handle = h->handle;
 	cls_u32.hnode.prio = h->prio;
 
-	err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true);
 	if (err < 0) {
 		u32_clear_hw_hnode(tp, h, NULL);
 		return err;
@@ -522,8 +522,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
-	tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
-	tcf_block_offload_dec(block, &n->flags);
+	tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false,
+			    &n->flags, &n->in_hw_count, true);
 }
 
 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -552,13 +552,11 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	if (n->ht_down)
 		cls_u32.knode.link_handle = ht->handle;
 
-	err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
-	if (err < 0) {
+	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw,
+			      &n->flags, &n->in_hw_count, true);
+	if (err) {
 		u32_remove_hw_knode(tp, n, NULL);
 		return err;
-	} else if (err > 0) {
-		n->in_hw_count = err;
-		tcf_block_offload_inc(block, &n->flags);
 	}
 
 	if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -1201,14 +1199,11 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 			cls_u32.knode.link_handle = ht->handle;
 	}
 
-	err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
-	if (err) {
-		if (add && tc_skip_sw(n->flags))
-			return err;
-		return 0;
-	}
-
-	tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+	err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+				    &cls_u32, cb_priv, &n->flags,
+				    &n->in_hw_count);
+	if (err)
+		return err;
 
 	return 0;
 }
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 07/10] net: sched: take rtnl lock in tc_setup_flow_action()
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

In order to allow using new flow_action infrastructure from unlocked
classifiers, modify tc_setup_flow_action() to accept new 'rtnl_held'
argument. Take rtnl lock before accessing tc_action data. This is necessary
to protect from concurrent action replace.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/pkt_cls.h    |  2 +-
 net/sched/cls_api.c      | 17 +++++++++++++----
 net/sched/cls_flower.c   |  6 ++++--
 net/sched/cls_matchall.c |  4 ++--
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 612232492f67..a48824bc1489 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -504,7 +504,7 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 }
 
 int tc_setup_flow_action(struct flow_action *flow_action,
-			 const struct tcf_exts *exts);
+			 const struct tcf_exts *exts, bool rtnl_held);
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop, bool rtnl_held);
 int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3c103cf9fd0d..8751bb8a682f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3266,14 +3266,17 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
 EXPORT_SYMBOL(tc_setup_cb_reoffload);
 
 int tc_setup_flow_action(struct flow_action *flow_action,
-			 const struct tcf_exts *exts)
+			 const struct tcf_exts *exts, bool rtnl_held)
 {
 	const struct tc_action *act;
-	int i, j, k;
+	int i, j, k, err = 0;
 
 	if (!exts)
 		return 0;
 
+	if (!rtnl_held)
+		rtnl_lock();
+
 	j = 0;
 	tcf_exts_for_each_action(i, act, exts) {
 		struct flow_action_entry *entry;
@@ -3318,6 +3321,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 				entry->vlan.prio = tcf_vlan_push_prio(act);
 				break;
 			default:
+				err = -EOPNOTSUPP;
 				goto err_out;
 			}
 		} else if (is_tcf_tunnel_set(act)) {
@@ -3335,6 +3339,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 					entry->id = FLOW_ACTION_ADD;
 					break;
 				default:
+					err = -EOPNOTSUPP;
 					goto err_out;
 				}
 				entry->mangle.htype = tcf_pedit_htype(act, k);
@@ -3393,15 +3398,19 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 			entry->id = FLOW_ACTION_PTYPE;
 			entry->ptype = tcf_skbedit_ptype(act);
 		} else {
+			err = -EOPNOTSUPP;
 			goto err_out;
 		}
 
 		if (!is_tcf_pedit(act))
 			j++;
 	}
-	return 0;
+
 err_out:
-	return -EOPNOTSUPP;
+	if (!rtnl_held)
+		rtnl_unlock();
+
+	return err;
 }
 EXPORT_SYMBOL(tc_setup_flow_action);
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5cb694469b51..fb305bd45d93 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -452,7 +452,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	cls_flower.rule->match.key = &f->mkey;
 	cls_flower.classid = f->res.classid;
 
-	err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+	err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+				   true);
 	if (err) {
 		kfree(cls_flower.rule);
 		if (skip_sw)
@@ -1819,7 +1820,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 		cls_flower.rule->match.mask = &f->mask->key;
 		cls_flower.rule->match.key = &f->mkey;
 
-		err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+		err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+					   true);
 		if (err) {
 			kfree(cls_flower.rule);
 			if (tc_skip_sw(f->flags)) {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 911d1ea28bb2..3266f25011cc 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	cls_mall.command = TC_CLSMATCHALL_REPLACE;
 	cls_mall.cookie = cookie;
 
-	err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+	err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
 	if (err) {
 		kfree(cls_mall.rule);
 		mall_destroy_hw_filter(tp, head, cookie, NULL);
@@ -300,7 +300,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
 		TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = (unsigned long)head;
 
-	err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+	err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
 	if (err) {
 		kfree(cls_mall.rule);
 		if (add && tc_skip_sw(head->flags)) {
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v3 00/10] Refactor cls hardware offload API to support rtnl-independent drivers
From: Vlad Buslov @ 2019-08-26 13:44 UTC (permalink / raw)
  To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov

Currently, all cls API hardware offloads driver callbacks require caller
to hold rtnl lock when calling them. This patch set introduces new API
that allows drivers to register callbacks that are not dependent on rtnl
lock and unlocked classifiers to offload filters without obtaining rtnl
lock first, which is intended to allow offloading tc rules in parallel.

Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added.
TC rule update handlers (RTM_NEWTFILTER, RTM_DELTFILTER, etc.) are
already registered with this flag and only take rtnl lock when qdisc or
classifier requires it. Classifiers can indicate that their ops
callbacks don't require caller to hold rtnl lock by setting the
TCF_PROTO_OPS_DOIT_UNLOCKED flag. Unlocked implementation of flower
classifier is now upstreamed. However, this implementation still obtains
rtnl lock before calling hardware offloads API.

Implement following cls API changes:

- Introduce new "unlocked_driver_cb" flag to struct flow_block_offload
  to allow registering and unregistering block hardware offload
  callbacks that do not require caller to hold rtnl lock. Drivers that
  doesn't require users of its tc offload callbacks to hold rtnl lock
  sets the flag to true on block bind/unbind. Internally tcf_block is
  extended with additional lockeddevcnt counter that is used to count
  number of devices that require rtnl lock that block is bound to. When
  this counter is zero, tc_setup_cb_*() functions execute callbacks
  without obtaining rtnl lock.

- Extend cls API single hardware rule update tc_setup_cb_call() function
  with tc_setup_cb_add(), tc_setup_cb_replace(), tc_setup_cb_destroy()
  and tc_setup_cb_reoffload() functions. These new APIs are needed to
  move management of block offload counter, filter in hardware counter
  and flag from classifier implementations to cls API, which is now
  responsible for managing them in concurrency-safe manner. Access to
  cb_list from callback execution code is synchronized by obtaining new
  'cb_lock' rw_semaphore in read mode, which allows executing callbacks
  in parallel, but excludes any modifications of data from
  register/unregister code. tcf_block offloads counter type is changed
  to atomic integer to allow updating the counter concurrently.

- Extend classifier ops with new ops->hw_add() and ops->hw_del()
  callbacks which are used to notify unlocked classifiers when filter is
  successfully added or deleted to hardware without releasing cb_lock.
  This is necessary to update classifier state atomically with callback
  list traversal and updating of all relevant counters and allows
  unlocked classifiers to synchronize with concurrent reoffload without
  requiring any changes to driver callback API implementations.

New tc flow_action infrastructure is also modified to allow its user to
execute without rtnl lock protection. Function tc_setup_flow_action() is
modified to conditionally obtain rtnl lock before accessing action
state. Action data that is accessed by reference is either copied or
reference counted to prevent concurrent action overwrite from
deallocating it. New function tc_cleanup_flow_action() is introduced to
cleanup/release all such data obtained by tc_setup_flow_action().

Flower classifier (only unlocked classifier at the moment) is modified
to use new cls hardware offloads API and no longer obtains rtnl lock
before calling it.

Vlad Buslov (10):
  net: sched: protect block offload-related fields with rw_semaphore
  net: sched: change tcf block offload counter type to atomic_t
  net: sched: refactor block offloads counter usage
  net: sched: notify classifier on successful offload add/delete
  net: sched: add API for registering unlocked offload block callbacks
  net: sched: conditionally obtain rtnl lock in cls hw offloads API
  net: sched: take rtnl lock in tc_setup_flow_action()
  net: sched: take reference to action dev before calling offloads
  net: sched: copy tunnel info when setting flow_action entry->tunnel
  net: sched: flower: don't take rtnl lock for cls hw offloads API

 .../net/ethernet/mellanox/mlx5/core/en_main.c |   2 +
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |   3 +
 include/net/flow_offload.h                    |   1 +
 include/net/pkt_cls.h                         |  21 +-
 include/net/sch_generic.h                     |  41 +--
 include/net/tc_act/tc_tunnel_key.h            |  17 +
 net/sched/cls_api.c                           | 343 +++++++++++++++++-
 net/sched/cls_bpf.c                           |  38 +-
 net/sched/cls_flower.c                        | 124 +++----
 net/sched/cls_matchall.c                      |  31 +-
 net/sched/cls_u32.c                           |  29 +-
 11 files changed, 478 insertions(+), 172 deletions(-)

-- 
2.21.0

^ permalink raw reply

* [PATCH net-next v3 09/10] net: sched: copy tunnel info when setting flow_action entry->tunnel
From: Vlad Buslov @ 2019-08-26 13:45 UTC (permalink / raw)
  To: netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Vlad Buslov, Jiri Pirko
In-Reply-To: <20190826134506.9705-1-vladbu@mellanox.com>

In order to remove dependency on rtnl lock, modify tc_setup_flow_action()
to copy tunnel info, instead of just saving pointer to tunnel_key action
tunnel info. This is necessary to prevent concurrent action overwrite from
releasing tunnel info while it is being used by rtnl-unlocked driver.

Implement helper tcf_tunnel_info_copy() that is used to copy tunnel info
with all its options to dynamically allocated memory block. Modify
tc_cleanup_flow_action() to free dynamically allocated tunnel info.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/tc_act/tc_tunnel_key.h | 17 +++++++++++++++++
 net/sched/cls_api.c                |  9 ++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 7c3f777c168c..0689d9bcdf84 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -59,4 +59,21 @@ static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a)
 	return NULL;
 #endif
 }
+
+static inline struct ip_tunnel_info *
+tcf_tunnel_info_copy(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct ip_tunnel_info *tun = tcf_tunnel_info(a);
+
+	if (tun) {
+		size_t tun_size = sizeof(*tun) + tun->options_len;
+		struct ip_tunnel_info *tun_copy = kmemdup(tun, tun_size,
+							  GFP_KERNEL);
+
+		return tun_copy;
+	}
+#endif
+	return NULL;
+}
 #endif /* __NET_TC_TUNNEL_KEY_H */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d988737693e4..671ca905dbb5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3279,6 +3279,9 @@ void tc_cleanup_flow_action(struct flow_action *flow_action)
 			if (entry->dev)
 				dev_put(entry->dev);
 			break;
+		case FLOW_ACTION_TUNNEL_ENCAP:
+			kfree(entry->tunnel);
+			break;
 		default:
 			break;
 		}
@@ -3355,7 +3358,11 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 			}
 		} else if (is_tcf_tunnel_set(act)) {
 			entry->id = FLOW_ACTION_TUNNEL_ENCAP;
-			entry->tunnel = tcf_tunnel_info(act);
+			entry->tunnel = tcf_tunnel_info_copy(act);
+			if (!entry->tunnel) {
+				err = -ENOMEM;
+				goto err_out;
+			}
 		} else if (is_tcf_tunnel_release(act)) {
 			entry->id = FLOW_ACTION_TUNNEL_DECAP;
 		} else if (is_tcf_pedit(act)) {
-- 
2.21.0


^ permalink raw reply related

* BUG_ON in skb_segment, after bpf_skb_change_proto was applied
From: Shmulik Ladkani @ 2019-08-26 14:07 UTC (permalink / raw)
  To: Daniel Borkmann, Eric Dumazet, netdev, Alexander Duyck
  Cc: Alexei Starovoitov, Yonghong Song, Steffen Klassert, shmulik,
	eyal

Hi,

In our production systems, running v4.19.y longterm kernels, we hit a
BUG_ON in 'skb_segment()'. It occurs rarely and although tried, couldn't
synthetically reproduce.

In v4.19.41 it crashes at net/core/skbuff.c:3711

		while (pos < offset + len) {
			if (i >= nfrags) {
				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;
				if (!skb_headlen(list_skb)) {
					BUG_ON(!nfrags);
				} else {
3711:					BUG_ON(!list_skb->head_frag);

With the accompanying dump:

 kernel BUG at net/core/skbuff.c:3711!
 invalid opcode: 0000 [#1] SMP PTI
 CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 4.19.41-041941-generic #201905080231
 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016
 RIP: 0010:skb_segment+0xb65/0xda9
 Code: 89 44 24 60 48 89 4c 24 70 e8 87 b3 ff ff 48 8b 4c 24 70 44 8b 44 24 60 85 c0 44 8b 54 24 4c 0f 84 fc fb ff ff e9 16 fd ff ff <0f> 0b 29 c1 89 ce 09 ca e9 61 ff ff ff 0f 0b 41 8b bf 84 00 00 00
 RSP: 0018:ffff9e4d79b037c0 EFLAGS: 00010246
 RAX: ffff9e4d75012ec0 RBX: ffff9e4d74067500 RCX: 0000000000000000
 RDX: 0000000000480020 RSI: 0000000000000000 RDI: ffff9e4d74e3a200
 RBP: ffff9e4d79b03898 R08: 0000000000000564 R09: f69d84ecbfe8b972
 R10: 0000000000000571 R11: a6b66a32f69d84ec R12: 0000000000000564
 R13: ffff9e4c18d03ef0 R14: 0000000000000000 R15: ffff9e4d74e3a200
 FS:  0000000000000000(0000) GS:ffff9e4d79b00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00000000007f50d8 CR3: 000000009420a003 CR4: 00000000001606e0
 Call Trace:
  <IRQ>
  tcp_gso_segment+0xf9/0x4e0
  tcp6_gso_segment+0x5e/0x100
  ipv6_gso_segment+0x112/0x340
  skb_mac_gso_segment+0xb9/0x130
  __skb_gso_segment+0x84/0x190
  validate_xmit_skb+0x14a/0x2f0
  validate_xmit_skb_list+0x4b/0x70
  sch_direct_xmit+0x154/0x390
  __dev_queue_xmit+0x808/0x920
  dev_queue_xmit+0x10/0x20
  neigh_direct_output+0x11/0x20
  ip6_finish_output2+0x1b9/0x5b0
  ip6_finish_output+0x13a/0x1b0
  ip6_output+0x6c/0x110
  ? ip6_fragment+0xa40/0xa40
  ip6_forward+0x501/0x810
  ip6_rcv_finish+0x7a/0x90
  ipv6_rcv+0x69/0xe0
  ? nf_hook.part.24+0x10/0x10
  __netif_receive_skb_core+0x4fa/0xc80
  ? netif_receive_skb_core+0x20/0x20
  ? netif_receive_skb_internal+0x45/0xf0
  ? tcp4_gro_complete+0x86/0x90
  ? napi_gro_complete+0x53/0x90
  __netif_receive_skb_one_core+0x3b/0x80
  __netif_receive_skb+0x18/0x60
  process_backlog+0xb3/0x170
  net_rx_action+0x130/0x350
  __do_softirq+0xdc/0x2d4

To our best knowledge, the packet flow leading to this BUG_ON is:

  - ingress on eth0 (veth, gro:on), ipv4 udp encapsulated esp
  - re-ingresss on eth0, after xfrm, decapsulated ipv4 tcp
  - the skb was GROed (skb_is_gso:true)
  - ipv4 forwarding to dummy1, where eBPF nat4-to-6 program is attached
    at TC Egress (calls 'bpf_skb_change_proto()'), then redirect to ingress
    on same device.
    NOTE: 'bpf_skb_proto_4_to_6()' mangles 'shinfo->gso_size'
  - ingress on dummy1, ipv6 tcp
  - ipv6 forwarding
  - egress on tun2 (tun device) that calls:
    validate_xmit_skb -> ... -> skb_segment BUG_ON

A similar issue was reported and fixed by Yonghong Song in commit
13acc94eff12 ("net: permit skb_segment on head_frag frag_list skb").

However 13acc94eff12 added "BUG_ON(!list_skb->head_frag)" to line 3711,
and patchwork states:

    This patch addressed the issue by handling skb_headlen(list_skb) != 0
    case properly if list_skb->head_frag is true, which is expected in
    most cases. [1]

meaning, 13acc94eff12 does not support list_skb->head_frag=0 case.

Historically, it is claimed that skb_segment is rather intolerant to
gso_size changes, quote:

    Eric suggested to shrink gso_size instead to avoid segmentation+fragments.
    I think its nice idea, but skb_gso_segment makes certain assumptions about
    nr_frags and gso_size (it can't handle frag size > desired mss). [2]

Any suggestions how to debug and fix this?

Could it be that 'bpf_skb_change_proto()' isn't really allowed to
mangle 'gso_size', and we should somehow enforce a 'skb_segment()' call
PRIOR translation?

Appreciate any input and assistance,
Shmulik

[1] https://patchwork.ozlabs.org/patch/889166/
[2] https://patchwork.ozlabs.org/patch/314327/

^ permalink raw reply

* kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup
From: He Zhe @ 2019-08-26 14:42 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, ndesaulniers,
	miguel.ojeda.sandonis, luc.vanoostenryck, schwidefsky, gregkh,
	mst, gor, andreyknvl, jpoimboe, liuxiaozhou, yamada.masahiro,
	linux-kernel, netdev, bpf, He Zhe

Hi All,

Since 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()"),
We have got the following warning,
kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup

If reverting the above commit, we will get the following warning,
kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8b9: sibling call from callable instruction with modified stack frame
if CONFIG_RETPOLINE=n, and no warning if CONFIG_RETPOLINE=y


Zhe

^ permalink raw reply

* Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER
From: Tycho Andersen @ 2019-08-26 14:57 UTC (permalink / raw)
  To: Paul Walmsley
  Cc: David Abdurachmanov, Palmer Dabbelt, Albert Ou, Oleg Nesterov,
	Kees Cook, Andy Lutomirski, Will Drewry, Shuah Khan,
	Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David Abdurachmanov, Thomas Gleixner,
	Allison Randal, Alexios Zavras, Anup Patel, Vincent Chen,
	Alan Kao, linux-riscv, linux-kernel, linux-kselftest, netdev, bpf,
	me
In-Reply-To: <alpine.DEB.2.21.9999.1908231717550.25649@viisi.sifive.com>

Hi,

On Fri, Aug 23, 2019 at 05:30:53PM -0700, Paul Walmsley wrote:
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> 
> > There is one failing kernel selftest: global.user_notification_signal
> 
> Also - could you follow up with the author of this failing test to see if 
> we can get some more clarity about what might be going wrong here?  It 
> appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp: 
> add a return code to trap to userspace") by Tycho Andersen 
> <tycho@tycho.ws>.

Can you post an strace and a cat of /proc/$pid/stack for both tasks
where it gets stuck? I don't have any riscv hardware, and it "works
for me" on x86 and arm64 with 100 tries.

Thanks,

Tycho

^ permalink raw reply

* Re: [PATCH net-next v4 0/6] net: dsa: mv88e6xxx: Peridot/Topaz SERDES changes
From: Vivien Didelot @ 2019-08-26 15:06 UTC (permalink / raw)
  To: Marek Behún
  Cc: netdev, Andrew Lunn, Florian Fainelli, Vladimir Oltean,
	Marek Behún
In-Reply-To: <20190826122109.20660-1-marek.behun@nic.cz>

Hi Marek,

On Mon, 26 Aug 2019 14:21:03 +0200, Marek Behún <marek.behun@nic.cz> wrote:
> Hello,
> 
> this is the fourth version of changes for the Topaz/Peridot family of
> switches. The patches apply on net-next.
> Changes since v3:
>  - there was a mistake in the serdes_get_lane implementations for
>    6390 (patch 3/6). These methods returned -ENODEV if no lane was
>    to be on port, but they should return 0. This is now fixed.
> 
> Tested on Turris Mox with Peridot, Topaz, and Peridot + Topaz.
> 
> Marek
> 
> Marek Behún (6):
>   net: dsa: mv88e6xxx: support 2500base-x in SGMII IRQ handler
>   net: dsa: mv88e6xxx: update code operating on hidden registers
>   net: dsa: mv88e6xxx: create serdes_get_lane chip operation
>   net: dsa: mv88e6xxx: simplify SERDES code for Topaz and Peridot
>   net: dsa: mv88e6xxx: rename port cmode macro
>   net: dsa: mv88e6xxx: fully support SERDES on Topaz family
> 
>  drivers/net/dsa/mv88e6xxx/Makefile      |   1 +
>  drivers/net/dsa/mv88e6xxx/chip.c        |  88 +++-----
>  drivers/net/dsa/mv88e6xxx/chip.h        |   3 +
>  drivers/net/dsa/mv88e6xxx/port.c        |  98 ++++++---
>  drivers/net/dsa/mv88e6xxx/port.h        |  30 ++-
>  drivers/net/dsa/mv88e6xxx/port_hidden.c |  70 ++++++
>  drivers/net/dsa/mv88e6xxx/serdes.c      | 275 +++++++++++-------------
>  drivers/net/dsa/mv88e6xxx/serdes.h      |  27 ++-
>  8 files changed, 333 insertions(+), 259 deletions(-)
>  create mode 100644 drivers/net/dsa/mv88e6xxx/port_hidden.c

The series causes no issues on my Dev C with two 88E6390Xs. If Andrew has
no complaints about the functional changes on the SERDES code, this LGTM:

Tested-by: Vivien Didelot <vivien.didelot@gmail.com>


Thanks,

	Vivien

^ permalink raw reply

* Re: kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup
From: Greg KH @ 2019-08-26 15:11 UTC (permalink / raw)
  To: He Zhe
  Cc: ast, daniel, kafai, songliubraving, yhs, ndesaulniers,
	miguel.ojeda.sandonis, luc.vanoostenryck, schwidefsky, mst, gor,
	andreyknvl, jpoimboe, liuxiaozhou, yamada.masahiro, linux-kernel,
	netdev, bpf
In-Reply-To: <cf0273fb-c272-72be-50f9-b25bb7c7f183@windriver.com>

On Mon, Aug 26, 2019 at 10:42:53PM +0800, He Zhe wrote:
> Hi All,
> 
> Since 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()"),
> We have got the following warning,
> kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup
> 
> If reverting the above commit, we will get the following warning,
> kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8b9: sibling call from callable instruction with modified stack frame
> if CONFIG_RETPOLINE=n, and no warning if CONFIG_RETPOLINE=y

Do you see this same problem on 5.3-rc6?

And what version of gcc are you using?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH net-next v4 1/6] net: dsa: mv88e6xxx: support 2500base-x in SGMII IRQ handler
From: Andrew Lunn @ 2019-08-26 15:13 UTC (permalink / raw)
  To: Marek Behún
  Cc: netdev, Vivien Didelot, Florian Fainelli, Vladimir Oltean
In-Reply-To: <20190826122109.20660-2-marek.behun@nic.cz>

On Mon, Aug 26, 2019 at 02:21:04PM +0200, Marek Behún wrote:
> The mv88e6390_serdes_irq_link_sgmii IRQ handler reads the SERDES PHY
> status register to determine speed, among other things. If cmode of the
> port is set to 2500base-x, though, the PHY still reports 1000 Mbps (the
> PHY register itself does not differentiate between 1000 Mbps and 2500
> Mbps - it thinks it is running at 1000 Mbps, although clock is 2.5x
> faster).
> Look at the cmode and set SPEED_2500 if cmode is set to 2500base-x.
> Also tell mv88e6xxx_port_setup_mac the PHY interface mode corresponding
> to current cmode in terms of phy_interface_t.
> 
> Signed-off-by: Marek Behún <marek.behun@nic.cz>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup
From: Josh Poimboeuf @ 2019-08-26 15:18 UTC (permalink / raw)
  To: He Zhe
  Cc: ast, daniel, kafai, songliubraving, yhs, ndesaulniers,
	miguel.ojeda.sandonis, luc.vanoostenryck, schwidefsky, gregkh,
	mst, gor, andreyknvl, liuxiaozhou, yamada.masahiro, linux-kernel,
	netdev, bpf
In-Reply-To: <cf0273fb-c272-72be-50f9-b25bb7c7f183@windriver.com>

On Mon, Aug 26, 2019 at 10:42:53PM +0800, He Zhe wrote:
> Hi All,
> 
> Since 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()"),
> We have got the following warning,
> kernel/bpf/core.o: warning: objtool: ___bpf_prog_run.cold()+0x7: call without frame pointer save/setup
> 
> If reverting the above commit, we will get the following warning,
> kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8b9: sibling call from callable instruction with modified stack frame
> if CONFIG_RETPOLINE=n, and no warning if CONFIG_RETPOLINE=y

Can you please share the following:

- core.o file

The following would also be helpful for me to try to recreate it:

- config file
- compiler version
- kernel version

-- 
Josh

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox