Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v3 10/11] phy: add driver for Microsemi Ocelot SerDes muxing
From: Quentin Schulz @ 2018-09-14  8:16 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, kishon, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, Quentin Schulz
In-Reply-To: <cover.ff40d591b548a6da31716e6e600f11a303e0e643.1536912834.git-series.quentin.schulz@bootlin.com>

The Microsemi Ocelot can mux SerDes lanes (aka macros) to different
switch ports or even make it act as a PCIe interface.

This adds support for the muxing of the SerDes.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/phy/Kconfig                  |   1 +-
 drivers/phy/Makefile                 |   1 +-
 drivers/phy/mscc/Kconfig             |  11 +-
 drivers/phy/mscc/Makefile            |   5 +-
 drivers/phy/mscc/phy-ocelot-serdes.c | 288 ++++++++++++++++++++++++++++-
 5 files changed, 306 insertions(+)
 create mode 100644 drivers/phy/mscc/Kconfig
 create mode 100644 drivers/phy/mscc/Makefile
 create mode 100644 drivers/phy/mscc/phy-ocelot-serdes.c

diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig
index 5c8d452..c89d3ef 100644
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -48,6 +48,7 @@ source "drivers/phy/lantiq/Kconfig"
 source "drivers/phy/marvell/Kconfig"
 source "drivers/phy/mediatek/Kconfig"
 source "drivers/phy/motorola/Kconfig"
+source "drivers/phy/mscc/Kconfig"
 source "drivers/phy/qualcomm/Kconfig"
 source "drivers/phy/ralink/Kconfig"
 source "drivers/phy/renesas/Kconfig"
diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile
index 84e3bd9..ce8339f 100644
--- a/drivers/phy/Makefile
+++ b/drivers/phy/Makefile
@@ -18,6 +18,7 @@ obj-y					+= broadcom/	\
 					   hisilicon/	\
 					   marvell/	\
 					   motorola/	\
+					   mscc/	\
 					   qualcomm/	\
 					   ralink/	\
 					   samsung/	\
diff --git a/drivers/phy/mscc/Kconfig b/drivers/phy/mscc/Kconfig
new file mode 100644
index 0000000..2e2a466
--- /dev/null
+++ b/drivers/phy/mscc/Kconfig
@@ -0,0 +1,11 @@
+#
+# Phy drivers for Microsemi devices
+#
+
+config PHY_OCELOT_SERDES
+	tristate "SerDes PHY driver for Microsemi Ocelot"
+	select GENERIC_PHY
+	depends on OF
+	depends on MFD_SYSCON
+	help
+	  Enable this for supporting SerDes muxing with Microsemi Ocelot.
diff --git a/drivers/phy/mscc/Makefile b/drivers/phy/mscc/Makefile
new file mode 100644
index 0000000..e147491
--- /dev/null
+++ b/drivers/phy/mscc/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Microsemi phy drivers.
+#
+
+obj-$(CONFIG_PHY_OCELOT_SERDES) := phy-ocelot-serdes.o
diff --git a/drivers/phy/mscc/phy-ocelot-serdes.c b/drivers/phy/mscc/phy-ocelot-serdes.c
new file mode 100644
index 0000000..c2d34cb
--- /dev/null
+++ b/drivers/phy/mscc/phy-ocelot-serdes.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * SerDes PHY driver for Microsemi Ocelot
+ *
+ * Copyright (c) 2018 Microsemi
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <soc/mscc/ocelot_hsio.h>
+#include <dt-bindings/phy/phy-ocelot-serdes.h>
+
+struct serdes_ctrl {
+	struct regmap		*regs;
+	struct device		*dev;
+	struct phy		*phys[SERDES_MAX];
+};
+
+struct serdes_macro {
+	u8			idx;
+	/* Not used when in QSGMII or PCIe mode */
+	int			port;
+	struct serdes_ctrl	*ctrl;
+};
+
+#define MCB_S1G_CFG_TIMEOUT     50
+
+static int __serdes_write_mcb_s1g(struct regmap *regmap, u8 macro, u32 op)
+{
+	unsigned int regval;
+
+	regmap_write(regmap, HSIO_MCB_S1G_ADDR_CFG, op |
+		     HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR(BIT(macro)));
+
+	return regmap_read_poll_timeout(regmap, HSIO_MCB_S1G_ADDR_CFG, regval,
+					(regval & op) != op, 100,
+					MCB_S1G_CFG_TIMEOUT * 1000);
+}
+
+static int serdes_commit_mcb_s1g(struct regmap *regmap, u8 macro)
+{
+	return __serdes_write_mcb_s1g(regmap, macro,
+		HSIO_MCB_S1G_ADDR_CFG_SERDES1G_WR_ONE_SHOT);
+}
+
+static int serdes_update_mcb_s1g(struct regmap *regmap, u8 macro)
+{
+	return __serdes_write_mcb_s1g(regmap, macro,
+		HSIO_MCB_S1G_ADDR_CFG_SERDES1G_RD_ONE_SHOT);
+}
+
+static int serdes_init_s1g(struct regmap *regmap, u8 serdes)
+{
+	int ret;
+
+	ret = serdes_update_mcb_s1g(regmap, serdes);
+	if (ret)
+		return ret;
+
+	regmap_update_bits(regmap, HSIO_S1G_COMMON_CFG,
+			   HSIO_S1G_COMMON_CFG_SYS_RST |
+			   HSIO_S1G_COMMON_CFG_ENA_LANE |
+			   HSIO_S1G_COMMON_CFG_ENA_ELOOP |
+			   HSIO_S1G_COMMON_CFG_ENA_FLOOP,
+			   HSIO_S1G_COMMON_CFG_ENA_LANE);
+
+	regmap_update_bits(regmap, HSIO_S1G_PLL_CFG,
+			   HSIO_S1G_PLL_CFG_PLL_FSM_ENA |
+			   HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_M,
+			   HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA(200) |
+			   HSIO_S1G_PLL_CFG_PLL_FSM_ENA);
+
+	regmap_update_bits(regmap, HSIO_S1G_MISC_CFG,
+			   HSIO_S1G_MISC_CFG_DES_100FX_CPMD_ENA |
+			   HSIO_S1G_MISC_CFG_LANE_RST,
+			   HSIO_S1G_MISC_CFG_LANE_RST);
+
+	ret = serdes_commit_mcb_s1g(regmap, serdes);
+	if (ret)
+		return ret;
+
+	regmap_update_bits(regmap, HSIO_S1G_COMMON_CFG,
+			   HSIO_S1G_COMMON_CFG_SYS_RST,
+			   HSIO_S1G_COMMON_CFG_SYS_RST);
+
+	regmap_update_bits(regmap, HSIO_S1G_MISC_CFG,
+			   HSIO_S1G_MISC_CFG_LANE_RST, 0);
+
+	ret = serdes_commit_mcb_s1g(regmap, serdes);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+struct serdes_mux {
+	u8			idx;
+	u8			port;
+	enum phy_mode		mode;
+	u32			mask;
+	u32			mux;
+};
+
+#define SERDES_MUX(_idx, _port, _mode, _mask, _mux) {	\
+	.idx = _idx,						\
+	.port = _port,						\
+	.mode = _mode,						\
+	.mask = _mask,						\
+	.mux = _mux,						\
+}
+
+static const struct serdes_mux ocelot_serdes_muxes[] = {
+	SERDES_MUX(SERDES1G_0, 0, PHY_MODE_SGMII, 0, 0),
+	SERDES_MUX(SERDES1G_1, 1, PHY_MODE_SGMII, HSIO_HW_CFG_DEV1G_5_MODE, 0),
+	SERDES_MUX(SERDES1G_1, 5, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA |
+		   HSIO_HW_CFG_DEV1G_5_MODE, HSIO_HW_CFG_DEV1G_5_MODE),
+	SERDES_MUX(SERDES1G_2, 2, PHY_MODE_SGMII, HSIO_HW_CFG_DEV1G_4_MODE, 0),
+	SERDES_MUX(SERDES1G_2, 4, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA |
+		   HSIO_HW_CFG_DEV1G_4_MODE, HSIO_HW_CFG_DEV1G_4_MODE),
+	SERDES_MUX(SERDES1G_3, 3, PHY_MODE_SGMII, HSIO_HW_CFG_DEV1G_6_MODE, 0),
+	SERDES_MUX(SERDES1G_3, 6, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA |
+		   HSIO_HW_CFG_DEV1G_6_MODE, HSIO_HW_CFG_DEV1G_6_MODE),
+	SERDES_MUX(SERDES1G_4, 4, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA |
+		   HSIO_HW_CFG_DEV1G_4_MODE | HSIO_HW_CFG_DEV1G_9_MODE, 0),
+	SERDES_MUX(SERDES1G_4, 9, PHY_MODE_SGMII, HSIO_HW_CFG_DEV1G_4_MODE |
+		   HSIO_HW_CFG_DEV1G_9_MODE, HSIO_HW_CFG_DEV1G_4_MODE |
+		   HSIO_HW_CFG_DEV1G_9_MODE),
+	SERDES_MUX(SERDES1G_5, 5, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA |
+		   HSIO_HW_CFG_DEV1G_5_MODE | HSIO_HW_CFG_DEV2G5_10_MODE, 0),
+	SERDES_MUX(SERDES1G_5, 10, PHY_MODE_SGMII, HSIO_HW_CFG_PCIE_ENA |
+		   HSIO_HW_CFG_DEV1G_5_MODE | HSIO_HW_CFG_DEV2G5_10_MODE,
+		   HSIO_HW_CFG_DEV1G_5_MODE | HSIO_HW_CFG_DEV2G5_10_MODE),
+	SERDES_MUX(SERDES6G_0, 4, PHY_MODE_QSGMII, HSIO_HW_CFG_QSGMII_ENA,
+		   HSIO_HW_CFG_QSGMII_ENA),
+	SERDES_MUX(SERDES6G_0, 5, PHY_MODE_QSGMII, HSIO_HW_CFG_QSGMII_ENA,
+		   HSIO_HW_CFG_QSGMII_ENA),
+	SERDES_MUX(SERDES6G_0, 6, PHY_MODE_QSGMII, HSIO_HW_CFG_QSGMII_ENA,
+		   HSIO_HW_CFG_QSGMII_ENA),
+	SERDES_MUX(SERDES6G_0, 7, PHY_MODE_SGMII, HSIO_HW_CFG_QSGMII_ENA, 0),
+	SERDES_MUX(SERDES6G_0, 7, PHY_MODE_QSGMII, HSIO_HW_CFG_QSGMII_ENA,
+		   HSIO_HW_CFG_QSGMII_ENA),
+	SERDES_MUX(SERDES6G_1, 8, PHY_MODE_SGMII, 0, 0),
+	SERDES_MUX(SERDES6G_2, 10, PHY_MODE_SGMII, HSIO_HW_CFG_PCIE_ENA |
+		   HSIO_HW_CFG_DEV2G5_10_MODE, 0),
+	SERDES_MUX(SERDES6G_2, 10, PHY_MODE_PCIE, HSIO_HW_CFG_PCIE_ENA,
+		   HSIO_HW_CFG_PCIE_ENA),
+};
+
+static int serdes_set_mode(struct phy *phy, enum phy_mode mode)
+{
+	struct serdes_macro *macro = phy_get_drvdata(phy);
+	int ret, i;
+
+	for (i = 0; i < ARRAY_SIZE(ocelot_serdes_muxes); i++) {
+		if (macro->idx != ocelot_serdes_muxes[i].idx ||
+		    mode != ocelot_serdes_muxes[i].mode)
+			continue;
+
+		if (mode != PHY_MODE_QSGMII &&
+		    macro->port != ocelot_serdes_muxes[i].port)
+			continue;
+
+		ret = regmap_update_bits(macro->ctrl->regs, HSIO_HW_CFG,
+					 ocelot_serdes_muxes[i].mask,
+					 ocelot_serdes_muxes[i].mux);
+		if (ret)
+			return ret;
+
+		if (macro->idx < SERDES1G_MAX)
+			return serdes_init_s1g(macro->ctrl->regs, macro->idx);
+
+		/* SERDES6G and PCIe not supported yet */
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static const struct phy_ops serdes_ops = {
+	.set_mode	= serdes_set_mode,
+	.owner		= THIS_MODULE,
+};
+
+static struct phy *serdes_simple_xlate(struct device *dev,
+				       struct of_phandle_args *args)
+{
+	struct serdes_ctrl *ctrl = dev_get_drvdata(dev);
+	int port, idx, i;
+
+	if (args->args_count != 2)
+		return ERR_PTR(-EINVAL);
+
+	port = args->args[0];
+	idx = args->args[1];
+
+	for (i = 0; i < SERDES_MAX; i++) {
+		struct serdes_macro *macro = phy_get_drvdata(ctrl->phys[i]);
+
+		if (idx != macro->idx)
+			continue;
+
+		/* SERDES6G_0 is the only SerDes capable of QSGMII */
+		if (idx != SERDES6G_0 && macro->port >= 0)
+			return ERR_PTR(-EBUSY);
+
+		macro->port = port;
+		return ctrl->phys[i];
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+static int serdes_phy_create(struct serdes_ctrl *ctrl, u8 idx, struct phy **phy)
+{
+	struct serdes_macro *macro;
+
+	*phy = devm_phy_create(ctrl->dev, NULL, &serdes_ops);
+	if (IS_ERR(*phy))
+		return PTR_ERR(*phy);
+
+	macro = devm_kzalloc(ctrl->dev, sizeof(*macro), GFP_KERNEL);
+	if (!macro)
+		return -ENOMEM;
+
+	macro->idx = idx;
+	macro->ctrl = ctrl;
+	macro->port = -1;
+
+	phy_set_drvdata(*phy, macro);
+
+	return 0;
+}
+
+/* Create one PHY per SerDes macro and register this device as PHY provider. */
+static int serdes_probe(struct platform_device *pdev)
+{
+	struct phy_provider *provider;
+	struct serdes_ctrl *ctrl;
+	int i, ret;
+
+	ctrl = devm_kzalloc(&pdev->dev, sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return -ENOMEM;
+
+	ctrl->dev = &pdev->dev;
+	ctrl->regs = syscon_node_to_regmap(pdev->dev.parent->of_node);
+	if (IS_ERR(ctrl->regs))
+		return PTR_ERR(ctrl->regs);
+
+	/* phys[] has SERDES_MAX entries: indices run 0..SERDES_MAX - 1 */
+	for (i = 0; i < SERDES_MAX; i++) {
+		ret = serdes_phy_create(ctrl, i, &ctrl->phys[i]);
+		if (ret)
+			return ret;
+	}
+
+	dev_set_drvdata(&pdev->dev, ctrl);
+	provider = devm_of_phy_provider_register(ctrl->dev,
+						 serdes_simple_xlate);
+	return PTR_ERR_OR_ZERO(provider);
+}
+
+static const struct of_device_id serdes_ids[] = {
+	{ .compatible = "mscc,vsc7514-serdes", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, serdes_ids);
+
+static struct platform_driver mscc_ocelot_serdes = {
+	.probe		= serdes_probe,
+	.driver		= {
+		.name	= "mscc,ocelot-serdes",
+		.of_match_table = of_match_ptr(serdes_ids),
+	},
+};
+
+module_platform_driver(mscc_ocelot_serdes);
+
+MODULE_AUTHOR("Quentin Schulz <quentin.schulz@bootlin.com>");
+MODULE_DESCRIPTION("SerDes driver for Microsemi Ocelot");
+MODULE_LICENSE("Dual MIT/GPL");
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next v3 11/11] net: mscc: ocelot: make use of SerDes PHYs for handling their configuration
From: Quentin Schulz @ 2018-09-14  8:16 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, kishon, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, Quentin Schulz
In-Reply-To: <cover.ff40d591b548a6da31716e6e600f11a303e0e643.1536912834.git-series.quentin.schulz@bootlin.com>

Previously, the SerDes muxing was hardcoded to a given mode in the MAC
controller driver. Now, the SerDes muxing is configured within the
Device Tree and is enforced in the MAC controller driver so we can have
a lot of different SerDes configurations.

Make use of the SerDes PHYs in the MAC controller to set up the SerDes
according to the SerDes<->switch port mapping and the communication mode
with the Ethernet PHY.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/ethernet/mscc/Kconfig        |  2 +-
 drivers/net/ethernet/mscc/ocelot.c       | 16 +++++++-
 drivers/net/ethernet/mscc/ocelot.h       |  5 +++-
 drivers/net/ethernet/mscc/ocelot_board.c | 50 ++++++++++++++++++++-----
 4 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig
index 36c8462..bcec058 100644
--- a/drivers/net/ethernet/mscc/Kconfig
+++ b/drivers/net/ethernet/mscc/Kconfig
@@ -23,6 +23,8 @@ config MSCC_OCELOT_SWITCH
 config MSCC_OCELOT_SWITCH_OCELOT
 	tristate "Ocelot switch driver on Ocelot"
 	depends on MSCC_OCELOT_SWITCH
+	depends on GENERIC_PHY
+	depends on OF_NET
 	help
 	  This driver supports the Ocelot network switch device as present on
 	  the Ocelot SoCs.
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 1a4f2bb..8f11fdb 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -472,6 +472,7 @@ static int ocelot_port_open(struct net_device *dev)
 {
 	struct ocelot_port *port = netdev_priv(dev);
 	struct ocelot *ocelot = port->ocelot;
+	enum phy_mode phy_mode;
 	int err;
 
 	/* Enable receiving frames on the port, and activate auto-learning of
@@ -482,8 +483,21 @@ static int ocelot_port_open(struct net_device *dev)
 			 ANA_PORT_PORT_CFG_PORTID_VAL(port->chip_port),
 			 ANA_PORT_PORT_CFG, port->chip_port);
 
+	if (port->serdes) {
+		if (port->phy_mode == PHY_INTERFACE_MODE_SGMII)
+			phy_mode = PHY_MODE_SGMII;
+		else
+			phy_mode = PHY_MODE_QSGMII;
+
+		err = phy_set_mode(port->serdes, phy_mode);
+		if (err) {
+			netdev_err(dev, "Could not set mode of SerDes\n");
+			return err;
+		}
+	}
+
 	err = phy_connect_direct(dev, port->phy, &ocelot_port_adjust_link,
-				 PHY_INTERFACE_MODE_NA);
+				 port->phy_mode);
 	if (err) {
 		netdev_err(dev, "Could not attach to PHY\n");
 		return err;
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 3720e51..62c7c8e 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -11,6 +11,8 @@
 #include <linux/bitops.h>
 #include <linux/etherdevice.h>
 #include <linux/if_vlan.h>
+#include <linux/phy.h>
+#include <linux/phy/phy.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 
@@ -453,6 +455,9 @@ struct ocelot_port {
 	u8 vlan_aware;
 
 	u64 *stats;
+
+	phy_interface_t phy_mode;
+	struct phy *serdes;
 };
 
 u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index b7d755b..a26a06e 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -6,6 +6,7 @@
  */
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/of_net.h>
 #include <linux/netdevice.h>
 #include <linux/of_mdio.h>
 #include <linux/of_platform.h>
@@ -247,18 +248,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 	INIT_LIST_HEAD(&ocelot->multicast);
 	ocelot_init(ocelot);
 
-	ocelot_rmw(ocelot, HSIO_HW_CFG_DEV1G_4_MODE |
-		     HSIO_HW_CFG_DEV1G_6_MODE |
-		     HSIO_HW_CFG_DEV1G_9_MODE,
-		     HSIO_HW_CFG_DEV1G_4_MODE |
-		     HSIO_HW_CFG_DEV1G_6_MODE |
-		     HSIO_HW_CFG_DEV1G_9_MODE,
-		     HSIO_HW_CFG);
-
 	for_each_available_child_of_node(ports, portnp) {
 		struct device_node *phy_node;
 		struct phy_device *phy;
 		struct resource *res;
+		struct phy *serdes;
+		enum phy_mode phy_mode;
 		void __iomem *regs;
 		char res_name[8];
 		u32 port;
@@ -283,10 +278,45 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 			continue;
 
 		err = ocelot_probe_port(ocelot, port, regs, phy);
-		if (err) {
-			dev_err(&pdev->dev, "failed to probe ports\n");
+		if (err)
+			return err;
+
+		err = of_get_phy_mode(portnp);
+		if (err < 0)
+			ocelot->ports[port]->phy_mode = PHY_INTERFACE_MODE_NA;
+		else
+			ocelot->ports[port]->phy_mode = err;
+
+		switch (ocelot->ports[port]->phy_mode) {
+		case PHY_INTERFACE_MODE_NA:
+			continue;
+		case PHY_INTERFACE_MODE_SGMII:
+			phy_mode = PHY_MODE_SGMII;
+			break;
+		case PHY_INTERFACE_MODE_QSGMII:
+			phy_mode = PHY_MODE_QSGMII;
+			break;
+		default:
+			dev_err(ocelot->dev,
+				"invalid phy mode for port%d, (Q)SGMII only\n",
+				port);
+			return -EINVAL;
+		}
+
+		serdes = devm_of_phy_get(ocelot->dev, portnp, NULL);
+		if (IS_ERR(serdes)) {
+			err = PTR_ERR(serdes);
+			if (err == -EPROBE_DEFER) {
+				dev_dbg(ocelot->dev, "deferring probe\n");
+				goto err_probe_ports;
+			}
+
+			dev_err(ocelot->dev, "missing SerDes phys for port%d\n",
+				port);
 			goto err_probe_ports;
 		}
+
+		ocelot->ports[port]->serdes = serdes;
 	}
 
 	register_netdevice_notifier(&ocelot_netdevice_nb);
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 0/6] fix SegmentSmack in stable branch (CVE-2018-5390)
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan

There are five patches fixing CVE-2018-5390 in the latest mainline
branch, but only two of them exist in stable 4.4 and 3.18:
dc6ae4d tcp: detect malicious patterns in tcp_collapse_ofo_queue()
5fbec48 tcp: avoid collapses in tcp_prune_queue() if possible
I tested the stable 4.4 kernel and found that the CPU usage was still very high,
so I think these two patches alone cannot fix CVE-2018-5390.
test results:
with fix patch:      78.2%   ksoftirqd
without fix patch:   90%     ksoftirqd

Then I tried to imitate 72cd43ba (tcp: free batches of packets in tcp_prune_ofo_queue()),
dropping at least 12.5 % of sk_rcvbuf to avoid malicious attacks, with the simple queue
instead of an RB tree. The result was not very good.

Analysing the stable 4.4 code and debugging the system
shows that searching the ofo_queue (TCP ofo uses a simple queue there) costs more cycles.

So I backported "tcp: use an RB tree for ooo receive queue", which uses an RB tree
instead of the simple queue, and then backported Eric Dumazet's 5 fix patches from mainline.
The good news is that ksoftirqd drops to about 20%, which is now the same as mainline.

Stable 4.4 has already backported two patches:
f4a3313d (tcp: avoid collapses in tcp_prune_queue() if possible)
3d4bf93a (tcp: detect malicious patterns in tcp_collapse_ofo_queue())
If we want to change the simple queue to an RB tree to finally resolve this, we should first
apply the earlier patch 9f5afeae (tcp: use an RB tree for ooo receive queue), but 9f5afeae has
many conflicts with 3d4bf93a and f4a3313d, which are part of Eric's patch series in
mainline to fix CVE-2018-5390. So I first needed to revert part of the patches in stable 4.4,
then apply 9f5afeae, and then reapply the five patches from Eric.

V1->V2:
1) Don't revert 3d4bf93a and f4a3313d first; all 6 patches are based on 4.4.155.
2) Add one bug fix patch for the RB tree: 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a tcp: fix a stale ooo_last_skb

Eric Dumazet (5):
  tcp: increment sk_drops for dropped rx packets
  tcp: fix a stale ooo_last_skb after a replace
  tcp: free batches of packets in tcp_prune_ofo_queue()
  tcp: call tcp_drop() from tcp_data_queue_ofo()
  tcp: add tcp_ooo_try_coalesce() helper

Yaogong Wang (1):
  tcp: use an RB tree for ooo receive queue

 include/linux/skbuff.h   |   8 +
 include/linux/tcp.h      |   7 +-
 include/net/sock.h       |   7 +
 include/net/tcp.h        |   2 +-
 net/core/skbuff.c        |  19 +++
 net/ipv4/tcp.c           |   4 +-
 net/ipv4/tcp_input.c     | 417 +++++++++++++++++++++++++++++------------------
 net/ipv4/tcp_ipv4.c      |   3 +-
 net/ipv4/tcp_minisocks.c |   1 -
 net/ipv6/tcp_ipv6.c      |   1 +
 10 files changed, 297 insertions(+), 172 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH stable 4.4 V2 1/6] tcp: increment sk_drops for dropped rx packets
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 532182cd610782db8c18230c2747626562032205 ]

Now ss can report sk_drops, we can instruct TCP to increment
this per socket counter when it drops an incoming frame, to refine
monitoring and debugging.

Following patch takes care of listeners drops.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 include/net/sock.h   |  7 +++++++
 net/ipv4/tcp_input.c | 33 ++++++++++++++++++++-------------
 net/ipv4/tcp_ipv4.c  |  1 +
 net/ipv6/tcp_ipv6.c  |  1 +
 4 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 3d5ff74..5770757 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2139,6 +2139,13 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 	SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
 }
 
+static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
+{
+	int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+	atomic_add(segs, &sk->sk_drops);
+}
+
 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 			   struct sk_buff *skb);
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c4c6cd..9df4cbb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4296,6 +4296,12 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	__kfree_skb(skb);
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4320,7 +4326,7 @@ static void tcp_ofo_queue(struct sock *sk)
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			continue;
 		}
 		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4372,7 +4378,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 		return;
 	}
 
@@ -4436,7 +4442,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 			/* All the bits are present. Drop. */
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			skb = NULL;
 			tcp_dsack_set(sk, seq, end_seq);
 			goto add_sack;
@@ -4475,7 +4481,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-		__kfree_skb(skb1);
+		tcp_drop(sk, skb1);
 	}
 
 add_sack:
@@ -4558,12 +4564,13 @@ err:
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int eaten = -1;
 	bool fragstolen = false;
+	int eaten = -1;
 
-	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
-		goto drop;
-
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+		__kfree_skb(skb);
+		return;
+	}
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
@@ -4645,7 +4652,7 @@ out_of_window:
 		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 		return;
 	}
 
@@ -5236,7 +5243,7 @@ syn_challenge:
 	return true;
 
 discard:
-	__kfree_skb(skb);
+	tcp_drop(sk, skb);
 	return false;
 }
 
@@ -5454,7 +5461,7 @@ csum_error:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 
 discard:
-	__kfree_skb(skb);
+	tcp_drop(sk, skb);
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
@@ -5684,7 +5691,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			return 0;
 		} else {
 			tcp_send_ack(sk);
@@ -6041,7 +6048,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	if (!queued) {
 discard:
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eeda67c..01715fc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1716,6 +1716,7 @@ discard_it:
 	return 0;
 
 discard_and_relse:
+	sk_drops_add(sk, skb);
 	sock_put(sk);
 	goto discard_it;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 90abe88..d6c1911 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1505,6 +1505,7 @@ discard_it:
 	return 0;
 
 discard_and_relse:
+	sk_drops_add(sk, skb);
 	sock_put(sk);
 	goto discard_it;
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 2/6] tcp: use an RB tree for ooo receive queue
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Yaogong Wang <wygivan@google.com>

[ Upstream commit 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a ]

Over the years, TCP BDP has increased by several orders of magnitude,
and some people are considering to reach the 2 Gbytes limit.

Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000
MSS.

In presence of packet losses (or reorders), TCP stores incoming packets
into an out of order queue, and number of skbs sitting there waiting for
the missing packets to be received can be in the 10^5 range.

Most packets are appended to the tail of this queue, and when
packets can finally be transferred to receive queue, we scan the queue
from its head.

However, in presence of heavy losses, we might have to find an arbitrary
point in this queue, involving a linear scan for every incoming packet,
throwing away cpu caches.

This patch converts it to a RB tree, to get bounded latencies.

Yaogong wrote a preliminary patch about 2 years ago.
Eric did the rebase, added ofo_last_skb cache, polishing and tests.

Tested with network dropping between 1 and 10 % packets, with good
success (about 30 % increase of throughput in stress tests)

Next step would be to also use an RB tree for the write queue at sender
side ;)

Signed-off-by: Yaogong Wang <wygivan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-By: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 include/linux/skbuff.h   |   8 ++
 include/linux/tcp.h      |   7 +-
 include/net/tcp.h        |   2 +-
 net/core/skbuff.c        |  19 +++
 net/ipv4/tcp.c           |   4 +-
 net/ipv4/tcp_input.c     | 358 +++++++++++++++++++++++++++--------------------
 net/ipv4/tcp_ipv4.c      |   2 +-
 net/ipv4/tcp_minisocks.c |   1 -
 8 files changed, 242 insertions(+), 159 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c28bd8b..a490dd7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
+void skb_rbtree_purge(struct rb_root *root);
+
 void *netdev_alloc_frag(unsigned int fragsz);
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 	return __pskb_trim(skb, len);
 }
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 5b6df1a..747404d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -279,10 +279,9 @@ struct tcp_sock {
 	struct sk_buff* lost_skb_hint;
 	struct sk_buff *retransmit_skb_hint;
 
-	/* OOO segments go in this list. Note that socket lock must be held,
-	 * as we do not use sk_buff_head lock.
-	 */
-	struct sk_buff_head	out_of_order_queue;
+	/* OOO segments go in this rbtree. Socket lock must be held. */
+	struct rb_root	out_of_order_queue;
+	struct sk_buff	*ooo_last_skb; /* cache rb_last(out_of_order_queue) */
 
 	/* SACKs data, these 2 need to be together (see tcp_options_write) */
 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6c89238..a99f75e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (skb_queue_empty(&tp->out_of_order_queue) &&
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
 	    tp->rcv_wnd &&
 	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
 	    !tp->urg_data)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 55be076..9703924 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2378,6 +2378,25 @@ void skb_queue_purge(struct sk_buff_head *list)
 EXPORT_SYMBOL(skb_queue_purge);
 
 /**
+ *	skb_rbtree_purge - empty a skb rbtree
+ *	@root: root of the rbtree to empty
+ *
+ *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ *	the list and one reference dropped. This function does not take
+ *	any lock. Synchronization should be handled by the caller (e.g., TCP
+ *	out-of-order queue is protected by the socket lock).
+ */
+void skb_rbtree_purge(struct rb_root *root)
+{
+	struct sk_buff *skb, *next;
+
+	rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
+		kfree_skb(skb);
+
+	*root = RB_ROOT;
+}
+
+/**
  *	skb_queue_head - queue a buffer at the list head
  *	@list: list to use
  *	@newsk: buffer to queue
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5e162b8..b7492aa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	__skb_queue_head_init(&tp->out_of_order_queue);
+	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
 	INIT_LIST_HEAD(&tp->tsq_node);
@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
 	tcp_write_queue_purge(sk);
-	__skb_queue_purge(&tp->out_of_order_queue);
+	skb_rbtree_purge(&tp->out_of_order_queue);
 
 	inet->inet_dport = 0;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9df4cbb..7832d0d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4073,7 +4073,7 @@ static void tcp_fin(struct sock *sk)
 	/* It _is_ possible, that we have something out-of-order _after_ FIN.
 	 * Probably, we should reset in this case. For now drop them.
 	 */
-	__skb_queue_purge(&tp->out_of_order_queue);
+	skb_rbtree_purge(&tp->out_of_order_queue);
 	if (tcp_is_sack(tp))
 		tcp_sack_reset(&tp->rx_opt);
 	sk_mem_reclaim(sk);
@@ -4233,7 +4233,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	int this_sack;
 
 	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-	if (skb_queue_empty(&tp->out_of_order_queue)) {
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		tp->rx_opt.num_sacks = 0;
 		return;
 	}
@@ -4309,10 +4309,13 @@ static void tcp_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 dsack_high = tp->rcv_nxt;
+	bool fin, fragstolen, eaten;
 	struct sk_buff *skb, *tail;
-	bool fragstolen, eaten;
+	struct rb_node *p;
 
-	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+	p = rb_first(&tp->out_of_order_queue);
+	while (p) {
+		skb = rb_entry(p, struct sk_buff, rbnode);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4322,9 +4325,10 @@ static void tcp_ofo_queue(struct sock *sk)
 				dsack_high = TCP_SKB_CB(skb)->end_seq;
 			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
 		}
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
 
-		__skb_unlink(skb, &tp->out_of_order_queue);
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			tcp_drop(sk, skb);
 			continue;
@@ -4336,12 +4340,19 @@ static void tcp_ofo_queue(struct sock *sk)
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (!eaten)
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-			tcp_fin(sk);
-		if (eaten)
+		else
 			kfree_skb_partial(skb, fragstolen);
+
+		if (unlikely(fin)) {
+			tcp_fin(sk);
+			/* tcp_fin() purges tp->out_of_order_queue,
+			 * so we must end this loop right now.
+			 */
+			break;
+		}
 	}
 }
 
@@ -4371,8 +4382,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node **p, *q, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
+	bool fragstolen;
 
 	tcp_ecn_check_ce(sk, skb);
 
@@ -4387,89 +4400,86 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	inet_csk_schedule_ack(sk);
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+		   tp->rcv_nxt, seq, end_seq);
 
-	skb1 = skb_peek_tail(&tp->out_of_order_queue);
-	if (!skb1) {
+	p = &tp->out_of_order_queue.rb_node;
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		/* Initial out of order segment, build 1 SACK. */
 		if (tcp_is_sack(tp)) {
 			tp->rx_opt.num_sacks = 1;
-			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-			tp->selective_acks[0].end_seq =
-						TCP_SKB_CB(skb)->end_seq;
+			tp->selective_acks[0].start_seq = seq;
+			tp->selective_acks[0].end_seq = end_seq;
 		}
-		__skb_queue_head(&tp->out_of_order_queue, skb);
-		goto end;
-	}
-
-	seq = TCP_SKB_CB(skb)->seq;
-	end_seq = TCP_SKB_CB(skb)->end_seq;
-
-	if (seq == TCP_SKB_CB(skb1)->end_seq) {
-		bool fragstolen;
-
-		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-		} else {
-			tcp_grow_window(sk, skb);
-			kfree_skb_partial(skb, fragstolen);
-			skb = NULL;
-		}
-
-		if (!tp->rx_opt.num_sacks ||
-		    tp->selective_acks[0].end_seq != seq)
-			goto add_sack;
-
-		/* Common case: data arrive in order after hole. */
-		tp->selective_acks[0].end_seq = end_seq;
+		rb_link_node(&skb->rbnode, NULL, p);
+		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+		tp->ooo_last_skb = skb;
 		goto end;
 	}
 
-	/* Find place to insert this segment. */
-	while (1) {
-		if (!after(TCP_SKB_CB(skb1)->seq, seq))
-			break;
-		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-			skb1 = NULL;
-			break;
+	/* In the typical case, we are adding an skb to the end of the list.
+	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+	 */
+	if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+		tcp_grow_window(sk, skb);
+		kfree_skb_partial(skb, fragstolen);
+		skb = NULL;
+		goto add_sack;
+	}
+
+	/* Find place to insert this segment. Handle overlaps on the way. */
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+			p = &parent->rb_left;
+			continue;
 		}
-		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
-	}
 
-	/* Do skb overlap to previous one? */
-	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-			/* All the bits are present. Drop. */
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-			tcp_drop(sk, skb);
-			skb = NULL;
-			tcp_dsack_set(sk, seq, end_seq);
-			goto add_sack;
-		}
-		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-			/* Partial overlap. */
-			tcp_dsack_set(sk, seq,
-				      TCP_SKB_CB(skb1)->end_seq);
-		} else {
-			if (skb_queue_is_first(&tp->out_of_order_queue,
-					       skb1))
-				skb1 = NULL;
-			else
-				skb1 = skb_queue_prev(
-					&tp->out_of_order_queue,
-					skb1);
+		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* All the bits are present. Drop. */
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb);
+				skb = NULL;
+				tcp_dsack_set(sk, seq, end_seq);
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap. */
+				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				/* skb's seq == skb1's seq and skb covers skb1.
+				 * Replace skb1 with skb.
+				 */
+				rb_replace_node(&skb1->rbnode, &skb->rbnode,
+						&tp->out_of_order_queue);
+				tcp_dsack_extend(sk,
+						 TCP_SKB_CB(skb1)->seq,
+						 TCP_SKB_CB(skb1)->end_seq);
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb1);
+				goto add_sack;
+			}
+		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+			goto coalesce_done;
 		}
+		p = &parent->rb_right;
 	}
-	if (!skb1)
-		__skb_queue_head(&tp->out_of_order_queue, skb);
-	else
-		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 
-	/* And clean segments covered by new one as whole. */
-	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+	/* Insert segment into RB tree. */
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
 
+	/* Remove other segments covered by skb. */
+	while ((q = rb_next(&skb->rbnode)) != NULL) {
+		skb1 = rb_entry(q, struct sk_buff, rbnode);
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4477,12 +4487,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 					 end_seq);
 			break;
 		}
-		__skb_unlink(skb1, &tp->out_of_order_queue);
+		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
 		tcp_drop(sk, skb1);
 	}
+	/* If there is no skb after us, we are the last_skb ! */
+	if (!q)
+		tp->ooo_last_skb = skb;
 
 add_sack:
 	if (tcp_is_sack(tp))
@@ -4621,13 +4634,13 @@ queue_and_out:
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
 
-		if (!skb_queue_empty(&tp->out_of_order_queue)) {
+		if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 			tcp_ofo_queue(sk);
 
 			/* RFC2581. 4.2. SHOULD send immediate ACK, when
 			 * gap in queue is filled.
 			 */
-			if (skb_queue_empty(&tp->out_of_order_queue))
+			if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 				inet_csk(sk)->icsk_ack.pingpong = 0;
 		}
 
@@ -4679,48 +4692,76 @@ drop:
 	tcp_data_queue_ofo(sk, skb);
 }
 
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	if (list)
+		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
-					struct sk_buff_head *list)
+					struct sk_buff_head *list,
+					struct rb_root *root)
 {
-	struct sk_buff *next = NULL;
+	struct sk_buff *next = tcp_skb_next(skb, list);
 
-	if (!skb_queue_is_last(list, skb))
-		next = skb_queue_next(list, skb);
+	if (list)
+		__skb_unlink(skb, list);
+	else
+		rb_erase(&skb->rbnode, root);
 
-	__skb_unlink(skb, list);
 	__kfree_skb(skb);
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
 
 	return next;
 }
 
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sk_buff *skb1;
+
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, root);
+}
+
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
  *
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
  *
  * Segments with FIN/SYN are not collapsed (only because this
  * simplifies code)
  */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
-	     struct sk_buff *head, struct sk_buff *tail,
-	     u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
 {
-	struct sk_buff *skb, *n;
+	struct sk_buff *skb = head, *n;
+	struct sk_buff_head tmp;
 	bool end_of_skbs;
 
 	/* First, check that queue is collapsible and find
-	 * the point where collapsing can be useful. */
-	skb = head;
+	 * the point where collapsing can be useful.
+	 */
 restart:
-	end_of_skbs = true;
-	skb_queue_walk_from_safe(list, skb, n) {
-		if (skb == tail)
-			break;
+	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+		n = tcp_skb_next(skb, list);
+
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			skb = tcp_collapse_one(sk, skb, list);
+			skb = tcp_collapse_one(sk, skb, list, root);
 			if (!skb)
 				break;
 			goto restart;
@@ -4738,13 +4779,10 @@ restart:
 			break;
 		}
 
-		if (!skb_queue_is_last(list, skb)) {
-			struct sk_buff *next = skb_queue_next(list, skb);
-			if (next != tail &&
-			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
-				end_of_skbs = false;
-				break;
-			}
+		if (n && n != tail &&
+		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+			end_of_skbs = false;
+			break;
 		}
 
 		/* Decided to skip this, advance start seq. */
@@ -4754,17 +4792,22 @@ restart:
 	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 		return;
 
+	__skb_queue_head_init(&tmp);
+
 	while (before(start, end)) {
 		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
 		struct sk_buff *nskb;
 
 		nskb = alloc_skb(copy, GFP_ATOMIC);
 		if (!nskb)
-			return;
+			break;
 
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_queue_before(list, skb, nskb);
+		if (list)
+			__skb_queue_before(list, skb, nskb);
+		else
+			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
 		skb_set_owner_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
@@ -4782,14 +4825,17 @@ restart:
 				start += size;
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-				skb = tcp_collapse_one(sk, skb, list);
+				skb = tcp_collapse_one(sk, skb, list, root);
 				if (!skb ||
 				    skb == tail ||
 				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
-					return;
+					goto end;
 			}
 		}
 	}
+end:
+	skb_queue_walk_safe(&tmp, skb, n)
+		tcp_rbtree_insert(root, skb);
 }
 
 /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4799,34 +4845,39 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 range_truesize, sum_tiny = 0;
-	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
-	struct sk_buff *head;
+	struct sk_buff *skb, *head;
+	struct rb_node *p;
 	u32 start, end;
 
-	if (!skb)
+	p = rb_first(&tp->out_of_order_queue);
+	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+	if (!skb) {
+		p = rb_last(&tp->out_of_order_queue);
+		/* Note: This is possible p is NULL here. We do not
+		 * use rb_entry_safe(), as ooo_last_skb is valid only
+		 * if rbtree is not empty.
+		 */
+		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
 		return;
-
+	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
 	range_truesize = skb->truesize;
-	head = skb;
-
-	for (;;) {
-		struct sk_buff *next = NULL;
 
-		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
-			next = skb_queue_next(&tp->out_of_order_queue, skb);
-		skb = next;
+	for (head = skb;;) {
+		skb = tcp_skb_next(skb, NULL);
 
-		/* Segment is terminated when we see gap or when
-		 * we are at the end of all the queue. */
+		/* Range is terminated when we see a gap or when
+		 * we are at the queue end.
+		 */
 		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
 			/* Do not attempt collapsing tiny skbs */
 			if (range_truesize != head->truesize ||
 			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
-				tcp_collapse(sk, &tp->out_of_order_queue,
+				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
 					     head, skb, start, end);
 			} else {
 				sum_tiny += range_truesize;
@@ -4834,20 +4885,14 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 					return;
 			}
 
-			head = skb;
-			if (!skb)
-				break;
-			/* Start new segment */
+			goto new_range;
+		}
+
+		range_truesize += skb->truesize;
+		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 			start = TCP_SKB_CB(skb)->seq;
+		if (after(TCP_SKB_CB(skb)->end_seq, end))
 			end = TCP_SKB_CB(skb)->end_seq;
-			range_truesize = skb->truesize;
-		} else {
-			range_truesize += skb->truesize;
-			if (before(TCP_SKB_CB(skb)->seq, start))
-				start = TCP_SKB_CB(skb)->seq;
-			if (after(TCP_SKB_CB(skb)->end_seq, end))
-				end = TCP_SKB_CB(skb)->end_seq;
-		}
 	}
 }
 
@@ -4858,23 +4903,36 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	bool res = false;
+	struct rb_node *node, *prev;
 
-	if (!skb_queue_empty(&tp->out_of_order_queue)) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
-		__skb_queue_purge(&tp->out_of_order_queue);
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		return false;
 
-		/* Reset SACK state.  A conforming SACK implementation will
-		 * do the same at a timeout based retransmit.  When a connection
-		 * is in a sad state like this, we care only about integrity
-		 * of the connection not performance.
-		 */
-		if (tp->rx_opt.sack_ok)
-			tcp_sack_reset(&tp->rx_opt);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+
+	node = &tp->ooo_last_skb->rbnode;
+	do {
+		prev = rb_prev(node);
+		rb_erase(node, &tp->out_of_order_queue);
+		__kfree_skb(rb_to_skb(node));
 		sk_mem_reclaim(sk);
-		res = true;
-	}
-	return res;
+		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+		    !tcp_under_memory_pressure(sk))
+			break;
+
+		node = prev;
+	} while (node);
+	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+
+	/* Reset SACK state.  A conforming SACK implementation will
+	 * do the same at a timeout based retransmit.  When a connection
+	 * is in a sad state like this, we care only about integrity
+	 * of the connection not performance.
+	 */
+	if (tp->rx_opt.sack_ok)
+		tcp_sack_reset(&tp->rx_opt);
+
+	return true;
 }
 
 /* Reduce allocated memory if we can, trying to get
@@ -4902,7 +4960,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		tcp_collapse(sk, &sk->sk_receive_queue,
+		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
 			     skb_peek(&sk->sk_receive_queue),
 			     NULL,
 			     tp->copied_seq, tp->rcv_nxt);
@@ -5007,7 +5065,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
-	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+	    (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
 		/* Then ack it now */
 		tcp_send_ack(sk);
 	} else {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 01715fc..ee8399f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	tcp_write_queue_purge(sk);
 
 	/* Cleans up our, hopefully empty, out_of_order_queue. */
-	__skb_queue_purge(&tp->out_of_order_queue);
+	skb_rbtree_purge(&tp->out_of_order_queue);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Clean up the MD5 key list, if any */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4c1c94f..81c633d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -495,7 +495,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->snd_cwnd_cnt = 0;
 
 		tcp_init_xmit_timers(newsk);
-		__skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
 
 		newtp->rx_opt.saw_tstamp = 0;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 3/6] tcp: fix a stale ooo_last_skb after a replace
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a ]

When an skb replaces another one in the ooo queue, I forgot to also
update tp->ooo_last_skb, if the replaced skb was the last one
in the queue.

To fix this, we simply can re-use the code that runs after an insertion,
trying to merge skbs at the right of current skb.

This not only fixes the bug, but also removes all small skbs that might
be a subset of the new one.

Example:

We receive segments 2001:3001,  4001:5001

Then we receive 2001:8001 : We should replace 2001:3001 with the big
skb, but also remove 4001:5001 from the queue to save space.

packetdrill test demonstrating the bug

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

+0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
+0.100 < . 1:1(0) ack 1 win 1024
+0 accept(3, ..., ...) = 4

+0.01 < . 1001:2001(1000) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 1001:2001>

+0.01 < . 1001:3001(2000) ack 1 win 1024
+0    > . 1:1(0) ack 1 <nop,nop, sack 1001:2001 1001:3001>

Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Yuchung Cheng <ycheng@google.com>
Cc: Yaogong Wang <wygivan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7832d0d..4cc0a53 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4465,7 +4465,7 @@ coalesce_done:
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
 				__kfree_skb(skb1);
-				goto add_sack;
+				goto merge_right;
 			}
 		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 			goto coalesce_done;
@@ -4477,6 +4477,7 @@ coalesce_done:
 	rb_link_node(&skb->rbnode, parent, p);
 	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
 
+merge_right:
 	/* Remove other segments covered by skb. */
 	while ((q = rb_next(&skb->rbnode)) != NULL) {
 		skb1 = rb_entry(q, struct sk_buff, rbnode);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 4/6] tcp: free batches of packets in tcp_prune_ofo_queue()
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ]

Juha-Matti Tilli reported that malicious peers could inject tiny
packets in out_of_order_queue, forcing very expensive calls
to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
every incoming packet. out_of_order_queue rb-tree can contain
thousands of nodes, iterating over all of them is not nice.

Before linux-4.9, we would have pruned all packets in ofo_queue
in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.

Since we plan to increase tcp_rmem[2] in the future to cope with
modern BDP, we can not revert to the old behavior without great pain.

Strategy taken in this patch is to purge ~12.5 % of the queue capacity.

Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 net/ipv4/tcp_input.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4cc0a53..4739a93 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4899,27 +4899,33 @@ new_range:
 
 /*
  * Purge the out-of-order queue.
+ * Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
  * Return true if queue was pruned.
  */
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct rb_node *node, *prev;
+	int goal;
 
 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		return false;
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
+	goal = sk->sk_rcvbuf >> 3;
 	node = &tp->ooo_last_skb->rbnode;
 	do {
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
+		goal -= rb_to_skb(node)->truesize;
 		__kfree_skb(rb_to_skb(node));
-		sk_mem_reclaim(sk);
-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-		    !tcp_under_memory_pressure(sk))
-			break;
+		if (!prev || goal <= 0) {
+			sk_mem_reclaim(sk);
+			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+			    !tcp_under_memory_pressure(sk))
+				break;
+			goal = sk->sk_rcvbuf >> 3;
+		}
 
 		node = prev;
 	} while (node);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 5/6] tcp: call tcp_drop() from tcp_data_queue_ofo()
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ]

In order to be able to give better diagnostics and detect
malicious traffic, we need to have better sk->sk_drops tracking.

Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4739a93..cbe0ca0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4445,7 +4445,7 @@ coalesce_done:
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb);
+				tcp_drop(sk, skb);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4464,7 +4464,7 @@ coalesce_done:
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb1);
+				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
 		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH stable 4.4 V2 6/6] tcp: add tcp_ooo_try_coalesce() helper
From: Mao Wenan @ 2018-09-14  8:24 UTC (permalink / raw)
  To: netdev, gregkh, dwmw2, eric.dumazet, davem, stable, linux-kernel,
	maowenan
In-Reply-To: <1536913450-12380-1-git-send-email-maowenan@huawei.com>

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ]

In case skb in out_of_order_queue is the result of
multiple skbs coalescing, we would like to get a proper gso_segs
counter tracking, so that future tcp_drop() can report an accurate
number.

I chose to not implement this tracking for skbs in receive queue,
since they are not dropped, unless socket is disconnected.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 net/ipv4/tcp_input.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cbe0ca0..1aff93d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4296,6 +4296,23 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+	/* In case tcp_drop() is called later, update to->gso_segs */
+	if (res) {
+		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+	}
+	return res;
+}
+
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
 	sk_drops_add(sk, skb);
@@ -4422,7 +4439,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	/* In the typical case, we are adding an skb to the end of the list.
 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 	 */
-	if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+				 skb, &fragstolen)) {
 coalesce_done:
 		tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
@@ -4467,7 +4485,8 @@ coalesce_done:
 				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
-		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+		} else if (tcp_ooo_try_coalesce(sk, skb1,
+						skb, &fragstolen)) {
 			goto coalesce_done;
 		}
 		p = &parent->rb_right;
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-14  8:24 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Michael S. Tsirkin, David Miller, virtualization,
	Network Development, LKML, Willem de Bruijn
In-Reply-To: <CAF=yD-LSgCEnL_kTH44A8-FH3nut3y=bonf4Bgf75=jd=D64hw@mail.gmail.com>



On 2018年09月13日 23:20, Willem de Bruijn wrote:
> On Thu, Sep 13, 2018 at 1:40 AM Jason Wang <jasowang@redhat.com> wrote:
>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>> Interrupt moderation is currently not supported, so these accept and
>> display the default settings of 0 usec and 1 frame.
>>
>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>> with possible future interrupt moderation, value 1 means tx napi while
>> value 0 means not.
>>
>> To properly synchronize with the data path, tx napi is disabled and
>> tx lock is held when changing the value of napi weight. And two more
>> places that can access tx napi weight:
>>
>> - speculative tx polling in rx napi, we can leave it as is since it
>>    not a must for correctness.
>> - skb_xmit_done(), one more check of napi weight is added before
>>    trying to enable tx to avoid tx to be disabled forever if napi is
>>    disabled after skb_xmit_done() but before the napi
>>
>> Link: https://patchwork.ozlabs.org/patch/948149/
>> Suggested-by: Jason Wang <jasowang@redhat.com>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> Changes from V1:
>> - try to synchronize with datapath to allow changing mode when
>>    interface is up.
>> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
>> ---
>>   drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 63 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 765920905226..6e70864f5899 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>>
>>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>>
>> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
>> +
> This is no longer needed

Yes, will remove this.

>
>>   static const unsigned long guest_offloads[] = {
>>          VIRTIO_NET_F_GUEST_TSO4,
>>          VIRTIO_NET_F_GUEST_TSO6,
>> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>
>>          virtqueue_napi_complete(napi, sq->vq, 0);
>>
>> -       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> +       /* Check napi.weight to avoid tx stall since it could be set
>> +        * to zero by ethtool after skb_xmit_done().
>> +        */
>> +       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>>                  netif_tx_wake_queue(txq);
> I see. This assumes that the napi handler will always be called on
> conversion from napi to no-napi mode.
>
> That is safe to assume because if it isn't called (and will not call
> netif_tx_wake_queue) that implies that napi was not scheduled, and
> thus the tx interrupt was not suppressed and thus there was no tx
> completion work to be scheduled?

If it isn't called, it means skb_xmit_done() wakes up tx directly instead
of scheduling tx. This could be a little bit early since there may
still be lots of pending tx packets. But it does no harm: start_xmit() can
handle this by re-enabling a delayed tx interrupt and disabling TX.

But there's a bug: it looks like I need to remove the check of
(!sq->napi.weight) at the beginning of the function.

>
>>          return 0;
>> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>>          return 0;
>>   }
>>
>> +static int virtnet_set_coalesce(struct net_device *dev,
>> +                               struct ethtool_coalesce *ec)
>> +{
>> +       struct ethtool_coalesce ec_default = {
>> +               .cmd = ETHTOOL_SCOALESCE,
>> +               .rx_max_coalesced_frames = 1,
>> +       };
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +       int i, napi_weight;
>> +
>> +       if (ec->tx_max_coalesced_frames > 1)
>> +               return -EINVAL;
>> +
>> +       ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
>> +       napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
>> +
>> +       /* disallow changes to fields not explicitly tested above */
>> +       if (memcmp(ec, &ec_default, sizeof(ec_default)))
>> +               return -EINVAL;
>> +
>> +       if (napi_weight ^ vi->sq[0].napi.weight) {
>> +               for (i = 0; i < vi->max_queue_pairs; i++) {
>> +                       struct netdev_queue *txq =
>> +                              netdev_get_tx_queue(vi->dev, i);
>> +
>> +                       virtnet_napi_tx_disable(&vi->sq[i].napi);
>> +                       __netif_tx_lock_bh(txq);
>> +                       vi->sq[i].napi.weight = napi_weight;
>> +                       __netif_tx_unlock_bh(txq);
>> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
>> +                                              &vi->sq[i].napi);
>> +               }
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int virtnet_get_coalesce(struct net_device *dev,
>> +                               struct ethtool_coalesce *ec)
>> +{
>> +       struct ethtool_coalesce ec_default = {
>> +               .cmd = ETHTOOL_GCOALESCE,
>> +               .rx_max_coalesced_frames = 1,
>> +               .tx_max_coalesced_frames = 0,
> no need to explicitly initialize to 0 (unless you did this for
> documentation purposes, which is fine).

Yes.

Thanks

>> +       };
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +
>> +       memcpy(ec, &ec_default, sizeof(ec_default));
>> +
>> +       if (vi->sq[0].napi.weight)
>> +               ec->tx_max_coalesced_frames = 1;
>> +
>> +       return 0;
>> +}
>> +
>>   static void virtnet_init_settings(struct net_device *dev)
>>   {
>>          struct virtnet_info *vi = netdev_priv(dev);
>> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>>          .get_ts_info = ethtool_op_get_ts_info,
>>          .get_link_ksettings = virtnet_get_link_ksettings,
>>          .set_link_ksettings = virtnet_set_link_ksettings,
>> +       .set_coalesce = virtnet_set_coalesce,
>> +       .get_coalesce = virtnet_get_coalesce,
>>   };
>>
>>   static void virtnet_freeze_down(struct virtio_device *vdev)
>> --
>> 2.17.1
>>

^ permalink raw reply

* [PATCH net-next 0/5] Various improvements to Microsemi PHY driver
From: Quentin Schulz @ 2018-09-14  8:33 UTC (permalink / raw)
  To: davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Quentin Schulz

The Microsemi PHYs have several counters so let's make them available as PHY
statistics.

The VSC 8530/31/40/41 also need to update their EEE init sequence in order to
avoid packet losses and improve performance.

This patch series also makes some minor cosmetic changes to the driver.

Quentin Schulz (3):
  net: phy: mscc: remove unneeded parenthesis
  net: phy: mscc: shorten `x != 0` condition to `x`
  net: phy: mscc: remove unneeded temporary variable

Raju Lakkaraju (2):
  net: phy: mscc: add ethtool statistics counters
  net: phy: mscc: Add EEE init sequence

 drivers/net/phy/mscc.c | 229 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 202 insertions(+), 27 deletions(-)

base-commit: 00989856964175eafbe1435a70862c2ac66cffc0
-- 
git-series 0.9.1

^ permalink raw reply

* [PATCH net-next 1/5] net: phy: mscc: add ethtool statistics counters
From: Quentin Schulz @ 2018-09-14  8:33 UTC (permalink / raw)
  To: davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Quentin Schulz, Raju Lakkaraju
In-Reply-To: <cover.616d15610d44a0e3d463acd8119859f243163ad2.1536913944.git-series.quentin.schulz@bootlin.com>

From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>

There are a few counters available in the PHY: receive errors, false
carriers, link disconnects, and media CRC error and valid counters.

So let's expose those in the PHY driver.

Use the priv structure as the next PHY to be supported has a few
additional counters.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 128 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 128 insertions(+)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 2d9676d..62d6e0a 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -33,6 +33,11 @@ enum rgmii_rx_clock_delay {
 #define DISABLE_PAIR_SWAP_CORR_MASK	  0x0020
 #define DISABLE_POLARITY_CORR_MASK	  0x0010
 
+#define MSCC_PHY_ERR_RX_CNT		  19
+#define MSCC_PHY_ERR_FALSE_CARRIER_CNT	  20
+#define MSCC_PHY_ERR_LINK_DISCONNECT_CNT  21
+#define ERR_CNT_MASK			  GENMASK(7, 0)
+
 #define MSCC_PHY_EXT_PHY_CNTL_1           23
 #define MAC_IF_SELECTION_MASK             0x1800
 #define MAC_IF_SELECTION_GMII             0
@@ -64,6 +69,9 @@ enum rgmii_rx_clock_delay {
 #define MSCC_PHY_PAGE_EXTENDED_2	  0x0002 /* Extended reg - page 2 */
 
 /* Extended Page 1 Registers */
+#define MSCC_PHY_CU_MEDIA_CRC_VALID_CNT	  18
+#define VALID_CRC_CNT_CRC_MASK		  GENMASK(13, 0)
+
 #define MSCC_PHY_EXT_MODE_CNTL		  19
 #define FORCE_MDI_CROSSOVER_MASK	  0x000C
 #define FORCE_MDI_CROSSOVER_MDIX	  0x000C
@@ -74,6 +82,8 @@ enum rgmii_rx_clock_delay {
 #define DOWNSHIFT_EN			  0x0010
 #define DOWNSHIFT_CNTL_POS		  2
 
+#define MSCC_PHY_EXT_PHY_CNTL_4		  23
+
 /* Extended Page 2 Registers */
 #define MSCC_PHY_RGMII_CNTL		  20
 #define RGMII_RX_CLK_DELAY_MASK		  0x0070
@@ -119,11 +129,50 @@ enum rgmii_rx_clock_delay {
 				BIT(VSC8531_FORCE_LED_OFF) | \
 				BIT(VSC8531_FORCE_LED_ON))
 
+struct vsc85xx_hw_stat {
+	const char *string;
+	u8 reg;
+	u16 page;
+	u16 mask;
+};
+
+static struct vsc85xx_hw_stat vsc85xx_hw_stats[] = {
+	{
+		.string	= "phy_receive_errors",
+		.reg	= MSCC_PHY_ERR_RX_CNT,
+		.page	= MSCC_PHY_PAGE_STANDARD,
+		.mask	= ERR_CNT_MASK,
+	}, {
+		.string	= "phy_false_carrier",
+		.reg	= MSCC_PHY_ERR_FALSE_CARRIER_CNT,
+		.page	= MSCC_PHY_PAGE_STANDARD,
+		.mask	= ERR_CNT_MASK,
+	}, {
+		.string	= "phy_cu_media_link_disconnect",
+		.reg	= MSCC_PHY_ERR_LINK_DISCONNECT_CNT,
+		.page	= MSCC_PHY_PAGE_STANDARD,
+		.mask	= ERR_CNT_MASK,
+	}, {
+		.string	= "phy_cu_media_crc_good_count",
+		.reg	= MSCC_PHY_CU_MEDIA_CRC_VALID_CNT,
+		.page	= MSCC_PHY_PAGE_EXTENDED,
+		.mask	= VALID_CRC_CNT_CRC_MASK,
+	}, {
+		.string	= "phy_cu_media_crc_error_count",
+		.reg	= MSCC_PHY_EXT_PHY_CNTL_4,
+		.page	= MSCC_PHY_PAGE_EXTENDED,
+		.mask	= ERR_CNT_MASK,
+	},
+};
+
 struct vsc8531_private {
 	int rate_magic;
 	u16 supp_led_modes;
 	u32 leds_mode[MAX_LEDS];
 	u8 nleds;
+	struct vsc85xx_hw_stat *hw_stats;
+	u64 *stats;
+	int nstats;
 };
 
 #ifdef CONFIG_OF_MDIO
@@ -148,6 +197,66 @@ static int vsc85xx_phy_page_set(struct phy_device *phydev, u16 page)
 	return rc;
 }
 
+static int vsc85xx_get_sset_count(struct phy_device *phydev)
+{
+	struct vsc8531_private *priv = phydev->priv;
+
+	if (!priv)
+		return 0;
+
+	return priv->nstats;
+}
+
+static void vsc85xx_get_strings(struct phy_device *phydev, u8 *data)
+{
+	struct vsc8531_private *priv = phydev->priv;
+	int i;
+
+	if (!priv)
+		return;
+
+	for (i = 0; i < priv->nstats; i++)
+		strlcpy(data + i * ETH_GSTRING_LEN, priv->hw_stats[i].string,
+			ETH_GSTRING_LEN);
+}
+
+static u64 vsc85xx_get_stat(struct phy_device *phydev, int i)
+{
+	struct vsc8531_private *priv = phydev->priv;
+	int val;
+	u64 ret;
+
+	vsc85xx_phy_page_set(phydev, priv->hw_stats[i].page);
+
+	val = phy_read(phydev, priv->hw_stats[i].reg);
+	if (val < 0) {
+		ret = U64_MAX;
+		goto out;
+	}
+
+	val = val & priv->hw_stats[i].mask;
+	priv->stats[i] += val;
+	ret = priv->stats[i];
+
+out:
+	vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
+
+	return ret;
+}
+
+static void vsc85xx_get_stats(struct phy_device *phydev,
+			      struct ethtool_stats *stats, u64 *data)
+{
+	struct vsc8531_private *priv = phydev->priv;
+	int i;
+
+	if (!priv)
+		return;
+
+	for (i = 0; i < priv->nstats; i++)
+		data[i] = vsc85xx_get_stat(phydev, i);
+}
+
 static int vsc85xx_led_cntl_set(struct phy_device *phydev,
 				u8 led_num,
 				u8 mode)
@@ -673,6 +782,13 @@ static int vsc85xx_probe(struct phy_device *phydev)
 	vsc8531->rate_magic = rate_magic;
 	vsc8531->nleds = 2;
 	vsc8531->supp_led_modes = VSC85XX_SUPP_LED_MODES;
+	vsc8531->hw_stats = vsc85xx_hw_stats;
+	vsc8531->nstats = ARRAY_SIZE(vsc85xx_hw_stats);
+	vsc8531->stats = devm_kzalloc(&phydev->mdio.dev,
+				      sizeof(u64) * vsc8531->nstats,
+				      GFP_KERNEL);
+	if (!vsc8531->stats)
+		return -ENOMEM;
 
 	return vsc85xx_dt_led_modes_get(phydev, default_mode);
 }
@@ -699,6 +815,9 @@ static struct phy_driver vsc85xx_driver[] = {
 	.get_wol	= &vsc85xx_wol_get,
 	.get_tunable	= &vsc85xx_get_tunable,
 	.set_tunable	= &vsc85xx_set_tunable,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
 },
 {
 	.phy_id		= PHY_ID_VSC8531,
@@ -720,6 +839,9 @@ static struct phy_driver vsc85xx_driver[] = {
 	.get_wol	= &vsc85xx_wol_get,
 	.get_tunable	= &vsc85xx_get_tunable,
 	.set_tunable	= &vsc85xx_set_tunable,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
 },
 {
 	.phy_id		= PHY_ID_VSC8540,
@@ -741,6 +863,9 @@ static struct phy_driver vsc85xx_driver[] = {
 	.get_wol	= &vsc85xx_wol_get,
 	.get_tunable	= &vsc85xx_get_tunable,
 	.set_tunable	= &vsc85xx_set_tunable,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
 },
 {
 	.phy_id		= PHY_ID_VSC8541,
@@ -762,6 +887,9 @@ static struct phy_driver vsc85xx_driver[] = {
 	.get_wol	= &vsc85xx_wol_get,
 	.get_tunable	= &vsc85xx_get_tunable,
 	.set_tunable	= &vsc85xx_set_tunable,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
 }
 
 };
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next 3/5] net: phy: mscc: remove unneeded parenthesis
From: Quentin Schulz @ 2018-09-14  8:33 UTC (permalink / raw)
  To: davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Quentin Schulz
In-Reply-To: <cover.616d15610d44a0e3d463acd8119859f243163ad2.1536913944.git-series.quentin.schulz@bootlin.com>

The == operator has higher precedence than the || operator, so we can
remove the parentheses around (a == b) || (c == d).

The condition is rather explicit and short so removing the parentheses
definitely does not make it harder to read.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index c0a9ea9..734d9fb 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -301,7 +301,7 @@ static int vsc85xx_mdix_set(struct phy_device *phydev, u8 mdix)
 	u16 reg_val;
 
 	reg_val = phy_read(phydev, MSCC_PHY_BYPASS_CONTROL);
-	if ((mdix == ETH_TP_MDI) || (mdix == ETH_TP_MDI_X)) {
+	if (mdix == ETH_TP_MDI || mdix == ETH_TP_MDI_X) {
 		reg_val |= (DISABLE_PAIR_SWAP_CORR_MASK |
 			    DISABLE_POLARITY_CORR_MASK  |
 			    DISABLE_HP_AUTO_MDIX_MASK);
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next 4/5] net: phy: mscc: shorten `x != 0` condition to `x`
From: Quentin Schulz @ 2018-09-14  8:33 UTC (permalink / raw)
  To: davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Quentin Schulz
In-Reply-To: <cover.616d15610d44a0e3d463acd8119859f243163ad2.1536913944.git-series.quentin.schulz@bootlin.com>

`if (x != 0)` is basically a more verbose version of `if (x)` so let's
use the latter so it's consistent throughout the whole driver.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 734d9fb..efa9352 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -311,11 +311,11 @@ static int vsc85xx_mdix_set(struct phy_device *phydev, u8 mdix)
 			     DISABLE_HP_AUTO_MDIX_MASK);
 	}
 	rc = phy_write(phydev, MSCC_PHY_BYPASS_CONTROL, reg_val);
-	if (rc != 0)
+	if (rc)
 		return rc;
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
-	if (rc != 0)
+	if (rc)
 		return rc;
 
 	reg_val = phy_read(phydev, MSCC_PHY_EXT_MODE_CNTL);
@@ -325,11 +325,11 @@ static int vsc85xx_mdix_set(struct phy_device *phydev, u8 mdix)
 	else if (mdix == ETH_TP_MDI_X)
 		reg_val |= FORCE_MDI_CROSSOVER_MDIX;
 	rc = phy_write(phydev, MSCC_PHY_EXT_MODE_CNTL, reg_val);
-	if (rc != 0)
+	if (rc)
 		return rc;
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
-	if (rc != 0)
+	if (rc)
 		return rc;
 
 	return genphy_restart_aneg(phydev);
@@ -341,7 +341,7 @@ static int vsc85xx_downshift_get(struct phy_device *phydev, u8 *count)
 	u16 reg_val;
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	reg_val = phy_read(phydev, MSCC_PHY_ACTIPHY_CNTL);
@@ -373,14 +373,14 @@ static int vsc85xx_downshift_set(struct phy_device *phydev, u8 count)
 	}
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	reg_val = phy_read(phydev, MSCC_PHY_ACTIPHY_CNTL);
 	reg_val &= ~(DOWNSHIFT_CNTL_MASK);
 	reg_val |= count;
 	rc = phy_write(phydev, MSCC_PHY_ACTIPHY_CNTL, reg_val);
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
@@ -401,7 +401,7 @@ static int vsc85xx_wol_set(struct phy_device *phydev,
 
 	mutex_lock(&phydev->lock);
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 
 	if (wol->wolopts & WAKE_MAGIC) {
@@ -439,7 +439,7 @@ static int vsc85xx_wol_set(struct phy_device *phydev,
 	phy_write(phydev, MSCC_PHY_WOL_MAC_CONTROL, reg_val);
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 
 	if (wol->wolopts & WAKE_MAGIC) {
@@ -447,14 +447,14 @@ static int vsc85xx_wol_set(struct phy_device *phydev,
 		reg_val = phy_read(phydev, MII_VSC85XX_INT_MASK);
 		reg_val |= MII_VSC85XX_INT_MASK_WOL;
 		rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val);
-		if (rc != 0)
+		if (rc)
 			goto out_unlock;
 	} else {
 		/* Disable the WOL interrupt */
 		reg_val = phy_read(phydev, MII_VSC85XX_INT_MASK);
 		reg_val &= (~MII_VSC85XX_INT_MASK_WOL);
 		rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val);
-		if (rc != 0)
+		if (rc)
 			goto out_unlock;
 	}
 	/* Clear WOL iterrupt status */
@@ -595,13 +595,13 @@ static int vsc85xx_edge_rate_cntl_set(struct phy_device *phydev, u8 edge_rate)
 
 	mutex_lock(&phydev->lock);
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 	reg_val = phy_read(phydev, MSCC_PHY_WOL_MAC_CONTROL);
 	reg_val &= ~(EDGE_RATE_CNTL_MASK);
 	reg_val |= (edge_rate << EDGE_RATE_CNTL_POS);
 	rc = phy_write(phydev, MSCC_PHY_WOL_MAC_CONTROL, reg_val);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
 
@@ -636,7 +636,7 @@ static int vsc85xx_mac_if_set(struct phy_device *phydev,
 		goto out_unlock;
 	}
 	rc = phy_write(phydev, MSCC_PHY_EXT_PHY_CNTL_1, reg_val);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 
 	rc = genphy_soft_reset(phydev);
@@ -655,7 +655,7 @@ static int vsc85xx_default_config(struct phy_device *phydev)
 	phydev->mdix_ctrl = ETH_TP_MDI_AUTO;
 	mutex_lock(&phydev->lock);
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
-	if (rc != 0)
+	if (rc)
 		goto out_unlock;
 
 	reg_val = phy_read(phydev, MSCC_PHY_RGMII_CNTL);
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next 5/5] net: phy: mscc: remove unneeded temporary variable
From: Quentin Schulz @ 2018-09-14  8:33 UTC (permalink / raw)
  To: davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Quentin Schulz
In-Reply-To: <cover.616d15610d44a0e3d463acd8119859f243163ad2.1536913944.git-series.quentin.schulz@bootlin.com>

Here, the rc variable is either used only for the condition right after
the assignment or right before being used as the return value of the
function it's being used in.

So let's remove this unneeded temporary variable whenever possible.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index efa9352..24f4754 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -199,10 +199,7 @@ static const struct vsc8531_edge_rate_table edge_table[] = {
 
 static int vsc85xx_phy_page_set(struct phy_device *phydev, u16 page)
 {
-	int rc;
-
-	rc = phy_write(phydev, MSCC_EXT_PAGE_ACCESS, page);
-	return rc;
+	return phy_write(phydev, MSCC_EXT_PAGE_ACCESS, page);
 }
 
 static int vsc85xx_get_sset_count(struct phy_device *phydev)
@@ -504,7 +501,7 @@ static void vsc85xx_wol_get(struct phy_device *phydev,
 static int vsc85xx_edge_rate_magic_get(struct phy_device *phydev)
 {
 	u32 vdd, sd;
-	int rc, i, j;
+	int i, j;
 	struct device *dev = &phydev->mdio.dev;
 	struct device_node *of_node = dev->of_node;
 	u8 sd_array_size = ARRAY_SIZE(edge_table[0].slowdown);
@@ -512,12 +509,10 @@ static int vsc85xx_edge_rate_magic_get(struct phy_device *phydev)
 	if (!of_node)
 		return -ENODEV;
 
-	rc = of_property_read_u32(of_node, "vsc8531,vddmac", &vdd);
-	if (rc != 0)
+	if (of_property_read_u32(of_node, "vsc8531,vddmac", &vdd))
 		vdd = MSCC_VDDMAC_3300;
 
-	rc = of_property_read_u32(of_node, "vsc8531,edge-slowdown", &sd);
-	if (rc != 0)
+	if (of_property_read_u32(of_node, "vsc8531,edge-slowdown", &sd))
 		sd = 0;
 
 	for (i = 0; i < ARRAY_SIZE(edge_table); i++)
@@ -762,9 +757,7 @@ static int vsc85xx_config_init(struct phy_device *phydev)
 			return rc;
 	}
 
-	rc = genphy_config_init(phydev);
-
-	return rc;
+	return genphy_config_init(phydev);
 }
 
 static int vsc85xx_ack_interrupt(struct phy_device *phydev)
-- 
git-series 0.9.1

^ permalink raw reply related

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-14  3:27 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: f.fainelli, Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-L1h=EWF8DgAZsUCbb58tZhh5vDkPTRdwaZ895==OxTxA@mail.gmail.com>



On 2018年09月13日 22:58, Willem de Bruijn wrote:
> On Thu, Sep 13, 2018 at 5:02 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> On 2018年09月13日 07:27, Willem de Bruijn wrote:
>>> On Wed, Sep 12, 2018 at 3:11 PM Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Wed, Sep 12, 2018 at 2:16 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>>
>>>>> On 9/12/2018 11:07 AM, Willem de Bruijn wrote:
>>>>>> On Wed, Sep 12, 2018 at 1:42 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>>>>
>>>>>>> On 9/9/2018 3:44 PM, Willem de Bruijn wrote:
>>>>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>>>>>
>>>>>>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>>>>>>>> Interrupt moderation is currently not supported, so these accept and
>>>>>>>> display the default settings of 0 usec and 1 frame.
>>>>>>>>
>>>>>>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>>>>>>>> with possible future interrupt moderation, use bit 10, well outside
>>>>>>>> the reasonable range of real interrupt moderation values.
>>>>>>>>
>>>>>>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
>>>>>>>> be quiesced when switching modes. Only allow changing this setting
>>>>>>>> when the device is down.
>>>>>>> Humm, would not a private ethtool flag to switch TX NAPI on/off be more
>>>>>>> appropriate rather than use the coalescing configuration API here?
>>>>>> What do you mean by private ethtool flag? A new field in ethtool
>>>>>> --features (-k)?
>>>>> I meant using ethtool_drvinfo::n_priv_flags, ETH_SS_PRIV_FLAGS and then
>>>>> ETHTOOL_GFPFLAGS and ETHTOOL_SPFLAGS to control the toggling of that
>>>>> private flag. mlx5 has a number of privates flags for instance.
>>>> Interesting, thanks! I was not at all aware of those ethtool flags.
>>>> Am having a look. It definitely looks promising.
>>> Okay, I made that change. That is indeed much cleaner, thanks.
>>> Let me send the patch, initially as RFC.
>>>
>>> I've observed one issue where if we toggle the flag before bringing
>>> up the device, it hits a kernel BUG at include/linux/netdevice.h:515
>>>
>>>           BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
>> This reminds me that we need to check netif_running() before trying to
>> enable and disable tx napi in ethtool_set_coalesce().
> The first iteration of my patch checked IFF_UP and effectively
> only allowed the change when not running. What do you mean
> by need to check?

I mean that if the device is not up, there's no need to toggle the napi
state and tx lock.

>
> And to respond to the other follow-up notes at once:
>
>> Consider we may have interrupt moderation in the future, I tend to use
>> set_coalesce. Otherwise we may need two steps to enable moderation:
>>
>> - tx-napi on
>> - set_coalesce
> FWIW, I don't care strongly whether we do this through coalesce or priv_flags.

Ok.

>>> +                     if (!napi_weight)
>>> +                             virtqueue_enable_cb(vi->sq[i].vq);
>> I don't get why we need to disable enable cb here.
> To avoid entering no-napi mode with too few descriptors to
> make progress and no way to get out of that state. This is a
> pretty crude attempt at handling that, admittedly.

But in this case, we will call enable_cb_delayed() and we will finally
get an interrupt?

Thanks

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Willem de Bruijn @ 2018-09-14  3:40 UTC (permalink / raw)
  To: Jason Wang
  Cc: Florian Fainelli, Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <6db3c755-a1dc-dcc9-e110-bfc38143e83d@redhat.com>

On Thu, Sep 13, 2018 at 11:27 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年09月13日 22:58, Willem de Bruijn wrote:
> > On Thu, Sep 13, 2018 at 5:02 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >>
> >> On 2018年09月13日 07:27, Willem de Bruijn wrote:
> >>> On Wed, Sep 12, 2018 at 3:11 PM Willem de Bruijn
> >>> <willemdebruijn.kernel@gmail.com> wrote:
> >>>> On Wed, Sep 12, 2018 at 2:16 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
> >>>>>
> >>>>> On 9/12/2018 11:07 AM, Willem de Bruijn wrote:
> >>>>>> On Wed, Sep 12, 2018 at 1:42 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
> >>>>>>>
> >>>>>>> On 9/9/2018 3:44 PM, Willem de Bruijn wrote:
> >>>>>>>> From: Willem de Bruijn <willemb@google.com>
> >>>>>>>>
> >>>>>>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> >>>>>>>> Interrupt moderation is currently not supported, so these accept and
> >>>>>>>> display the default settings of 0 usec and 1 frame.
> >>>>>>>>
> >>>>>>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
> >>>>>>>> with possible future interrupt moderation, use bit 10, well outside
> >>>>>>>> the reasonable range of real interrupt moderation values.
> >>>>>>>>
> >>>>>>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
> >>>>>>>> be quiesced when switching modes. Only allow changing this setting
> >>>>>>>> when the device is down.
> >>>>>>> Humm, would not a private ethtool flag to switch TX NAPI on/off be more
> >>>>>>> appropriate rather than use the coalescing configuration API here?
> >>>>>> What do you mean by private ethtool flag? A new field in ethtool
> >>>>>> --features (-k)?
> >>>>> I meant using ethtool_drvinfo::n_priv_flags, ETH_SS_PRIV_FLAGS and then
> >>>>> ETHTOOL_GFPFLAGS and ETHTOOL_SPFLAGS to control the toggling of that
> >>>>> private flag. mlx5 has a number of privates flags for instance.
> >>>> Interesting, thanks! I was not at all aware of those ethtool flags.
> >>>> Am having a look. It definitely looks promising.
> >>> Okay, I made that change. That is indeed much cleaner, thanks.
> >>> Let me send the patch, initially as RFC.
> >>>
> >>> I've observed one issue where if we toggle the flag before bringing
> >>> up the device, it hits a kernel BUG at include/linux/netdevice.h:515
> >>>
> >>>           BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
> >> This reminds me that we need to check netif_running() before trying to
> >> enable and disable tx napi in ethtool_set_coalesce().
> > The first iteration of my patch checked IFF_UP and effectively
> > only allowed the change when not running. What do you mean
> > by need to check?
>
> I mean if device is not up, there's no need to toggle napi state and tx
> lock.
>
> >
> > And to respond to the other follow-up notes at once:
> >
> >> Consider we may have interrupt moderation in the future, I tend to use
> >> set_coalesce. Otherwise we may need two steps to enable moderation:
> >>
> >> - tx-napi on
> >> - set_coalesce
> > FWIW, I don't care strongly whether we do this through coalesce or priv_flags.
>
> Ok.

Since you prefer coalesce, let's go with that (and a revision of your
latest patch).

>
> >>> +                     if (!napi_weight)
> >>> +                             virtqueue_enable_cb(vi->sq[i].vq);
> >> I don't get why we need to disable enable cb here.
> > To avoid entering no-napi mode with too few descriptors to
> > make progress and no way to get out of that state. This is a
> > pretty crude attempt at handling that, admittedly.
>
> But in this case, we will call enable_cb_delayed() and we will finally
> get a interrupt?

Right. It's a bit of a roundabout way to ensure that
netif_tx_wake_queue and thus eventually free_old_xmit_skbs are called.
It might make more sense to just wake the device without going through
an interrupt.

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-14  3:53 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Florian Fainelli, Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-Lp+MDnxLBFhG3HYONDUA9SRDDeCRUYTgZtbvWJtdO82w@mail.gmail.com>



On 2018年09月14日 11:40, Willem de Bruijn wrote:
> On Thu, Sep 13, 2018 at 11:27 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> On 2018年09月13日 22:58, Willem de Bruijn wrote:
>>> On Thu, Sep 13, 2018 at 5:02 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>
>>>> On 2018年09月13日 07:27, Willem de Bruijn wrote:
>>>>> On Wed, Sep 12, 2018 at 3:11 PM Willem de Bruijn
>>>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>>>> On Wed, Sep 12, 2018 at 2:16 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>>>> On 9/12/2018 11:07 AM, Willem de Bruijn wrote:
>>>>>>>> On Wed, Sep 12, 2018 at 1:42 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>>>>>> On 9/9/2018 3:44 PM, Willem de Bruijn wrote:
>>>>>>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>>>>>>>
>>>>>>>>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>>>>>>>>>> Interrupt moderation is currently not supported, so these accept and
>>>>>>>>>> display the default settings of 0 usec and 1 frame.
>>>>>>>>>>
>>>>>>>>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>>>>>>>>>> with possible future interrupt moderation, use bit 10, well outside
>>>>>>>>>> the reasonable range of real interrupt moderation values.
>>>>>>>>>>
>>>>>>>>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
>>>>>>>>>> be quiesced when switching modes. Only allow changing this setting
>>>>>>>>>> when the device is down.
>>>>>>>>> Humm, would not a private ethtool flag to switch TX NAPI on/off be more
>>>>>>>>> appropriate rather than use the coalescing configuration API here?
>>>>>>>> What do you mean by private ethtool flag? A new field in ethtool
>>>>>>>> --features (-k)?
>>>>>>> I meant using ethtool_drvinfo::n_priv_flags, ETH_SS_PRIV_FLAGS and then
>>>>>>> ETHTOOL_GFPFLAGS and ETHTOOL_SPFLAGS to control the toggling of that
>>>>>>> private flag. mlx5 has a number of privates flags for instance.
>>>>>> Interesting, thanks! I was not at all aware of those ethtool flags.
>>>>>> Am having a look. It definitely looks promising.
>>>>> Okay, I made that change. That is indeed much cleaner, thanks.
>>>>> Let me send the patch, initially as RFC.
>>>>>
>>>>> I've observed one issue where if we toggle the flag before bringing
>>>>> up the device, it hits a kernel BUG at include/linux/netdevice.h:515
>>>>>
>>>>>            BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
>>>> This reminds me that we need to check netif_running() before trying to
>>>> enable and disable tx napi in ethtool_set_coalesce().
>>> The first iteration of my patch checked IFF_UP and effectively
>>> only allowed the change when not running. What do you mean
>>> by need to check?
>> I mean if device is not up, there's no need to toggle napi state and tx
>> lock.
>>
>>> And to respond to the other follow-up notes at once:
>>>
>>>> Consider we may have interrupt moderation in the future, I tend to use
>>>> set_coalesce. Otherwise we may need two steps to enable moderation:
>>>>
>>>> - tx-napi on
>>>> - set_coalesce
>>> FWIW, I don't care strongly whether we do this through coalesce or priv_flags.
>> Ok.
> Since you prefer coalesce, let's go with that (and a revision of your
> latest patch).

Good to know this.

>>>>> +                     if (!napi_weight)
>>>>> +                             virtqueue_enable_cb(vi->sq[i].vq);
>>>> I don't get why we need to disable enable cb here.
>>> To avoid entering no-napi mode with too few descriptors to
>>> make progress and no way to get out of that state. This is a
>>> pretty crude attempt at handling that, admittedly.
>> But in this case, we will call enable_cb_delayed() and we will finally
>> get a interrupt?
> Right. It's a bit of a roundabout way to ensure that
> netif_tx_wake_queue and thus eventually free_old_xmit_skbs are called.
> It might make more sense to just wake the device without going through
> an interrupt.

I'm not sure I get this. If we don't enable tx napi, we tend to delay the TX
interrupt if we find the ring is about to be full, to avoid an interrupt
storm, so we're probably ok in this case.

Thanks

^ permalink raw reply

* Re: [PATCH net] net/ipv6: do not copy DST_NOCOUNT flag on rt init
From: David Ahern @ 2018-09-14  4:11 UTC (permalink / raw)
  To: Peter Oskolkov, David Miller, netdev
In-Reply-To: <20180913203814.189698-1-posk@google.com>

On 9/13/18 1:38 PM, Peter Oskolkov wrote:

> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 3eed045c65a5..a3902f805305 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -946,7 +946,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
>  
>  static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
>  {
> -	rt->dst.flags |= fib6_info_dst_flags(ort);
> +	rt->dst.flags |= fib6_info_dst_flags(ort) & ~DST_NOCOUNT;

I think my mistake is setting dst.flags in ip6_rt_init_dst. The flags
argument is passed to ip6_dst_alloc, which is always invoked before
ip6_rt_copy_init is called, which is the only caller of ip6_rt_init_dst.

>  
>  	if (ort->fib6_flags & RTF_REJECT) {
>  		ip6_rt_init_dst_reject(rt, ort);
> 

^ permalink raw reply

* Re: [PATCH net-next 0/8] bnxt_en: devlink param updates
From: Vasundhara Volam @ 2018-09-14  4:17 UTC (permalink / raw)
  To: jakub.kicinski
  Cc: David Miller, michael.chan@broadcom.com, Netdev, alexander.duyck
In-Reply-To: <20180912115026.6c05ab5e@cakuba>

On Wed, Sep 12, 2018 at 3:20 PM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Wed, 12 Sep 2018 12:09:37 +0530, Vasundhara Volam wrote:
> > On Tue, Sep 11, 2018 at 5:04 PM Jakub Kicinski wrote:
> > > On Tue, 11 Sep 2018 14:14:57 +0530, Vasundhara Volam wrote:
> > > > This patchset adds support for 4 generic and 1 driver-specific devlink
> > > > parameters.
> > > >
> > > > Also, this patchset adds support to return proper error code if
> > > > HWRM_NVM_GET/SET_VARIABLE commands return error code
> > > > HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED.
> > > >
> > > > Vasundhara Volam (8):
> > > >   devlink: Add generic parameter hw_tc_offload
> > >
> > > Much like Jiri, I can't help but wonder why do you need this?
> >
> > There is a request from our customer for a way to toggle tc_offload
> > feature in our adapter.
>
> Vasundhara, again, we don't need to know who asked you to do this, but
> _why_.  What problem are you solving?  What is the customer trying to
> achieve?
For brand-new big features like TC_offload, a few customers are not willing
to enable it by default in the adapter (firmware). It was a subjective decision
to disable TC_offload by default in the adapter.
>
> > > >   devlink: Add generic parameter ignore_ari
> > > >   devlink: Add generic parameter msix_vec_per_pf_max
> > > >   devlink: Add generic parameter msix_vec_per_pf_min
> > >
> > > IMHO more structured API would be preferable if possible.  The string
> > > keys won't scale if you want to set the parameters per PF, and
> > > creating more structured API for PCIe which is a relatively slow
> > > moving HW spec seems tractable.
> >
> > Sorry, could you please suggest an example? We will try to adapt.
>
> My thinking was that the same way devlink device has ports, it should
> have PCIe functions as objects which then have attributes.  Instead of
> making everything a string-identified device attribute.  But I'm not
> dead set on this if others don't think its a good idea.
Actually these parameters are for the port, but the value given to each param
applies to an individual PF. That's the reason I have added the "per_pf" string.
If you think this is not a good idea, I can move these params to driver-specific.

^ permalink raw reply

* [PATCH v3,net-next 2/2] ip6_gre: simplify gre header parsing in ip6gre_err
From: Haishuang Yan @ 2018-09-14  4:26 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov
  Cc: Jiri Benc, netdev, linux-kernel, Haishuang Yan
In-Reply-To: <1536899208-2958-1-git-send-email-yanhaishuang@cmss.chinamobile.com>

Same as ip_gre, use gre_parse_header to parse gre header in gre error
handler code.

Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
 net/ipv6/ip6_gre.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e493b04..515adbd 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		       u8 type, u8 code, int offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
-	const struct gre_base_hdr *greh;
 	const struct ipv6hdr *ipv6h;
-	int grehlen = sizeof(*greh);
+	struct tnl_ptk_info tpi;
 	struct ip6_tnl *t;
-	int key_off = 0;
-	__be16 flags;
-	__be32 key;
 
-	if (!pskb_may_pull(skb, offset + grehlen))
-		return;
-	greh = (const struct gre_base_hdr *)(skb->data + offset);
-	flags = greh->flags;
-	if (flags & (GRE_VERSION | GRE_ROUTING))
+	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
+			     offset) < 0)
 		return;
-	if (flags & GRE_CSUM)
-		grehlen += 4;
-	if (flags & GRE_KEY) {
-		key_off = grehlen + offset;
-		grehlen += 4;
-	}
 
-	if (!pskb_may_pull(skb, offset + grehlen))
-		return;
 	ipv6h = (const struct ipv6hdr *)skb->data;
-	greh = (const struct gre_base_hdr *)(skb->data + offset);
-	key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
-
 	t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
-				 key, greh->protocol);
+				 tpi.key, tpi.proto);
 	if (!t)
 		return;
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 0/7] add support for VSC8584 and VSC8574 Microsemi quad-port PHYs
From: Quentin Schulz @ 2018-09-14  9:44 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, antoine.tenart, Quentin Schulz

Both PHYs are 4-port PHYs that are 10/100/1000BASE-T, 100BASE-FX, 1000BASE-X
and triple-speed copper SFP capable, can communicate with the MAC via
SGMII, QSGMII or 1000BASE-X, support downshifting, can set the blinking
pattern of each of their 4 LEDs, and support SyncE as well as HP Auto-MDIX
detection.

VSC8574 supports WOL and VSC8584 supports hardware offloading of MACsec.

This patch series adds support for 10/100/1000BASE-T, SGMII/QSGMII link with
the MAC, downshifting, HP Auto-MDIX detection and blinking pattern for
their 4 LEDs.

They also have an internal Intel 8051 microcontroller whose firmware needs
to be patched when the PHY is reset. If the 8051's firmware has the
expected CRC, its patching can be skipped. The microcontroller can be
accessed from any port of the PHY, though the CRC function can only be done
through the PHY that is the base PHY of the package (internal address 0)
due to a limitation of the firmware.

The GPIO register bank is a set of registers that are common to all PHYs in
the package. So any modification in any register of this bank affects all
PHYs of the package.

If the PHYs haven't been reset before booting the Linux kernel and were
configured to use interrupts for e.g. link status updates, it is required
to clear the interrupt mask register of all PHYs before being able to use
interrupts with any PHY. The first PHY of the package to be initialized
takes care of clearing the interrupt mask registers of all PHYs. Thus, we
need to keep track of the init sequence in the package: whether it has
already been done or is still to be done.

Most of the init sequence of a PHY of the package is common to all PHYs in
the package, thus we use the SMI broadcast feature which enables us to
propagate a write in one register of one PHY to all PHYs in the package.

We also introduce a new development board called PCB120 which exists in
variants for VSC8584 and VSC8574 (and that's the only difference to the
best of my knowledge).

I suggest patches 1 to 4 go through net tree and patches 5 to 7 go through
MIPS tree. Patches going through net tree and those going through MIPS tree
do not depend on one another.

This patch series depends on two patch series though:
"mscc: ocelot: add support for SerDes muxing configuration"
(https://lore.kernel.org/lkml/cover.ff40d591b548a6da31716e6e600f11a303e0e643.1536912834.git-series.quentin.schulz@bootlin.com/)
"Various improvements to Microsemi PHY driver"
(https://lore.kernel.org/lkml/cover.616d15610d44a0e3d463acd8119859f243163ad2.1536913944.git-series.quentin.schulz@bootlin.com/)
specifically patch 2/5 which defines constants that are used in this patch
series.

Thanks,
Quentin

Quentin Schulz (7):
  dt-bindings: net: vsc8531: add two additional LED modes for VSC8584
  net: phy: mscc: add support for VSC8584 PHY
  net: phy: mscc: split config_init in two functions for VSC8584
  net: phy: mscc: add support for VSC8574 PHY
  MIPS: mscc: ocelot: add GPIO4 pinmuxing DT node
  MIPS: mscc: add DT for Ocelot PCB120
  MIPS: mscc: add PCB120 to the ocelot fitImage

 arch/mips/boot/dts/mscc/Makefile            |    2 +-
 arch/mips/boot/dts/mscc/ocelot.dtsi         |    5 +-
 arch/mips/boot/dts/mscc/ocelot_pcb120.dts   |  100 ++-
 arch/mips/generic/Kconfig                   |    6 +-
 arch/mips/generic/Platform                  |    2 +-
 arch/mips/generic/board-ocelot.its.S        |   40 +-
 arch/mips/generic/board-ocelot_pcb123.its.S |   23 +-
 drivers/net/phy/mscc.c                      | 1019 ++++++++++++++++++++-
 include/dt-bindings/net/mscc-phy-vsc8531.h  |    2 +-
 9 files changed, 1171 insertions(+), 28 deletions(-)
 create mode 100644 arch/mips/boot/dts/mscc/ocelot_pcb120.dts
 create mode 100644 arch/mips/generic/board-ocelot.its.S
 delete mode 100644 arch/mips/generic/board-ocelot_pcb123.its.S

base-commit: d9cca8eef36bb8918c9ed28574b79b7674fd36f6
-- 
git-series 0.9.1

^ permalink raw reply

* [PATCH net-next 1/7] dt-bindings: net: vsc8531: add two additional LED modes for VSC8584
From: Quentin Schulz @ 2018-09-14  9:44 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, antoine.tenart, Quentin Schulz
In-Reply-To: <cover.b921b010b6d6bde1c11e69551ae38f3b2818645b.1536916714.git-series.quentin.schulz@bootlin.com>

The VSC8584 (and most likely other PHYs in the same generation) has two
additional LED modes that can be picked, so let's add them.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 include/dt-bindings/net/mscc-phy-vsc8531.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/dt-bindings/net/mscc-phy-vsc8531.h b/include/dt-bindings/net/mscc-phy-vsc8531.h
index 697161f..9eb2ec2 100644
--- a/include/dt-bindings/net/mscc-phy-vsc8531.h
+++ b/include/dt-bindings/net/mscc-phy-vsc8531.h
@@ -18,9 +18,11 @@
 #define VSC8531_LINK_100_1000_ACTIVITY  4
 #define VSC8531_LINK_10_1000_ACTIVITY   5
 #define VSC8531_LINK_10_100_ACTIVITY    6
+#define VSC8584_LINK_100FX_1000X_ACTIVITY	7
 #define VSC8531_DUPLEX_COLLISION        8
 #define VSC8531_COLLISION               9
 #define VSC8531_ACTIVITY                10
+#define VSC8584_100FX_1000X_ACTIVITY	11
 #define VSC8531_AUTONEG_FAULT           12
 #define VSC8531_SERIAL_MODE             13
 #define VSC8531_FORCE_LED_OFF           14
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next 3/7] net: phy: mscc: split config_init in two functions for VSC8584
From: Quentin Schulz @ 2018-09-14  9:44 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, antoine.tenart, Quentin Schulz
In-Reply-To: <cover.b921b010b6d6bde1c11e69551ae38f3b2818645b.1536916714.git-series.quentin.schulz@bootlin.com>

Part of the config init is common between the VSC8584 and the VSC8574,
so to prepare the upcoming support for VSC8574, move the PHY-specific
part of config_init behind a config_pre_init function pointer, which is
set in the probe function of the PHY and called from config_init.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index b450489..69cc3cf 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -355,6 +355,7 @@ struct vsc8531_private {
 	u64 *stats;
 	int nstats;
 	bool pkg_init;
+	int (*config_pre_init)(struct mii_bus *bus, int phy);
 };
 
 #ifdef CONFIG_OF_MDIO
@@ -1298,7 +1299,7 @@ static int vsc8584_config_init(struct phy_device *phydev)
 	 */
 	if (!vsc8584_is_pkg_init(phydev, base_addr,
 				 val & PHY_ADDR_REVERSED ? 1 : 0)) {
-		ret = vsc8584_config_pre_init(phydev->mdio.bus, base_addr);
+		ret = vsc8531->config_pre_init(phydev->mdio.bus, base_addr);
 		if (ret)
 			goto err;
 	}
@@ -1486,6 +1487,7 @@ static int vsc8584_probe(struct phy_device *phydev)
 
 	phydev->priv = vsc8531;
 
+	vsc8531->config_pre_init = vsc8584_config_pre_init;
 	vsc8531->nleds = 4;
 	vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES;
 	vsc8531->hw_stats = vsc8584_hw_stats;
-- 
git-series 0.9.1

^ permalink raw reply related

* [PATCH net-next 4/7] net: phy: mscc: add support for VSC8574 PHY
From: Quentin Schulz @ 2018-09-14  9:44 UTC (permalink / raw)
  To: alexandre.belloni, ralf, paul.burton, jhogan, robh+dt,
	mark.rutland, davem, andrew, f.fainelli
  Cc: allan.nielsen, linux-mips, devicetree, linux-kernel, netdev,
	thomas.petazzoni, antoine.tenart, Quentin Schulz
In-Reply-To: <cover.b921b010b6d6bde1c11e69551ae38f3b2818645b.1536916714.git-series.quentin.schulz@bootlin.com>

The VSC8574 PHY is a 4-port PHY that is 10/100/1000BASE-T, 100BASE-FX,
1000BASE-X and triple-speed copper SFP capable, can communicate with
the MAC via SGMII, QSGMII or 1000BASE-X, supports WOL and downshifting,
can set the blinking pattern of each of its 4 LEDs, and supports SyncE as
well as HP Auto-MDIX detection.

This adds support for 10/100/1000BASE-T, SGMII/QSGMII link with the MAC,
WOL, downshifting, HP Auto-MDIX detection and blinking pattern for its 4
LEDs.

The VSC8574 also has an internal Intel 8051 microcontroller whose
firmware needs to be patched when the PHY is reset. If the 8051's
firmware has the expected CRC, its patching can be skipped. The
microcontroller can be accessed from any port of the PHY, though the CRC
function can only be done through the PHY that is the base PHY of the
package (internal address 0) due to a limitation of the firmware.

The GPIO register bank is a set of registers that are common to all PHYs
in the package. So any modification in any register of this bank affects
all PHYs of the package.

If the PHYs haven't been reset before booting the Linux kernel and were
configured to use interrupts for e.g. link status updates, it is
required to clear the interrupt mask register of all PHYs before being
able to use interrupts with any PHY. The first PHY of the package to be
initialized takes care of clearing the interrupt mask registers of all
PHYs. Thus, we need to keep track of the init sequence in the package:
whether it has already been done or is still to be done.

Most of the init sequence of a PHY of the package is common to all PHYs
in the package, thus we use the SMI broadcast feature which enables us
to propagate a write in one register of one PHY to all PHYs in the
package.

Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
---
 drivers/net/phy/mscc.c | 303 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 303 insertions(+)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 69cc3cf..2289d0a 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -65,6 +65,8 @@ enum rgmii_rx_clock_delay {
 #define MEDIA_OP_MODE_AMS_COPPER_100BASEFX	7
 #define MEDIA_OP_MODE_POS		  8
 
+#define MSCC_PHY_EXT_PHY_CNTL_2		  24
+
 #define MII_VSC85XX_INT_MASK		  25
 #define MII_VSC85XX_INT_MASK_MASK	  0xa000
 #define MII_VSC85XX_INT_MASK_WOL	  0x0040
@@ -151,6 +153,7 @@ enum rgmii_rx_clock_delay {
 #define DW8051_CLK_EN			  0x0010
 #define MICRO_CLK_EN			  0x0008
 #define MICRO_CLK_DIVIDE(x)		  ((x) >> 1)
+#define MSCC_DW8051_VLD_MASK		  0xf1ff
 
 /* x Address in range 1-4 */
 #define MSCC_TRAP_ROM_ADDR(x)		  ((x) * 2 + 1)
@@ -184,7 +187,9 @@ enum rgmii_rx_clock_delay {
 #define PROC_CMD_SGMII_MAC		  0x0030
 #define PROC_CMD_QSGMII_MAC		  0x0020
 #define PROC_CMD_NO_MAC_CONF		  0x0000
+#define PROC_CMD_1588_DEFAULT_INIT	  0x0010
 #define PROC_CMD_NOP			  0x000f
+#define PROC_CMD_PHY_INIT		  0x000a
 #define PROC_CMD_CRC16			  0x0008
 #define PROC_CMD_FIBER_MEDIA_CONF	  0x0001
 #define PROC_CMD_MCB_ACCESS_MAC_CONF	  0x0000
@@ -198,6 +203,9 @@ enum rgmii_rx_clock_delay {
 /* Test page Registers */
 #define MSCC_PHY_TEST_PAGE_5		  5
 #define MSCC_PHY_TEST_PAGE_8		  8
+#define MSCC_PHY_TEST_PAGE_9		  9
+#define MSCC_PHY_TEST_PAGE_20		  20
+#define MSCC_PHY_TEST_PAGE_24		  24
 
 /* Token ring page Registers */
 #define MSCC_PHY_TR_CNTL		  16
@@ -211,6 +219,7 @@ enum rgmii_rx_clock_delay {
 #define PHY_ID_VSC8531			  0x00070570
 #define PHY_ID_VSC8540			  0x00070760
 #define PHY_ID_VSC8541			  0x00070770
+#define PHY_ID_VSC8574			  0x000704a0
 #define PHY_ID_VSC8584			  0x000707c0
 
 #define MSCC_VDDMAC_1500		  1500
@@ -258,6 +267,10 @@ enum rgmii_rx_clock_delay {
 #define MSCC_VSC8584_REVB_INT8051_FW_START_ADDR	0xe800
 #define MSCC_VSC8584_REVB_INT8051_FW_CRC	0xfb48
 
+#define MSCC_VSC8574_REVB_INT8051_FW		"mscc_vsc8574_revb_int8051_29e8.bin"
+#define MSCC_VSC8574_REVB_INT8051_FW_START_ADDR	0x4000
+#define MSCC_VSC8574_REVB_INT8051_FW_CRC	0x29e8
+
 #define VSC8584_REVB				0x0001
 #define MSCC_DEV_REV_MASK			GENMASK(3, 0)
 
@@ -1084,6 +1097,243 @@ static int vsc8584_patch_fw(struct mii_bus *bus, int phy,
 }
 
 /* bus->mdio_lock should be locked when using this function */
+static bool vsc8574_is_serdes_init(struct mii_bus *bus, int phy)
+{
+	u16 reg;
+	bool ret;
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS,
+			MSCC_PHY_PAGE_EXTENDED_GPIO);
+
+	reg = __mdiobus_read(bus, phy, MSCC_TRAP_ROM_ADDR(1));
+	if (reg != 0x3eb7) {
+		ret = false;
+		goto out;
+	}
+
+	reg = __mdiobus_read(bus, phy, MSCC_PATCH_RAM_ADDR(1));
+	if (reg != 0x4012) {
+		ret = false;
+		goto out;
+	}
+
+	reg = __mdiobus_read(bus, phy, MSCC_INT_MEM_CNTL);
+	if (reg != EN_PATCH_RAM_TRAP_ADDR(1)) {
+		ret = false;
+		goto out;
+	}
+
+	reg = __mdiobus_read(bus, phy, MSCC_DW8051_CNTL_STATUS);
+	if ((MICRO_NSOFT_RESET | RUN_FROM_INT_ROM |  DW8051_CLK_EN |
+	     MICRO_CLK_EN) != (reg & MSCC_DW8051_VLD_MASK)) {
+		ret = false;
+		goto out;
+	}
+
+	ret = true;
+out:
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD);
+
+	return ret;
+}
+
+/* bus->mdio_lock should be locked when using this function */
+static int vsc8574_config_pre_init(struct mii_bus *bus, int phy)
+{
+	struct device *dev = &bus->mdio_map[phy]->dev;
+	const struct firmware *fw;
+	u16 crc, reg;
+	bool serdes_init;
+	int ret;
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD);
+
+	/* all writes below this line are broadcasted to all PHYs */
+	reg = __mdiobus_read(bus, phy, MSCC_PHY_EXT_CNTL_STATUS);
+	reg |= SMI_BROADCAST_WR_EN;
+	__mdiobus_write(bus, phy, MSCC_PHY_EXT_CNTL_STATUS, reg);
+
+	__mdiobus_write(bus, phy, MII_VSC85XX_INT_MASK, 0);
+
+	/* The below register writes are tweaking analog and electrical
+	 * configuration that were determined through characterization by PHY
+	 * engineers. These don't mean anything more than "these are the best
+	 * values".
+	 */
+	__mdiobus_write(bus, phy, MSCC_PHY_EXT_PHY_CNTL_2, 0x0040);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_TEST);
+
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_20, 0x4320);
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_24, 0x0c00);
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_9, 0x18ca);
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_5, 0x1b20);
+
+	reg = __mdiobus_read(bus, phy, MSCC_PHY_TEST_PAGE_8);
+	reg |= 0x8000;
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_8, reg);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_TR);
+
+	vsc8584_csr_write(bus, phy, 0x8fae, 0x000401bd);
+	vsc8584_csr_write(bus, phy, 0x8fac, 0x000f000f);
+	vsc8584_csr_write(bus, phy, 0x97a0, 0x00a0f147);
+	vsc8584_csr_write(bus, phy, 0x8fe4, 0x00052f54);
+	vsc8584_csr_write(bus, phy, 0x9792, 0x0027303d);
+	vsc8584_csr_write(bus, phy, 0x87fe, 0x00000704);
+	vsc8584_csr_write(bus, phy, 0x8fe0, 0x00060150);
+	vsc8584_csr_write(bus, phy, 0x8f82, 0x0012b00a);
+	vsc8584_csr_write(bus, phy, 0x8f80, 0x00000d74);
+	vsc8584_csr_write(bus, phy, 0x82e0, 0x00000012);
+	vsc8584_csr_write(bus, phy, 0x83a2, 0x00050208);
+	vsc8584_csr_write(bus, phy, 0x83b2, 0x00009186);
+	vsc8584_csr_write(bus, phy, 0x8fb0, 0x000e3700);
+	vsc8584_csr_write(bus, phy, 0x9688, 0x00049f81);
+	vsc8584_csr_write(bus, phy, 0x8fd2, 0x0000ffff);
+	vsc8584_csr_write(bus, phy, 0x968a, 0x00039fa2);
+	vsc8584_csr_write(bus, phy, 0x9690, 0x0020640b);
+	vsc8584_csr_write(bus, phy, 0x8258, 0x00002220);
+	vsc8584_csr_write(bus, phy, 0x825a, 0x00002a20);
+	vsc8584_csr_write(bus, phy, 0x825c, 0x00003060);
+	vsc8584_csr_write(bus, phy, 0x825e, 0x00003fa0);
+	vsc8584_csr_write(bus, phy, 0x83a6, 0x0000e0f0);
+	vsc8584_csr_write(bus, phy, 0x8f92, 0x00001489);
+	vsc8584_csr_write(bus, phy, 0x96a2, 0x00007000);
+	vsc8584_csr_write(bus, phy, 0x96a6, 0x00071448);
+	vsc8584_csr_write(bus, phy, 0x96a0, 0x00eeffdd);
+	vsc8584_csr_write(bus, phy, 0x8fe8, 0x0091b06c);
+	vsc8584_csr_write(bus, phy, 0x8fea, 0x00041600);
+	vsc8584_csr_write(bus, phy, 0x96b0, 0x00eeff00);
+	vsc8584_csr_write(bus, phy, 0x96b2, 0x00007000);
+	vsc8584_csr_write(bus, phy, 0x96b4, 0x00000814);
+	vsc8584_csr_write(bus, phy, 0x8f90, 0x00688980);
+	vsc8584_csr_write(bus, phy, 0x83a4, 0x0000d8f0);
+	vsc8584_csr_write(bus, phy, 0x8fc0, 0x00000400);
+	vsc8584_csr_write(bus, phy, 0x87fa, 0x0050100f);
+	vsc8584_csr_write(bus, phy, 0x8796, 0x00000003);
+	vsc8584_csr_write(bus, phy, 0x87f8, 0x00c3ff98);
+	vsc8584_csr_write(bus, phy, 0x8fa4, 0x0018292a);
+	vsc8584_csr_write(bus, phy, 0x968c, 0x00d2c46f);
+	vsc8584_csr_write(bus, phy, 0x97a2, 0x00000620);
+	vsc8584_csr_write(bus, phy, 0x96a4, 0x0013132f);
+	vsc8584_csr_write(bus, phy, 0x96a8, 0x00000000);
+	vsc8584_csr_write(bus, phy, 0x8ffc, 0x00c0a028);
+	vsc8584_csr_write(bus, phy, 0x8fec, 0x00901c09);
+	vsc8584_csr_write(bus, phy, 0x8fee, 0x0004a6a1);
+	vsc8584_csr_write(bus, phy, 0x8ffe, 0x00b01807);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS,
+			MSCC_PHY_PAGE_EXTENDED_2);
+
+	__mdiobus_write(bus, phy, MSCC_PHY_CU_PMD_TX_CNTL, 0x028e);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_TR);
+
+	vsc8584_csr_write(bus, phy, 0x8486, 0x0008a518);
+	vsc8584_csr_write(bus, phy, 0x8488, 0x006dc696);
+	vsc8584_csr_write(bus, phy, 0x848a, 0x00000912);
+	vsc8584_csr_write(bus, phy, 0x848e, 0x00000db6);
+	vsc8584_csr_write(bus, phy, 0x849c, 0x00596596);
+	vsc8584_csr_write(bus, phy, 0x849e, 0x00000514);
+	vsc8584_csr_write(bus, phy, 0x84a2, 0x00410280);
+	vsc8584_csr_write(bus, phy, 0x84a4, 0x00000000);
+	vsc8584_csr_write(bus, phy, 0x84a6, 0x00000000);
+	vsc8584_csr_write(bus, phy, 0x84a8, 0x00000000);
+	vsc8584_csr_write(bus, phy, 0x84aa, 0x00000000);
+	vsc8584_csr_write(bus, phy, 0x84ae, 0x007df7dd);
+	vsc8584_csr_write(bus, phy, 0x84b0, 0x006d95d4);
+	vsc8584_csr_write(bus, phy, 0x84b2, 0x00492410);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_TEST);
+
+	reg = __mdiobus_read(bus, phy, MSCC_PHY_TEST_PAGE_8);
+	reg &= ~0x8000;
+	__mdiobus_write(bus, phy, MSCC_PHY_TEST_PAGE_8, reg);
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS,
+			MSCC_PHY_PAGE_STANDARD);
+
+	/* end of write broadcasting */
+	reg = __mdiobus_read(bus, phy, MSCC_PHY_EXT_CNTL_STATUS);
+	reg &= ~SMI_BROADCAST_WR_EN;
+	__mdiobus_write(bus, phy, MSCC_PHY_EXT_CNTL_STATUS, reg);
+
+	ret = request_firmware(&fw, MSCC_VSC8574_REVB_INT8051_FW, dev);
+	if (ret) {
+		dev_err(dev, "failed to load firmware %s, ret: %d\n",
+			MSCC_VSC8574_REVB_INT8051_FW, ret);
+		return ret;
+	}
+
+	/* Add one byte to size for the one added by the patch_fw function */
+	ret = vsc8584_get_fw_crc(bus, phy,
+				 MSCC_VSC8574_REVB_INT8051_FW_START_ADDR,
+				 fw->size + 1, &crc);
+	if (ret)
+		goto out;
+
+	if (crc == MSCC_VSC8574_REVB_INT8051_FW_CRC) {
+		serdes_init = vsc8574_is_serdes_init(bus, phy);
+
+		if (!serdes_init) {
+			ret = vsc8584_micro_assert_reset(bus, phy);
+			if (ret) {
+				dev_err(dev,
+					"%s: failed to assert reset of micro\n",
+					__func__);
+				return ret;
+			}
+		}
+	} else {
+		dev_dbg(dev, "FW CRC is not the expected one, patching FW\n");
+
+		serdes_init = false;
+
+		if (vsc8584_patch_fw(bus, phy, fw))
+			dev_warn(dev,
+				 "failed to patch FW, expect non-optimal device\n");
+	}
+
+	if (!serdes_init) {
+		__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS,
+				MSCC_PHY_PAGE_EXTENDED_GPIO);
+
+		__mdiobus_write(bus, phy, MSCC_TRAP_ROM_ADDR(1), 0x3eb7);
+		__mdiobus_write(bus, phy, MSCC_PATCH_RAM_ADDR(1), 0x4012);
+		__mdiobus_write(bus, phy, MSCC_INT_MEM_CNTL,
+				EN_PATCH_RAM_TRAP_ADDR(1));
+
+		vsc8584_micro_deassert_reset(bus, phy, false);
+
+		/* Add one byte to size for the one added by the patch_fw
+		 * function
+		 */
+		ret = vsc8584_get_fw_crc(bus, phy,
+					 MSCC_VSC8574_REVB_INT8051_FW_START_ADDR,
+					 fw->size + 1, &crc);
+		if (ret)
+			goto out;
+
+		if (crc != MSCC_VSC8574_REVB_INT8051_FW_CRC)
+			dev_warn(dev,
+				 "FW CRC after patching is not the expected one, expect non-optimal device\n");
+	}
+
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS,
+			MSCC_PHY_PAGE_EXTENDED_GPIO);
+
+	ret = vsc8584_cmd(bus, phy, PROC_CMD_1588_DEFAULT_INIT |
+			  PROC_CMD_PHY_INIT);
+
+out:
+	__mdiobus_write(bus, phy, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD);
+
+	release_firmware(fw);
+
+	return ret;
+}
+
+/* bus->mdio_lock should be locked when using this function */
 static int vsc8584_config_pre_init(struct mii_bus *bus, int phy)
 {
 	struct device *dev = &bus->mdio_map[phy]->dev;
@@ -1469,6 +1719,33 @@ static int vsc85xx_read_status(struct phy_device *phydev)
 	return genphy_read_status(phydev);
 }
 
+static int vsc8574_probe(struct phy_device *phydev)
+{
+	struct vsc8531_private *vsc8531;
+	u32 default_mode[4] = {VSC8531_LINK_1000_ACTIVITY,
+	   VSC8531_LINK_100_ACTIVITY, VSC8531_LINK_ACTIVITY,
+	   VSC8531_DUPLEX_COLLISION};
+
+	vsc8531 = devm_kzalloc(&phydev->mdio.dev, sizeof(*vsc8531), GFP_KERNEL);
+	if (!vsc8531)
+		return -ENOMEM;
+
+	phydev->priv = vsc8531;
+
+	vsc8531->config_pre_init = vsc8574_config_pre_init;
+	vsc8531->nleds = 4;
+	vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES;
+	vsc8531->hw_stats = vsc8584_hw_stats;
+	vsc8531->nstats = ARRAY_SIZE(vsc8584_hw_stats);
+	vsc8531->stats = devm_kzalloc(&phydev->mdio.dev,
+				      sizeof(u64) * vsc8531->nstats,
+				      GFP_KERNEL);
+	if (!vsc8531->stats)
+		return -ENOMEM;
+
+	return vsc85xx_dt_led_modes_get(phydev, default_mode);
+}
+
 static int vsc8584_probe(struct phy_device *phydev)
 {
 	struct vsc8531_private *vsc8531;
@@ -1631,6 +1908,31 @@ static struct phy_driver vsc85xx_driver[] = {
 	.get_stats      = &vsc85xx_get_stats,
 },
 {
+	.phy_id		= PHY_ID_VSC8574,
+	.name		= "Microsemi GE VSC8574 SyncE",
+	.phy_id_mask	= 0xfffffff0,
+	.features	= PHY_GBIT_FEATURES,
+	.flags		= PHY_HAS_INTERRUPT,
+	.soft_reset	= &genphy_soft_reset,
+	.config_init    = &vsc8584_config_init,
+	.config_aneg    = &vsc85xx_config_aneg,
+	.aneg_done	= &genphy_aneg_done,
+	.read_status	= &vsc85xx_read_status,
+	.ack_interrupt  = &vsc85xx_ack_interrupt,
+	.config_intr    = &vsc85xx_config_intr,
+	.did_interrupt  = &vsc8584_did_interrupt,
+	.suspend	= &genphy_suspend,
+	.resume		= &genphy_resume,
+	.probe		= &vsc8574_probe,
+	.set_wol	= &vsc85xx_wol_set,
+	.get_wol	= &vsc85xx_wol_get,
+	.get_tunable	= &vsc85xx_get_tunable,
+	.set_tunable	= &vsc85xx_set_tunable,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
+},
+{
 	.phy_id		= PHY_ID_VSC8584,
 	.name		= "Microsemi GE VSC8584 SyncE",
 	.phy_id_mask	= 0xfffffff0,
@@ -1663,6 +1965,7 @@ static struct mdio_device_id __maybe_unused vsc85xx_tbl[] = {
 	{ PHY_ID_VSC8531, 0xfffffff0, },
 	{ PHY_ID_VSC8540, 0xfffffff0, },
 	{ PHY_ID_VSC8541, 0xfffffff0, },
+	{ PHY_ID_VSC8574, 0xfffffff0, },
 	{ PHY_ID_VSC8584, 0xfffffff0, },
 	{ }
 };
-- 
git-series 0.9.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox