Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 4/5] net: dsa: b53: Add PHYLINK support
From: Florian Fainelli @ 2018-09-05 19:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180905192327.6469-1-f.fainelli@gmail.com>

Add support for PHYLINK, things are reasonably straight forward since we
do not yet support SerDes interfaces, that leaves us with just
MLO_AN_PHY and MLO_AN_FIXED to deal with.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c | 122 +++++++++++++++++++++++++++++++
 drivers/net/dsa/b53/b53_priv.h   |  17 +++++
 2 files changed, 139 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 78aeaccf19a1..3d5e822bb17c 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1109,6 +1109,122 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 	p->eee_enabled = b53_eee_init(ds, port, phydev);
 }
 
+void b53_port_event(struct dsa_switch *ds, int port)
+{
+	struct b53_device *dev = ds->priv;
+	bool link;
+	u16 sts;
+
+	b53_read16(dev, B53_STAT_PAGE, B53_LINK_STAT, &sts);
+	link = !!(sts & BIT(port));
+	dsa_port_phylink_mac_change(ds, port, link);
+}
+EXPORT_SYMBOL(b53_port_event);
+
+void b53_phylink_validate(struct dsa_switch *ds, int port,
+			  unsigned long *supported,
+			  struct phylink_link_state *state)
+{
+	struct b53_device *dev = ds->priv;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+	/* Allow all the expected bits */
+	phylink_set(mask, Autoneg);
+	phylink_set_port_modes(mask);
+	phylink_set(mask, Pause);
+	phylink_set(mask, Asym_Pause);
+
+	/* With the exclusion of 5325/5365, MII, Reverse MII and 802.3z, we
+	 * support Gigabit, including Half duplex.
+	 */
+	if (state->interface != PHY_INTERFACE_MODE_MII &&
+	    state->interface != PHY_INTERFACE_MODE_REVMII &&
+	    !phy_interface_mode_is_8023z(state->interface) &&
+	    !(is5325(dev) || is5365(dev))) {
+		phylink_set(mask, 1000baseT_Full);
+		phylink_set(mask, 1000baseT_Half);
+	}
+
+	if (!phy_interface_mode_is_8023z(state->interface)) {
+		phylink_set(mask, 10baseT_Half);
+		phylink_set(mask, 10baseT_Full);
+		phylink_set(mask, 100baseT_Half);
+		phylink_set(mask, 100baseT_Full);
+	}
+
+	bitmap_and(supported, supported, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_and(state->advertising, state->advertising, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	phylink_helper_basex_speed(state);
+}
+EXPORT_SYMBOL(b53_phylink_validate);
+
+int b53_phylink_mac_link_state(struct dsa_switch *ds, int port,
+			       struct phylink_link_state *state)
+{
+	int ret = -EOPNOTSUPP;
+
+	return ret;
+}
+EXPORT_SYMBOL(b53_phylink_mac_link_state);
+
+void b53_phylink_mac_config(struct dsa_switch *ds, int port,
+			    unsigned int mode,
+			    const struct phylink_link_state *state)
+{
+	struct b53_device *dev = ds->priv;
+
+	if (mode == MLO_AN_PHY)
+		return;
+
+	if (mode == MLO_AN_FIXED) {
+		b53_force_port_config(dev, port, state->speed,
+				      state->duplex, state->pause);
+		return;
+	}
+}
+EXPORT_SYMBOL(b53_phylink_mac_config);
+
+void b53_phylink_mac_an_restart(struct dsa_switch *ds, int port)
+{
+}
+EXPORT_SYMBOL(b53_phylink_mac_an_restart);
+
+void b53_phylink_mac_link_down(struct dsa_switch *ds, int port,
+			       unsigned int mode,
+			       phy_interface_t interface)
+{
+	struct b53_device *dev = ds->priv;
+
+	if (mode == MLO_AN_PHY)
+		return;
+
+	if (mode == MLO_AN_FIXED) {
+		b53_force_link(dev, port, false);
+		return;
+	}
+}
+EXPORT_SYMBOL(b53_phylink_mac_link_down);
+
+void b53_phylink_mac_link_up(struct dsa_switch *ds, int port,
+			     unsigned int mode,
+			     phy_interface_t interface,
+			     struct phy_device *phydev)
+{
+	struct b53_device *dev = ds->priv;
+
+	if (mode == MLO_AN_PHY)
+		return;
+
+	if (mode == MLO_AN_FIXED) {
+		b53_force_link(dev, port, true);
+		return;
+	}
+}
+EXPORT_SYMBOL(b53_phylink_mac_link_up);
+
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering)
 {
 	return 0;
@@ -1750,6 +1866,12 @@ static const struct dsa_switch_ops b53_switch_ops = {
 	.phy_read		= b53_phy_read16,
 	.phy_write		= b53_phy_write16,
 	.adjust_link		= b53_adjust_link,
+	.phylink_validate	= b53_phylink_validate,
+	.phylink_mac_link_state	= b53_phylink_mac_link_state,
+	.phylink_mac_config	= b53_phylink_mac_config,
+	.phylink_mac_an_restart	= b53_phylink_mac_an_restart,
+	.phylink_mac_link_down	= b53_phylink_mac_link_down,
+	.phylink_mac_link_up	= b53_phylink_mac_link_up,
 	.port_enable		= b53_enable_port,
 	.port_disable		= b53_disable_port,
 	.get_mac_eee		= b53_get_mac_eee,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 2980a5838f58..3f79dc07c00f 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -300,6 +300,23 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
 void b53_br_fast_age(struct dsa_switch *ds, int port);
+void b53_port_event(struct dsa_switch *ds, int port);
+void b53_phylink_validate(struct dsa_switch *ds, int port,
+			  unsigned long *supported,
+			  struct phylink_link_state *state);
+int b53_phylink_mac_link_state(struct dsa_switch *ds, int port,
+			       struct phylink_link_state *state);
+void b53_phylink_mac_config(struct dsa_switch *ds, int port,
+			    unsigned int mode,
+			    const struct phylink_link_state *state);
+void b53_phylink_mac_an_restart(struct dsa_switch *ds, int port);
+void b53_phylink_mac_link_down(struct dsa_switch *ds, int port,
+			       unsigned int mode,
+			       phy_interface_t interface);
+void b53_phylink_mac_link_up(struct dsa_switch *ds, int port,
+			     unsigned int mode,
+			     phy_interface_t interface,
+			     struct phy_device *phydev);
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering);
 int b53_vlan_prepare(struct dsa_switch *ds, int port,
 		     const struct switchdev_obj_port_vlan *vlan);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next v2 3/5] net: dsa: b53: Add helper to set link parameters
From: Florian Fainelli @ 2018-09-05 19:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180905192327.6469-1-f.fainelli@gmail.com>

Extract the logic from b53_adjust_link() responsible for overriding a
given port's link, speed, duplex and pause settings and make two helper
functions to set the port's configuration and the port's link settings.
We will make use of both, as separate functions while adding PHYLINK
support next.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c | 89 +++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 29 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 85ed264bc163..78aeaccf19a1 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/platform_data/b53.h>
 #include <linux/phy.h>
+#include <linux/phylink.h>
 #include <linux/etherdevice.h>
 #include <linux/if_bridge.h>
 #include <net/dsa.h>
@@ -947,33 +948,50 @@ static int b53_setup(struct dsa_switch *ds)
 	return ret;
 }
 
-static void b53_adjust_link(struct dsa_switch *ds, int port,
-			    struct phy_device *phydev)
+static void b53_force_link(struct b53_device *dev, int port, int link)
 {
-	struct b53_device *dev = ds->priv;
-	struct ethtool_eee *p = &dev->ports[port].eee;
-	u8 rgmii_ctrl = 0, reg = 0, off;
-
-	if (!phy_is_pseudo_fixed_link(phydev))
-		return;
+	u8 reg, val, off;
 
 	/* Override the port settings */
 	if (port == dev->cpu_port) {
 		off = B53_PORT_OVERRIDE_CTRL;
-		reg = PORT_OVERRIDE_EN;
+		val = PORT_OVERRIDE_EN;
 	} else {
 		off = B53_GMII_PORT_OVERRIDE_CTRL(port);
-		reg = GMII_PO_EN;
+		val = GMII_PO_EN;
 	}
 
-	/* Set the link UP */
-	if (phydev->link)
+	b53_read8(dev, B53_CTRL_PAGE, off, &reg);
+	reg |= val;
+	if (link)
 		reg |= PORT_OVERRIDE_LINK;
+	else
+		reg &= ~PORT_OVERRIDE_LINK;
+	b53_write8(dev, B53_CTRL_PAGE, off, reg);
+}
+
+static void b53_force_port_config(struct b53_device *dev, int port,
+				  int speed, int duplex, int pause)
+{
+	u8 reg, val, off;
+
+	/* Override the port settings */
+	if (port == dev->cpu_port) {
+		off = B53_PORT_OVERRIDE_CTRL;
+		val = PORT_OVERRIDE_EN;
+	} else {
+		off = B53_GMII_PORT_OVERRIDE_CTRL(port);
+		val = GMII_PO_EN;
+	}
 
-	if (phydev->duplex == DUPLEX_FULL)
+	b53_read8(dev, B53_CTRL_PAGE, off, &reg);
+	reg |= val;
+	if (duplex == DUPLEX_FULL)
 		reg |= PORT_OVERRIDE_FULL_DUPLEX;
+	else
+		reg &= ~PORT_OVERRIDE_FULL_DUPLEX;
 
-	switch (phydev->speed) {
+	switch (speed) {
 	case 2000:
 		reg |= PORT_OVERRIDE_SPEED_2000M;
 		/* fallthrough */
@@ -987,21 +1005,41 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 		reg |= PORT_OVERRIDE_SPEED_10M;
 		break;
 	default:
-		dev_err(ds->dev, "unknown speed: %d\n", phydev->speed);
+		dev_err(dev->dev, "unknown speed: %d\n", speed);
 		return;
 	}
 
+	if (pause & MLO_PAUSE_RX)
+		reg |= PORT_OVERRIDE_RX_FLOW;
+	if (pause & MLO_PAUSE_TX)
+		reg |= PORT_OVERRIDE_TX_FLOW;
+
+	b53_write8(dev, B53_CTRL_PAGE, off, reg);
+}
+
+static void b53_adjust_link(struct dsa_switch *ds, int port,
+			    struct phy_device *phydev)
+{
+	struct b53_device *dev = ds->priv;
+	struct ethtool_eee *p = &dev->ports[port].eee;
+	u8 rgmii_ctrl = 0, reg = 0, off;
+	int pause;
+
+	if (!phy_is_pseudo_fixed_link(phydev))
+		return;
+
 	/* Enable flow control on BCM5301x's CPU port */
 	if (is5301x(dev) && port == dev->cpu_port)
-		reg |= PORT_OVERRIDE_RX_FLOW | PORT_OVERRIDE_TX_FLOW;
+		pause = MLO_PAUSE_TXRX_MASK;
 
 	if (phydev->pause) {
 		if (phydev->asym_pause)
-			reg |= PORT_OVERRIDE_TX_FLOW;
-		reg |= PORT_OVERRIDE_RX_FLOW;
+			pause |= MLO_PAUSE_TX;
+		pause |= MLO_PAUSE_RX;
 	}
 
-	b53_write8(dev, B53_CTRL_PAGE, off, reg);
+	b53_force_port_config(dev, port, phydev->speed, phydev->duplex, pause);
+	b53_force_link(dev, port, phydev->link);
 
 	if (is531x5(dev) && phy_interface_is_rgmii(phydev)) {
 		if (port == 8)
@@ -1061,16 +1099,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 		}
 	} else if (is5301x(dev)) {
 		if (port != dev->cpu_port) {
-			u8 po_reg = B53_GMII_PORT_OVERRIDE_CTRL(dev->cpu_port);
-			u8 gmii_po;
-
-			b53_read8(dev, B53_CTRL_PAGE, po_reg, &gmii_po);
-			gmii_po |= GMII_PO_LINK |
-				   GMII_PO_RX_FLOW |
-				   GMII_PO_TX_FLOW |
-				   GMII_PO_EN |
-				   GMII_PO_SPEED_2000M;
-			b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po);
+			b53_force_port_config(dev, dev->cpu_port, 2000,
+					      DUPLEX_FULL, MLO_PAUSE_TXRX_MASK);
+			b53_force_link(dev, dev->cpu_port, 1);
 		}
 	}
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next v2 2/5] net: dsa: b53: Make SRAB driver manage port interrupts
From: Florian Fainelli @ 2018-09-05 19:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180905192327.6469-1-f.fainelli@gmail.com>

Update the SRAB driver to manage per-port interrupts. Since we cannot
sleep during b53_io_ops, schedule a workqueue whenever we get a port
specific interrupt. We will later make use of this to call back into
PHYLINK when there is e.g: a link state change.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_srab.c | 108 +++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index 91de2ba99ad1..411b84f61903 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -22,6 +22,7 @@
 #include <linux/platform_device.h>
 #include <linux/platform_data/b53.h>
 #include <linux/of.h>
+#include <linux/workqueue.h>
 
 #include "b53_priv.h"
 
@@ -47,6 +48,7 @@
 
 /* command and status register of the SRAB */
 #define B53_SRAB_CTRLS			0x40
+#define  B53_SRAB_CTRLS_HOST_INTR	BIT(1)
 #define  B53_SRAB_CTRLS_RCAREQ		BIT(3)
 #define  B53_SRAB_CTRLS_RCAGNT		BIT(4)
 #define  B53_SRAB_CTRLS_SW_INIT_DONE	BIT(6)
@@ -60,8 +62,17 @@
 #define  B53_SRAB_P7_SLEEP_TIMER	BIT(11)
 #define  B53_SRAB_IMP0_SLEEP_TIMER	BIT(12)
 
+struct b53_srab_port_priv {
+	struct work_struct irq_work;
+	int irq;
+	bool irq_enabled;
+	struct b53_device *dev;
+	unsigned int num;
+};
+
 struct b53_srab_priv {
 	void __iomem *regs;
+	struct b53_srab_port_priv port_intrs[B53_N_PORTS];
 };
 
 static int b53_srab_request_grant(struct b53_device *dev)
@@ -344,6 +355,50 @@ static int b53_srab_write64(struct b53_device *dev, u8 page, u8 reg,
 	return ret;
 }
 
+static void b53_srab_port_defer(struct work_struct *work)
+{
+}
+
+static irqreturn_t b53_srab_port_isr(int irq, void *dev_id)
+{
+	struct b53_srab_port_priv *port = dev_id;
+	struct b53_device *dev = port->dev;
+	struct b53_srab_priv *priv = dev->priv;
+
+	/* Acknowledge the interrupt */
+	writel(BIT(port->num), priv->regs + B53_SRAB_INTR);
+
+	schedule_work(&port->irq_work);
+
+	return IRQ_HANDLED;
+}
+
+static int b53_srab_irq_enable(struct b53_device *dev, int port)
+{
+	struct b53_srab_priv *priv = dev->priv;
+	struct b53_srab_port_priv *p = &priv->port_intrs[port];
+	int ret;
+
+	ret = request_irq(p->irq, b53_srab_port_isr, 0,
+			  dev_name(dev->dev), p);
+	if (!ret)
+		p->irq_enabled = true;
+
+	return ret;
+}
+
+static void b53_srab_irq_disable(struct b53_device *dev, int port)
+{
+	struct b53_srab_priv *priv = dev->priv;
+	struct b53_srab_port_priv *p = &priv->port_intrs[port];
+
+	if (p->irq_enabled) {
+		free_irq(p->irq, p);
+		cancel_work_sync(&p->irq_work);
+		p->irq_enabled = false;
+	}
+}
+
 static const struct b53_io_ops b53_srab_ops = {
 	.read8 = b53_srab_read8,
 	.read16 = b53_srab_read16,
@@ -355,6 +410,8 @@ static const struct b53_io_ops b53_srab_ops = {
 	.write32 = b53_srab_write32,
 	.write48 = b53_srab_write48,
 	.write64 = b53_srab_write64,
+	.irq_enable = b53_srab_irq_enable,
+	.irq_disable = b53_srab_irq_disable,
 };
 
 static const struct of_device_id b53_srab_of_match[] = {
@@ -379,6 +436,53 @@ static const struct of_device_id b53_srab_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, b53_srab_of_match);
 
+static void b53_srab_intr_set(struct b53_srab_priv *priv, bool set)
+{
+	u32 reg;
+
+	reg = readl(priv->regs + B53_SRAB_CTRLS);
+	if (set)
+		reg |= B53_SRAB_CTRLS_HOST_INTR;
+	else
+		reg &= ~B53_SRAB_CTRLS_HOST_INTR;
+	writel(reg, priv->regs + B53_SRAB_CTRLS);
+}
+
+static void b53_srab_prepare_irq(struct platform_device *pdev)
+{
+	struct b53_device *dev = platform_get_drvdata(pdev);
+	struct b53_srab_priv *priv = dev->priv;
+	struct b53_srab_port_priv *port;
+	unsigned int i;
+	char *name;
+
+	/* Clear all pending interrupts */
+	writel(0xffffffff, priv->regs + B53_SRAB_INTR);
+
+	if (dev->pdata && dev->pdata->chip_id != BCM58XX_DEVICE_ID)
+		return;
+
+	for (i = 0; i < B53_N_PORTS; i++) {
+		port = &priv->port_intrs[i];
+
+		/* There is no port 6 */
+		if (i == 6)
+			continue;
+
+		name = kasprintf(GFP_KERNEL, "link_state_p%d", i);
+		if (!name)
+			return;
+
+		port->num = i;
+		port->dev = dev;
+		INIT_WORK(&port->irq_work, b53_srab_port_defer);
+		port->irq = platform_get_irq_byname(pdev, name);
+		kfree(name);
+	}
+
+	b53_srab_intr_set(priv, true);
+}
+
 static int b53_srab_probe(struct platform_device *pdev)
 {
 	struct b53_platform_data *pdata = pdev->dev.platform_data;
@@ -417,13 +521,17 @@ static int b53_srab_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, dev);
 
+	b53_srab_prepare_irq(pdev);
+
 	return b53_switch_register(dev);
 }
 
 static int b53_srab_remove(struct platform_device *pdev)
 {
 	struct b53_device *dev = platform_get_drvdata(pdev);
+	struct b53_srab_priv *priv = dev->priv;
 
+	b53_srab_intr_set(priv, false);
 	if (dev)
 		b53_switch_remove(dev);
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next v2 1/5] net: dsa: b53: Add ability to enable/disable port interrupts
From: Florian Fainelli @ 2018-09-05 19:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180905192327.6469-1-f.fainelli@gmail.com>

Some switches expose individual interrupt line(s) for port specific
event(s), allow configuring these interrupts at an appropriate time
during port_enable/disable callbacks where all port specific resources
are known to be set-up and ready for use.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c | 9 +++++++++
 drivers/net/dsa/b53/b53_priv.h   | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index d93c790bfbe8..85ed264bc163 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -502,8 +502,14 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
 	struct b53_device *dev = ds->priv;
 	unsigned int cpu_port = ds->ports[port].cpu_dp->index;
+	int ret = 0;
 	u16 pvlan;
 
+	if (dev->ops->irq_enable)
+		ret = dev->ops->irq_enable(dev, port);
+	if (ret)
+		return ret;
+
 	/* Clear the Rx and Tx disable bits and set to no spanning tree */
 	b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), 0);
 
@@ -536,6 +542,9 @@ void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 	b53_read8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), &reg);
 	reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE;
 	b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
+
+	if (dev->ops->irq_disable)
+		dev->ops->irq_disable(dev, port);
 }
 EXPORT_SYMBOL(b53_disable_port);
 
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index df149756c282..2980a5838f58 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -43,6 +43,8 @@ struct b53_io_ops {
 	int (*write64)(struct b53_device *dev, u8 page, u8 reg, u64 value);
 	int (*phy_read16)(struct b53_device *dev, int addr, int reg, u16 *value);
 	int (*phy_write16)(struct b53_device *dev, int addr, int reg, u16 value);
+	int (*irq_enable)(struct b53_device *dev, int port);
+	void (*irq_disable)(struct b53_device *dev, int port);
 };
 
 enum {
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next v2 0/5] net: dsa: b53: SerDes support
From: Florian Fainelli @ 2018-09-05 19:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem

Hi all,

This patch series adds support for the SerDes found on NorthStar Plus
(NSP) which allows us to use the SFP port on the BCM958625HR board (and
other similar designs).

Changes in v2:

- migrate to threaded interrupt (Andrew)
- fixed a case where MLO_AN_FIXED's mac_config would still call into
  the serdes_config callback
- added an additional check on the phylink interface in mac_config
- default to ARCH_BCM_NSP instead of ARCH_BCM_IPROC which is really
  the NSP Kconfig bit we want

Florian Fainelli (5):
  net: dsa: b53: Add ability to enable/disable port interrupts
  net: dsa: b53: Make SRAB driver manage port interrupts
  net: dsa: b53: Add helper to set link parameters
  net: dsa: b53: Add PHYLINK support
  net: dsa: b53: Add SerDes support

 drivers/net/dsa/b53/Kconfig      |   7 +
 drivers/net/dsa/b53/Makefile     |   1 +
 drivers/net/dsa/b53/b53_common.c | 246 +++++++++++++++++++++++++++----
 drivers/net/dsa/b53/b53_priv.h   |  36 +++++
 drivers/net/dsa/b53/b53_serdes.c | 217 +++++++++++++++++++++++++++
 drivers/net/dsa/b53/b53_serdes.h | 121 +++++++++++++++
 drivers/net/dsa/b53/b53_srab.c   | 210 ++++++++++++++++++++++++++
 7 files changed, 809 insertions(+), 29 deletions(-)
 create mode 100644 drivers/net/dsa/b53/b53_serdes.c
 create mode 100644 drivers/net/dsa/b53/b53_serdes.h

-- 
2.17.1

^ permalink raw reply

* Re: [RFC PATCH bpf-next 0/4] tools/bpf: bpftool: add net support
From: Yonghong Song @ 2018-09-05 19:17 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180905195109.7fcc8a10@cakuba>

On 9/5/18 10:51 AM, Jakub Kicinski wrote:
> On Mon, 3 Sep 2018 11:26:43 -0700, Yonghong Song wrote:
>> The functionality to dump network driver and tc related bpf programs
>> are added. Currently, users can already use "ip link show <dev>"
>> and "tc filter show dev <dev> ..." to dump bpf program attachment
>> information for xdp programs and tc bpf programs.
>> The implementation here allows bpftool as a central place for
>> bpf introspection and users do not need to revert to other tools.
>> Also, we can make command simpler to dump bpf related information,
>> e.g., "bpftool net" is able to dump all xdp and tc bpf programs.
> 
> Why not implement this best-effort, unreliable (name spaces) additional
> output the same way we added bpffs support, make it a flag to existing
> list commands?

Do you mean to implement something like "bpftool -n prog" to show the
attachments for net-related bpf programs? I feel a separate command 
"net" is better since it intends to show the context of the bpf program
as the same program may be installed in different places. This is 
similar to other commands like "cgroup" and "perf".

> 
> My knee jerk reaction is that this is duplication of work.  iproute2 can
> show us the filters and xdp programs very easily.  Will we add programs
> attached to sockets as well?  And lwtunnels?  bpfilter?

This has been discussed in iovisor meeting, but let me summarize here.
I understand that iproute2 ip/tc can do exactly what I implemented here.
The implementation here is mostly from user experience point of view.

People worried about bpf performance/memory cost in the data center. So 
they often ask what bpf programs are running in any host and what is the
context (attachment point) of all bpf programs? Most these engineers
are not networking/bpf/kernel engineers. So yes, we will need
to add programs attached to sockets, lwtunnels and bpfilters etc. later.
This may be a lower priority for me now since FB does not use them yet.

Currently, we already use bpftool do prog/map perf/cgroup dumps. 
Extending bpftool is easier for user than using a different command
as not everybody is very familiar to esp. tc.

> 
> Would you be able to give us a convincing user scenario?  What kind of
> information is the user looking for?  Are there going to be other
> sub-commands to the 'net' object?

As of now, it just dumped bpf related information with prog_id (plus 
other attachment specific information) so users can correlate back to 
the program itself. No plan to add other sub-commands (except "dev 
<name>") at this point.

> 
>> For example,
>>
>>    $ bpftool net
>>    xdp [
>>    ]
>>    netdev_filters [
>>    ifindex 2 name handle_icmp flags direct-action flags_gen [not_in_hw ]
> 
> How do you handle shared blocks here?  Does the user really care about
> the flags?  What about ordering of filters?

shared block is not handled here. This can be added later.
yes, I can remove flags_gen. For "flags", may or may not. Will go 
through dumped info and remove those not really needed.

The order of filters will be based on ifindex first and inside ifindex,
attached to class first, attached to qdisc second, attached to 
root/clsact last.

> 
>>              prog_id 3194 tag 846d29c14d0d7d26 act []
>>    ifindex 2 name handle_egress flags direct-action flags_gen [not_in_hw ]
>>              prog_id 3193 tag 387d281be9fe77aa
>>    ]

^ permalink raw reply

* Re: [PATCH bpf-next 0/4] i40e AF_XDP zero-copy buffer leak fixes
From: Björn Töpel @ 2018-09-05 19:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: ast, Daniel Borkmann, Netdev, Jeff Kirsher, intel-wired-lan,
	Björn Töpel, Karlsson, Magnus, Magnus Karlsson
In-Reply-To: <20180905191437.35f7d049@cakuba>

Den ons 5 sep. 2018 kl 19:14 skrev Jakub Kicinski
<jakub.kicinski@netronome.com>:
>
> On Tue,  4 Sep 2018 20:11:01 +0200, Björn Töpel wrote:
> > From: Björn Töpel <bjorn.topel@intel.com>
> >
> > This series addresses an AF_XDP zero-copy issue that buffers passed
> > from userspace to the kernel was leaked when the hardware descriptor
> > ring was torn down.
> >
> > The patches fixes the i40e AF_XDP zero-copy implementation.
> >
> > Thanks to Jakub Kicinski for pointing this out!
> >
> > Some background for folks that don't know the details: A zero-copy
> > capable driver picks buffers off the fill ring and places them on the
> > hardware Rx ring to be completed at a later point when DMA is
> > complete. Similar on the Tx side; The driver picks buffers off the Tx
> > ring and places them on the Tx hardware ring.
> >
> > In the typical flow, the Rx buffer will be placed onto an Rx ring
> > (completed to the user), and the Tx buffer will be placed on the
> > completion ring to notify the user that the transfer is done.
> >
> > However, if the driver needs to tear down the hardware rings for some
> > reason (interface goes down, reconfiguration and such), the userspace
> > buffers cannot be leaked. They have to be reused or completed back to
> > userspace.
> >
> > The implementation does the following:
> >
> > * Outstanding Tx descriptors will be passed to the completion
> >   ring. The Tx code has back-pressure mechanism in place, so that
> >   enough empty space in the completion ring is guaranteed.
> >
> > * Outstanding Rx descriptors are temporarily stored on a stash/reuse
> >   queue. The reuse queue is based on Jakub's RFC. When/if the HW rings
> >   comes up again, entries from the stash are used to re-populate the
> >   ring.
> >
> > * When AF_XDP ZC is enabled, disallow changing the number of hardware
> >   descriptors via ethtool. Otherwise, the size of the stash/reuse
> >   queue can grow unbounded.
> >
> > Going forward, introducing a "zero-copy allocator" analogous to Jesper
> > Brouer's page pool would be a more robust and reuseable solution.
> >
> > Jakub: I've made a minor checkpatch-fix to your RFC, prior adding it
> > into this series.
>
> Thanks for the fix! :)
>
> Out of curiosity, did checking the reuse queue have a noticeable impact
> in your test (i.e. always using the _rq() helpers)?  You seem to be
> adding an indirect call, would that not be way worse on a retpoline
> kernel?

Do you mean the indirection in __i40e_alloc_rx_buffers_zc (patch #3)?
The indirect call is elided by the __always_inline -- without that
retpoline took 2.5Mpps worth of Rx. :-(

I'm only using the _rq helpers in the configuration/slow path, so the
fast-path is unchanged.


Björn

^ permalink raw reply

* [PATCH 5/7] get rid of tc_u_knode ->tp
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

not used anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 3311aacad6c3..8a1a573487bd 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,7 +68,6 @@ struct tc_u_knode {
 	u32			mask;
 	u32 __percpu		*pcpu_success;
 #endif
-	struct tcf_proto	*tp;
 	struct rcu_work		rwork;
 	/* The 'sel' field MUST be the last field in structure to allow for
 	 * tc_u32_keys allocated at end of structure.
@@ -897,7 +896,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 	/* Similarly success statistics must be moved as pointers */
 	new->pcpu_success = n->pcpu_success;
 #endif
-	new->tp = tp;
 	memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
 
 	if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
@@ -1113,7 +1111,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
 	n->flags = flags;
-	n->tp = tp;
 
 	err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	if (err < 0)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 1/7] fix hnode refcounting
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, stable
In-Reply-To: <20180905190134.GQ19965@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

cls_u32.c misuses refcounts for struct tc_u_hnode - it counts references via
->hlist and via ->tp_root together.  u32_destroy() drops the former and, in
case when there had been links, leaves the sucker on the list.  As the result,
there's nothing to protect it from getting freed once links are dropped.
That also makes the "is it busy" check incapable of catching the root hnode -
it *is* busy (there's a reference from tp), but we don't see it as something
separate.  "Is it our root?" check partially covers that, but the problem
exists for others' roots as well.

AFAICS, the minimal fix preserving the existing behaviour (where it doesn't
include oopsen, that is) would be this:
        * count tp->root and tp_c->hlist as separate references.  I.e.
have u32_init() set refcount to 2, not 1.
	* in u32_destroy() we always drop the former; in u32_destroy_hnode() -
the latter.

	That way we have *all* references contributing to refcount.  List
removal happens in u32_destroy_hnode() (called only when ->refcnt is 1)
an in u32_destroy() in case of tc_u_common going away, along with everything
reachable from it.  IOW, that way we know that u32_destroy_key() won't
free something still on the list (or pointed to by someone's ->root).

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index f218ccf1e2d9..3f985f29ef30 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -398,6 +398,7 @@ static int u32_init(struct tcf_proto *tp)
 	rcu_assign_pointer(tp_c->hlist, root_ht);
 	root_ht->tp_c = tp_c;
 
+	root_ht->refcnt++;
 	rcu_assign_pointer(tp->root, root_ht);
 	tp->data = tp_c;
 	return 0;
@@ -610,7 +611,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 	struct tc_u_hnode __rcu **hn;
 	struct tc_u_hnode *phn;
 
-	WARN_ON(ht->refcnt);
+	WARN_ON(--ht->refcnt);
 
 	u32_clear_hnode(tp, ht, extack);
 
@@ -649,7 +650,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 
 	WARN_ON(root_ht == NULL);
 
-	if (root_ht && --root_ht->refcnt == 0)
+	if (root_ht && --root_ht->refcnt == 1)
 		u32_destroy_hnode(tp, root_ht, extack);
 
 	if (--tp_c->refcnt == 0) {
@@ -698,7 +699,6 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 	}
 
 	if (ht->refcnt == 1) {
-		ht->refcnt--;
 		u32_destroy_hnode(tp, ht, extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
-- 
2.11.0

^ permalink raw reply related

* [PATCH 3/7] make sure that divisor is a power of 2
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 9ea5f2be907b..5816288810cc 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -995,7 +995,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (tb[TCA_U32_DIVISOR]) {
 		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
 
-		if (--divisor > 0x100) {
+		if (!is_power_of_2(divisor)) {
+			NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2");
+			return -EINVAL;
+		}
+		if (divisor-- > 0x100) {
 			NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
 			return -EINVAL;
 		}
-- 
2.11.0

^ permalink raw reply related

* [PATCH 6/7] get rid of tc_u_common ->rcu
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

unused

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 8a1a573487bd..be9240ae1417 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -98,7 +98,6 @@ struct tc_u_common {
 	int			refcnt;
 	struct idr		handle_idr;
 	struct hlist_node	hnode;
-	struct rcu_head		rcu;
 };
 
 static inline unsigned int u32_hash_fold(__be32 key,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 7/7] clean tc_u_common hashtable
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

* calculate key *once*, not for each hash chain element
* let tc_u_hash() return the pointer to chain head rather than index -
callers are cleaner that way.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index be9240ae1417..6d45ec4c218c 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -343,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp)
 		return block->q;
 }
 
-static unsigned int tc_u_hash(const struct tcf_proto *tp)
+static struct hlist_head *tc_u_hash(void *key)
 {
-	return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT);
+	return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT);
 }
 
-static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
+static struct tc_u_common *tc_u_common_find(void *key)
 {
 	struct tc_u_common *tc;
-	unsigned int h;
-
-	h = tc_u_hash(tp);
-	hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
-		if (tc->ptr == tc_u_common_ptr(tp))
+	hlist_for_each_entry(tc, tc_u_hash(key), hnode) {
+		if (tc->ptr == key)
 			return tc;
 	}
 	return NULL;
@@ -364,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
 static int u32_init(struct tcf_proto *tp)
 {
 	struct tc_u_hnode *root_ht;
-	struct tc_u_common *tp_c;
-	unsigned int h;
-
-	tp_c = tc_u_common_find(tp);
+	void *key = tc_u_common_ptr(tp);
+	struct tc_u_common *tp_c = tc_u_common_find(key);
 
 	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
 	if (root_ht == NULL)
@@ -389,8 +384,7 @@ static int u32_init(struct tcf_proto *tp)
 		INIT_HLIST_NODE(&tp_c->hnode);
 		idr_init(&tp_c->handle_idr);
 
-		h = tc_u_hash(tp);
-		hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
+		hlist_add_head(&tp_c->hnode, tc_u_hash(key));
 	}
 
 	tp_c->refcnt++;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 2/7] mark root hnode explicitly
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

... and disallow deleting or linking to such

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 3f985f29ef30..9ea5f2be907b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -91,6 +91,7 @@ struct tc_u_hnode {
 	 */
 	struct tc_u_knode __rcu	*ht[1];
 };
+#define TCA_CLS_FLAGS_U32_ROOT (1<<8)
 
 struct tc_u_common {
 	struct tc_u_hnode __rcu	*hlist;
@@ -377,6 +378,7 @@ static int u32_init(struct tcf_proto *tp)
 	root_ht->refcnt++;
 	root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
 	root_ht->prio = tp->prio;
+	root_ht->flags = TCA_CLS_FLAGS_U32_ROOT;
 	idr_init(&root_ht->handle_idr);
 
 	if (tp_c == NULL) {
@@ -491,7 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack);
+	tc_cls_common_offload_init(&cls_u32.common, tp,
+		h->flags & ~TCA_CLS_FLAGS_U32_ROOT, extack);
 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = h->divisor;
 	cls_u32.hnode.handle = h->handle;
@@ -693,7 +696,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 		goto out;
 	}
 
-	if (root_ht == ht) {
+	if (ht->flags & TCA_CLS_FLAGS_U32_ROOT) {
 		NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
 		return -EINVAL;
 	}
@@ -795,6 +798,10 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 				NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
 				return -EINVAL;
 			}
+			if (ht_down->flags & TCA_CLS_FLAGS_U32_ROOT) {
+				NL_SET_ERR_MSG_MOD(extack, "Not linke to root node");
+				return -EINVAL;
+			}
 			ht_down->refcnt++;
 		}
 
@@ -1214,7 +1221,8 @@ static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 	struct tc_cls_u32_offload cls_u32 = {};
 	int err;
 
-	tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack);
+	tc_cls_common_offload_init(&cls_u32.common, tp,
+		ht->flags & ~TCA_CLS_FLAGS_U32_ROOT, extack);
 	cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE;
 	cls_u32.hnode.divisor = ht->divisor;
 	cls_u32.hnode.handle = ht->handle;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 4/7] get rid of unused argument of u32_destroy_key()
From: Al Viro @ 2018-09-05 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 5816288810cc..3311aacad6c3 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -406,8 +406,7 @@ static int u32_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
-			   bool free_pf)
+static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
 {
 	struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
 
@@ -441,7 +440,7 @@ static void u32_delete_key_work(struct work_struct *work)
 					      struct tc_u_knode,
 					      rwork);
 	rtnl_lock();
-	u32_destroy_key(key->tp, key, false);
+	u32_destroy_key(key, false);
 	rtnl_unlock();
 }
 
@@ -458,7 +457,7 @@ static void u32_delete_key_freepf_work(struct work_struct *work)
 					      struct tc_u_knode,
 					      rwork);
 	rtnl_lock();
-	u32_destroy_key(key->tp, key, true);
+	u32_destroy_key(key, true);
 	rtnl_unlock();
 }
 
@@ -602,7 +601,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 			if (tcf_exts_get_net(&n->exts))
 				tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
 			else
-				u32_destroy_key(n->tp, n, true);
+				u32_destroy_key(n, true);
 		}
 	}
 }
@@ -972,13 +971,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
-			u32_destroy_key(tp, new, false);
+			u32_destroy_key(new, false);
 			return err;
 		}
 
 		err = u32_replace_hw_knode(tp, new, flags, extack);
 		if (err) {
-			u32_destroy_key(tp, new, false);
+			u32_destroy_key(new, false);
 			return err;
 		}
 
-- 
2.11.0

^ permalink raw reply related

* [PATCHES] cls_u32 cleanups and fixes
From: Al Viro @ 2018-09-05 19:01 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko

	Several cls_u32 patches: fixing refcounting, preventing
links to and deletion of root hnodes, validating divisor.  Plus
cleanups - removal of some useless fields and saner handling of
tc_u_common hashtable.  The first 3 in series are fixes (and
-stable fodder), the rest - cleanups.  Branch can be found in
vfs.git#misc.cls_u32; individual patches will go in followups.

^ permalink raw reply

* Re: [Patch nf] xt_hashlimit: use s->file instead of s->private
From: Christoph Hellwig @ 2018-09-05 18:49 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, netfilter-devel, hvtaifwkbgefbaei, Christoph Hellwig,
	Pablo Neira Ayuso
In-Reply-To: <20180905184131.10269-1-xiyou.wangcong@gmail.com>

On Wed, Sep 05, 2018 at 11:41:31AM -0700, Cong Wang wrote:
> After switching to the new procfs API, it is supposed to
> retrieve the private pointer from PDE_DATA(file_inode(s->file)),
> s->private is no longer referred.

Thanks fixing this faster than I could even look into the issue:

Acked-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* [Patch nf] xt_hashlimit: use s->file instead of s->private
From: Cong Wang @ 2018-09-05 18:41 UTC (permalink / raw)
  To: netdev
  Cc: netfilter-devel, hvtaifwkbgefbaei, Cong Wang, Christoph Hellwig,
	Pablo Neira Ayuso

After switching to the new procfs API, it is supposed to
retrieve the private pointer from PDE_DATA(file_inode(s->file)),
s->private is no longer referred.

Fixes: 1cd671827290 ("netfilter/x_tables: switch to proc_create_seq_private")
Reported-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/netfilter/xt_hashlimit.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9b16402f29af..3e7d259e5d8d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 	__acquires(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket;
 
 	spin_lock_bh(&htable->lock);
@@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 
 	*pos = ++(*bucket);
@@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 static void dl_seq_stop(struct seq_file *s, void *v)
 	__releases(htable->lock)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 
 	if (!IS_ERR(bucket))
@@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 			    struct seq_file *s)
 {
-	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
@@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 
 static int dl_seq_show_v2(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = (unsigned int *)v;
 	struct dsthash_ent *ent;
 
@@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
 
 static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
@@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 
 static int dl_seq_show(struct seq_file *s, void *v)
 {
-	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+	struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
 	unsigned int *bucket = v;
 	struct dsthash_ent *ent;
 
-- 
2.14.4

^ permalink raw reply related

* Re: 4.18.6 dl_seq_start [xt_hashlimit] unable to handle kernel NULL pointer dereference at 0000000000000050
From: Cong Wang @ 2018-09-05 18:24 UTC (permalink / raw)
  To: hvtaifwkbgefbaei, Linux Kernel Network Developers,
	Christoph Hellwig
In-Reply-To: <20180905105528.mz4msuot5n73ofcj@m.mifar.in>

On Wed, Sep 5, 2018 at 4:06 AM Sami Farin <hvtaifwkbgefbaei@gmail.com> wrote:
>
> 4.17 worked ok, this with 32 GB Ryzen system.
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000050
> PGD 0 P4D 0
> Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 0 PID: 6303 Comm: grep Tainted: G                T 4.18.6+ #16
> Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X370 Taichi, BIOS P4.60 03/02/2018
> RIP: 0010:dl_seq_start+0x11/0x60 [xt_hashlimit]


Looks like we need to do a s/s->private/s->file/.


> Code: ff 5d 41 5c 41 5d 41 5e 41 5f c3 0f 1f 40 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 f5 53 48 8b 87 d8 00 00 00 <48> 8b 78 50 e8 36 3b 6f de 48 89 c3 48 8d 78 48 e8 ca e6 0a df 8b
> RSP: 0018:ffffa79e88befde0 EFLAGS: 00010246
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: ffffa79e88befe18 RDI: ffff9a64733417a0
> RBP: ffffa79e88befe18 R08: 0000000000000000 R09: 00000000657547bf
> R10: ffffffff9f2bf98a R11: ffff9a6470f5a6c0 R12: ffffa79e88befeb0
> R13: ffff9a6471879200 R14: ffff9a6471879200 R15: ffff9a64733417a0
> FS:  00007f6798784740(0000) GS:ffff9a649ea00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000050 CR3: 00000007f335c000 CR4: 00000000003406f0
> Call Trace:
>  seq_read+0xc0/0x470
>  proc_reg_read+0x49/0x70
>  vfs_read+0x8a/0x140
>  ksys_read+0x52/0xc0
>  do_syscall_64+0x6f/0x353
>  ? trace_hardirqs_off_thunk+0x1a/0x1c
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x7f67980eee21
> Code: fe ff ff 50 48 8d 3d 46 b6 09 00 e8 f9 04 02 00 66 0f 1f 84 00 00 00 00 00 48 8d 05 c1 3b 2d 00 8b 00 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 c3 66 0f 1f 44 00 00 41 54 49 89 d4 55 48
> RSP: 002b:00007ffc314f7a68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 0000000000008000 RCX: 00007f67980eee21
> RDX: 0000000000008000 RSI: 000055f0d317f000 RDI: 0000000000000003
> RBP: 0000000000008000 R08: 0000000000000000 R09: 0000000000009008
> R10: 0000000000000000 R11: 0000000000000246 R12: 000055f0d317f000
> R13: 0000000000000003 R14: 000055f0d317e830 R15: 0000000000000003
> Modules linked in: arptable_filter arp_tables nfnetlink_acct ip6table_mangle nf_log_ipv6 xt_hl ip6t_REJECT nf_reject_ipv6 ip6t_rt ip6table_filter ip6_tables ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat iptable_raw xt_mark xt_connmark iptable_mangle nf_log_ipv4 nf_log_common xt_LOG xt_length xt_limit ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_connlimit nf_conncount xt_multiport xt_hashlimit xt_owner xt_set xt_conntrack iptable_filter ip_set_bitmap_port ip_set_hash_mac ip_set_hash_net ip_set nf_conntrack_netlink nfnetlink bnep hwmon_vid iwlmvm snd_usb_audio snd_usbmidi_lib snd_hwdep snd_rawmidi mac80211 iwlwifi btusb btrtl kvm_amd btbcm btintel bluetooth kvm cfg80211 ecdh_generic irqbypass sp5100_tco wmi_bmof k10temp i2c_piix4 snd_hda_codec_realtek rfkill snd_hda_codec_generic
>  snd_hda_codec_hdmi snd_hda_intel snd_hda_codec snd_hda_core rtc_cmos acpi_cpufreq binfmt_misc snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device snd_pcm tcp_cubic tcp_westwood br_netfilter bridge stp llc ip_tables scsi_transport_iscsi algif_skcipher af_alg uas usb_storage usbhid mxm_wmi ccp igb xhci_pci xhci_hcd usbcore usb_common wmi button 8021q mrp sunrpc snd_timer snd soundcore fuse tun xt_tcpudp x_tables tcp_bbr nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack sch_fq_codel sch_htb sch_pie analog gameport joydev i2c_dev ecryptfs autofs4 amdkfd amd_iommu_v2 [last unloaded: pcspkr]
> CR2: 0000000000000050
> ---[ end trace 0e097a943554aa36 ]---
>
>
> --
> Do what you love because life is too short for anything else.
> https://samifar.in/
>

^ permalink raw reply

* Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge
From: D'Souza, Nelson @ 2018-09-05 18:00 UTC (permalink / raw)
  To: David Ahern, netdev@vger.kernel.org
In-Reply-To: <86763041-3D2C-482B-B4FD-4C079C68D431@ciena.com>

Hi David,

Just following up.... would you be able to confirm that this is a Linux VRF issue? 

Also, how do I log a VRF related defect to ensure this gets resolved in a subsequent release.

Thanks,
Nelson

On 8/2/18, 4:12 PM, "D'Souza, Nelson" <ndsouza@ciena.com> wrote:

    Hi David,
    
    Turns out the VRF bridge Rx issue is triggered by a docker install.
    
    Docker makes the following sysctl changes:
      net.bridge.bridge-nf-call-arptables = 1
      net.bridge.bridge-nf-call-ip6tables = 1
      net.bridge.bridge-nf-call-iptables = 1     <<< exposes the ipv4 VRF Rx issue when a bridge is enslaved to a VRF
    
    which causes packets flowing through all bridges to be subjected to netfilter rules. This is required for bridge net filtering when ip forwarding is enabled.
    
    Please refer to https://github.com/docker/libnetwork/blob/master/drivers/bridge/setup_bridgenetfiltering.go#L53
    
    Setting net.bridge.bridge-nf-call-iptables = 0 resolves the issue, but is not really a viable option given that bridge net filtering is a basic requirement in existing docker deployments.
    
    It's not clear to me why this conf setting breaks local Rx delivery for a bridge enslaved to a VRF, because these packets would always be sent up by the bridge for IP netfilter processing.
    
    This issue is easily reproducible on an Ubuntu 18.04.1 VM. Simply installing docker will cause pings running on test-vrf to fail. Clearing the sysctl conf restores Rx local delivery.
    
    Thanks,
    Nelson
    
    On 7/27/18, 4:29 PM, "D'Souza, Nelson" <ndsouza@ciena.com> wrote:
    
        David,
        
        With Ubuntu 18.04.1 (kernel 4.15.0-29) pings sent out on test-vrf and br0 are successful.
        
        # uname -rv
        4.15.0-29-generic #31-Ubuntu SMP Tue Jul 17 15:39:52 UTC 2018
        
        # ping -c 1 -I test-vrf 172.16.2.2
        ping: Warning: source address might be selected on device other than test-vrf.
        PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 test-vrf: 56(84) bytes of data.
        64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.050 ms
        
        --- 172.16.2.2 ping statistics ---
        1 packets transmitted, 1 received, 0% packet loss, time 0ms
        rtt min/avg/max/mdev = 0.050/0.050/0.050/0.000 ms
        
        # ping -c 1 -I br0 172.16.2.2
        PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 br0: 56(84) bytes of data.
        64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.026 ms
        
        --- 172.16.2.2 ping statistics ---
        1 packets transmitted, 1 received, 0% packet loss, time 0ms
        rtt min/avg/max/mdev = 0.026/0.026/0.026/0.000 ms
        
        However, with Ubuntu 17.10.1 (kernel  4.13.0-21) pings on only test-vrf are successful. Pings on br0 are not successful.
        So it seems like there maybe a change in versions after 4.13.0-21 that causes pings on br0 to pass.
        
        Nelson
        
        On 7/25/18, 5:35 PM, "D'Souza, Nelson" <ndsouza@ciena.com> wrote:
        
            David, 
            
            I tried out the commands on an Ubuntu 17.10.1 VM.
            The pings on test-vrf are successful, but the pings on br0 are not successful.
            
            # uname -rv  
            4.13.0-21-generic #24-Ubuntu SMP Mon Dec 18 17:29:16 UTC 2017
            
             # lsb_release -a
            No LSB modules are available.
            Distributor ID:	Ubuntu
            Description:	Ubuntu 17.10    
            Release:	17.10
            Codename:	artful
            
            # ip rule  --> Note: its missing the l3mdev rule
            0:	from all lookup local 
            32766:	from all lookup main 
            32767:	from all lookup default
            
            Ran the configs from a bash script vrf.sh
            
             # ./vrf.sh 
            + ip netns add foo
            + ip li add veth1 type veth peer name veth2
            + ip li set veth2 netns foo
            + ip -netns foo li set lo up
            + ip -netns foo li set veth2 up
            + ip -netns foo addr add 172.16.1.2/24 dev veth2
            + ip li add test-vrf type vrf table 123
            + ip li set test-vrf up
            + ip ro add vrf test-vrf unreachable default
            + ip li add br0 type bridge
            + ip li set veth1 master br0
            + ip li set veth1 up
            + ip li set br0 up
            + ip addr add dev br0 172.16.1.1/24
            + ip li set br0 master test-vrf
            + ip -netns foo addr add 172.16.2.2/32 dev lo
            + ip ro add vrf test-vrf 172.16.2.2/32 via 172.16.1.2
            
            # ping -I test-vrf 172.16.2.2 -c 2  <<< successful on test-vrf
            ping: Warning: source address might be selected on device other than test-vrf.
            PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 test-vrf: 56(84) bytes of data.
            64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.035 ms
            64 bytes from 172.16.2.2: icmp_seq=2 ttl=64 time=0.045 ms
            
            --- 172.16.2.2 ping statistics ---
            2 packets transmitted, 2 received, 0% packet loss, time 1022ms
            rtt min/avg/max/mdev = 0.035/0.040/0.045/0.005 ms
            
            #ping -I br0 172.16.2.2 -c 2   <<< fails on br0
            PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 br0: 56(84) bytes of data.
            
            --- 172.16.2.2 ping statistics ---
            2 packets transmitted, 0 received, 100% packet loss, time 1022ms
            
            Please let me know if I should try a different version.
            
            Nelson
            
            On 7/24/18, 9:08 AM, "D'Souza, Nelson" <ndsouza@ciena.com> wrote:
            
                It's strange that enslaving eth1 -> br0 -> test-vrf does not work, but enslaving eth1->test-vrf works fine.
                
                Nelson
                
                On 7/24/18, 8:58 AM, "D'Souza, Nelson" <ndsouza@ciena.com> wrote:
                
                    Thank you David, really appreciate the help. Most likely something specific to my environment.
                    
                    ip vrf id, does not report anything on my system. Here's the result after running the command.
                    
                    # ip vrf id
                    #
                    
                    I'll follow up with a VM.
                    
                    Nelson
                    
                    On 7/24/18, 5:55 AM, "David Ahern" <dsa@cumulusnetworks.com> wrote:
                    
                        On 7/23/18 7:43 PM, D'Souza, Nelson wrote:
                        > I copy and pasted the configs onto my device, but pings on test-vrf do not work in my setup. 
                        > I'm essentially seeing the same issue as I reported before.
                        > 
                        > In this case, pings sent out on test-vrf (host ns) are received and replied to by the loopback interface (foo ns). Although the replies are seen at the test-vrf level, they are not locally delivered to the ping application.
                        > 
                        
                        I just built v4.14.52 kernel and ran those commands - worked fine. It is
                        something specific to your environment. Is your shell tied to a VRF --
                        (ip vrf id)?
                        
                        After that, I suggest you create a VM running a newer distribution of
                        your choice (Ubuntu 17.10 or newer, debian stretch with 4.14 kernel, or
                        Fedora 26 or newer) and run the commands there.
                        
                    
                    
                
                
            
            
        
        
    
    


^ permalink raw reply

* Re: BUG: unable to handle kernel paging request in fib6_node_lookup_1
From: Song Liu @ 2018-09-05 18:12 UTC (permalink / raw)
  To: David Ahern; +Cc: Networking, weiwan@google.com, Eric Dumazet
In-Reply-To: <54a2ac86-2e61-f7ba-c1c1-bd276b0396d2@gmail.com>



> On Sep 5, 2018, at 10:32 AM, David Ahern <dsahern@gmail.com> wrote:
> 
> On 9/5/18 12:11 AM, Song Liu wrote:
>> We are debugging an issue with fib6_node_lookup_1(). 
>> 
>> We use a 4.16 based kernel, and we have back ported most upstream
>> patches in ip6_fib.{c.h}. The only major differences I can spot are
>> 
> 
> Did you backport all patches in each set that included a change to those
> files, or just the patches to ip6_fib.* and any dependencies?

I believe we always try back port all patches in each set. But we have back
ported hundreds of patches to our 4.16 tree, so it is also likely we missed
some useful patches. 

Thanks,
Song

^ permalink raw reply

* Re: [PATCH mlx5-next v1 05/15] net/mlx5: Break encap/decap into two separated flow table creation flags
From: Leon Romanovsky @ 2018-09-05 18:11 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Ariel Levkovich, Mark Bloch,
	Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <20180905163800.GA21028@ziepe.ca>

[-- Attachment #1: Type: text/plain, Size: 1039 bytes --]

On Wed, Sep 05, 2018 at 10:38:00AM -0600, Jason Gunthorpe wrote:
> On Wed, Sep 05, 2018 at 08:10:25AM +0300, Leon Romanovsky wrote:
> > > > -	int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
> > > > +	int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP);
> > > > +	int en_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
> > >
> > > Yuk, please don't use !!.
> > >
> > > 	bool en_decap = flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
> >
> > We need to provide en_encap and en_decap as an input to MLX5_SET(...)
> > which is passed to FW as 0 or 1.
> >
> > Boolean type is declared in C as int and treated as zero for false
> > and any other value for true,
>
> No, that isn't right, the kernel uses C99's _Bool intrinsic type, which
> is guaranteed to only hold 0 or 1 by the compiler.
>
> See types.h:
>
> typedef _Bool                   bool;

Exciting, it took me a while to find C99 standard and relevant 6.3.1.2.
Anyway, this patch didn't change previous functionality, which used "!!"
convention.

Thanks

>
> Jason

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: BUG: unable to handle kernel paging request in fib6_node_lookup_1
From: Song Liu @ 2018-09-05 18:10 UTC (permalink / raw)
  To: Wei Wang; +Cc: Linux Kernel Network Developers, David Ahern, Eric Dumazet
In-Reply-To: <CAEA6p_B7JYUhV+QB+7EWcs74oUz8VYDQgtmKiNHef5vsEBi7Lw@mail.gmail.com>



> On Sep 5, 2018, at 10:09 AM, Wei Wang <weiwan@google.com> wrote:
> 
> On Tue, Sep 4, 2018 at 11:11 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> We are debugging an issue with fib6_node_lookup_1().
>> 
>> We use a 4.16 based kernel, and we have back ported most upstream
>> patches in ip6_fib.{c.h}. The only major differences I can spot are
>> 
>> 8b7f2731bd68d83940714ce92381d1a72596407c
>> c3506372277779fccbffee2475400fcd689d5738
>> 
>> I guess the issue is not related to these two fixes.
>> 
>> After staring at the call trace and disassembly code (attached below)
>> I guess this is a use-after-free issue in (or right after) the lookup
>> loop:
>> 
>>        for (;;) {
>>                struct fib6_node *next;
>> 
>>                dir = addr_bit_set(args->addr, fn->fn_bit);
>> 
>>                next = dir ? rcu_dereference(fn->right) :
>>                             rcu_dereference(fn->left);
>> 
>>                if (next) {
>>                        fn = next;
>>                        continue;
>>                }
>>                break;
>>        }
>> 
>> I guess this probably also happens to latest upstream. I haven't
>> tested this with upstream kernel (or net tree) yet, because we
>> can only trigger this about once a week on 100 servers.
>> 
>> Does this look familiar? Any comments and/or suggestions are highly
>> appreciated.
>> 
> By glancing at the commit logs, I don't think any changes were made
> regarding the core logic of fib6_node handling recently.
> (There were a couple of fixes regarding fib6_info but I don't think it
> is the cause here... But it is still good to check if you have commit
> 9b0a8da8c4c6, e873e4b9cc7e, e70a3aad44cc in your build.)

Looks like we don't have e70a3aad44cc. I think it fixes a memory leak 
(instead of a use-after-free)? Let me add it and run some tests anyway. 
Thanks a lot for this information. 

> 
> I also went through the call path and did not find anything obviously wrong...
> I think it's the best for you to reproduce it and we can debug further.
> One question is, do you have "CONFIG_IPV6_SUBTREE" enabled and specify
> src IP in the routing table?

We do have CONFIG_IPV6_SUBTREE enabled. But we usually do not specify
src IP in the routing table. 

Let me try to reproduce it. 

Thanks again,
Song

^ permalink raw reply

* Re: [PATCH rdma-next v1 12/15] RDMA/mlx5: Add a new flow action verb - modify header
From: Leon Romanovsky @ 2018-09-05 18:02 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Ariel Levkovich, Mark Bloch,
	Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <20180905163842.GB21028@ziepe.ca>

[-- Attachment #1: Type: text/plain, Size: 1825 bytes --]

On Wed, Sep 05, 2018 at 10:38:42AM -0600, Jason Gunthorpe wrote:
> On Wed, Sep 05, 2018 at 08:20:44AM +0300, Leon Romanovsky wrote:
> > On Tue, Sep 04, 2018 at 03:58:23PM -0600, Jason Gunthorpe wrote:
> > > On Tue, Aug 28, 2018 at 02:18:51PM +0300, Leon Romanovsky wrote:
> > >
> > > > +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
> > > > +	struct ib_uverbs_file *file,
> > > > +	struct uverbs_attr_bundle *attrs)
> > > > +{
> > > > +	struct ib_uobject *uobj = uverbs_attr_get_uobject(
> > > > +		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
> > > > +	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
> > > > +	enum mlx5_ib_uapi_flow_table_type ft_type;
> > > > +	struct ib_flow_action *action;
> > > > +	size_t num_actions;
> > > > +	void *in;
> > > > +	int len;
> > > > +	int ret;
> > > > +
> > > > +	if (!mlx5_ib_modify_header_supported(mdev))
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	in = uverbs_attr_get_alloced_ptr(attrs,
> > > > +		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
> > > > +	len = uverbs_attr_get_len(attrs,
> > > > +		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
> > > > +
> > > > +	if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto))
> > > > +		return -EINVAL;
> > > > +
> > > > +	ret = uverbs_get_const(&ft_type, attrs,
> > > > +			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
> > > > +	if (ret)
> > > > +		return -EINVAL;
> > >
> > > This should be
> > >
> > > 	if (ret)
> > > 		return ret;
> > >
> > > Every call to uverbs_get_const is wrong in this same way..
> >
> > Right, from technical point of view uverbs_get_const can return EINVAL
> > only, and it is correct for now, but need to be changed to proper
> > "return ret".
>
> No, it can return ENOENT as well.

Ahh, right, the "|| !def_val" part.

Thanks

>
> Jason

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH ethtool] ethtool: support combinations of FEC modes
From: Edward Cree @ 2018-09-05 17:54 UTC (permalink / raw)
  To: John W. Linville, netdev
  Cc: Ganesh Goudar, Jakub Kicinski, Dustin Byford, Dirk van der Merwe

Of the three drivers that currently support FEC configuration, two (sfc
 and cxgb4[vf]) accept configurations with more than one bit set in the
 feccmd.fec bitmask.  (The precise semantics of these combinations vary.)
Thus, this patch adds the ability to specify such combinations through a
 comma-separated list of FEC modes in the 'encoding' argument on the
 command line.

Also adds --set-fec tests to test-cmdline.c, and corrects the man page
 (the encoding argument is not optional) while updating it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
I've CCed the maintainers of the other drivers (cxgb4, nfp) that support
 --set-fec, in case they have opinions on this.
I'm not totally happy with the man page changebar; it might be clearer
 just to leave the comma-less version in the syntax synopsis and only
 mention the commas in the running-text.

 ethtool.8.in   | 11 ++++++++---
 ethtool.c      | 50 +++++++++++++++++++++++++++++++++++++++-----------
 test-cmdline.c |  9 +++++++++
 3 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index c8a902a..414eaa1 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -389,7 +389,8 @@ ethtool \- query or control network driver and hardware settings
 .HP
 .B ethtool \-\-set\-fec
 .I devname
-.B4 encoding auto off rs baser
+.B encoding
+.BR auto | off | rs | baser [ , ...]
 .
 .\" Adjust lines (i.e. full justification) and hyphenate.
 .ad
@@ -1119,8 +1120,12 @@ current FEC mode, the driver or firmware must take the link down
 administratively and report the problem in the system logs for users to correct.
 .RS 4
 .TP
-.A4 encoding auto off rs baser
-Sets the FEC encoding for the device.
+.BR encoding\ auto | off | rs | baser [ , ...]
+
+Sets the FEC encoding for the device.  Combinations of options are specified as
+e.g.
+.B auto,rs
+; the semantics of such combinations vary between drivers.
 .TS
 nokeep;
 lB	l.
diff --git a/ethtool.c b/ethtool.c
index e8b7703..9997930 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -4967,20 +4967,48 @@ static int do_set_phy_tunable(struct cmd_context *ctx)
 
 static int fecmode_str_to_type(const char *str)
 {
+	if (!strcasecmp(str, "auto"))
+		return ETHTOOL_FEC_AUTO;
+	if (!strcasecmp(str, "off"))
+		return ETHTOOL_FEC_OFF;
+	if (!strcasecmp(str, "rs"))
+		return ETHTOOL_FEC_RS;
+	if (!strcasecmp(str, "baser"))
+		return ETHTOOL_FEC_BASER;
+
+	return 0;
+}
+
+/* Takes a comma-separated list of FEC modes, returns the bitwise OR of their
+ * corresponding ETHTOOL_FEC_* constants.
+ * Accepts repetitions (e.g. 'auto,auto') and trailing comma (e.g. 'off,').
+ */
+static int parse_fecmode(const char *str)
+{
 	int fecmode = 0;
+	char buf[6];
 
 	if (!str)
-		return fecmode;
-
-	if (!strcasecmp(str, "auto"))
-		fecmode |= ETHTOOL_FEC_AUTO;
-	else if (!strcasecmp(str, "off"))
-		fecmode |= ETHTOOL_FEC_OFF;
-	else if (!strcasecmp(str, "rs"))
-		fecmode |= ETHTOOL_FEC_RS;
-	else if (!strcasecmp(str, "baser"))
-		fecmode |= ETHTOOL_FEC_BASER;
+		return 0;
+	while (*str) {
+		size_t next;
+		int mode;
 
+		next = strcspn(str, ",");
+		if (next >= 6) /* Bad mode, longest name is 5 chars */
+			return 0;
+		/* Copy into temp buffer and nul-terminate */
+		memcpy(buf, str, next);
+		buf[next] = 0;
+		mode = fecmode_str_to_type(buf);
+		if (!mode) /* Bad mode encountered */
+			return 0;
+		fecmode |= mode;
+		str += next;
+		/* Skip over ',' (but not nul) */
+		if (*str)
+			str++;
+	}
 	return fecmode;
 }
 
@@ -5028,7 +5056,7 @@ static int do_sfec(struct cmd_context *ctx)
 	if (!fecmode_str)
 		exit_bad_args();
 
-	fecmode = fecmode_str_to_type(fecmode_str);
+	fecmode = parse_fecmode(fecmode_str);
 	if (!fecmode)
 		exit_bad_args();
 
diff --git a/test-cmdline.c b/test-cmdline.c
index a94edea..9c51cca 100644
--- a/test-cmdline.c
+++ b/test-cmdline.c
@@ -266,6 +266,15 @@ static struct test_case {
 	{ 0, "--set-eee devname tx-timer 42 advertise 0x4321" },
 	{ 1, "--set-eee devname tx-timer foo" },
 	{ 1, "--set-eee devname advertise foo" },
+	{ 1, "--set-fec devname" },
+	{ 0, "--set-fec devname encoding auto" },
+	{ 0, "--set-fec devname encoding off," },
+	{ 0, "--set-fec devname encoding baser,rs" },
+	{ 0, "--set-fec devname encoding auto,auto," },
+	{ 1, "--set-fec devname encoding foo" },
+	{ 1, "--set-fec devname encoding auto,foo" },
+	{ 1, "--set-fec devname encoding auto,," },
+	{ 1, "--set-fec devname auto" },
 	/* can't test --set-priv-flags yet */
 	{ 0, "-h" },
 	{ 0, "--help" },

^ permalink raw reply related

* Re: [RFC PATCH bpf-next 0/4] tools/bpf: bpftool: add net support
From: Jakub Kicinski @ 2018-09-05 17:51 UTC (permalink / raw)
  To: Yonghong Song; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180903182647.1244630-1-yhs@fb.com>

On Mon, 3 Sep 2018 11:26:43 -0700, Yonghong Song wrote:
> The functionality to dump network driver and tc related bpf programs
> are added. Currently, users can already use "ip link show <dev>"
> and "tc filter show dev <dev> ..." to dump bpf program attachment
> information for xdp programs and tc bpf programs.
> The implementation here allows bpftool as a central place for
> bpf introspection and users do not need to revert to other tools.
> Also, we can make command simpler to dump bpf related information,
> e.g., "bpftool net" is able to dump all xdp and tc bpf programs.

Why not implement this best-effort, unreliable (name spaces) additional
output the same way we added bpffs support, make it a flag to existing
list commands?

My knee jerk reaction is that this is duplication of work.  iproute2 can
show us the filters and xdp programs very easily.  Will we add programs
attached to sockets as well?  And lwtunnels?  bpfilter?

Would you be able to give us a convincing user scenario?  What kind of
information is the user looking for?  Are there going to be other
sub-commands to the 'net' object?

> For example,
> 
>   $ bpftool net
>   xdp [
>   ]
>   netdev_filters [
>   ifindex 2 name handle_icmp flags direct-action flags_gen [not_in_hw ]

How do you handle shared blocks here?  Does the user really care about
the flags?  What about ordering of filters?

>             prog_id 3194 tag 846d29c14d0d7d26 act []
>   ifindex 2 name handle_egress flags direct-action flags_gen [not_in_hw ]
>             prog_id 3193 tag 387d281be9fe77aa
>   ]     

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox