* [PATCH net-next 3/5] net: dsa: b53: Add helper to set link parameters
From: Florian Fainelli @ 2018-09-04 22:11 UTC (permalink / raw)
To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180904221120.13018-1-f.fainelli@gmail.com>
Extract the logic from b53_adjust_link() responsible for overriding a
given port's link, speed, duplex and pause settings and make two helper
functions to set the port's configuration and the port's link settings.
We will make use of both, as separate functions while adding PHYLINK
support next.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/b53/b53_common.c | 89 +++++++++++++++++++++-----------
1 file changed, 60 insertions(+), 29 deletions(-)
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 85ed264bc163..78aeaccf19a1 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/platform_data/b53.h>
#include <linux/phy.h>
+#include <linux/phylink.h>
#include <linux/etherdevice.h>
#include <linux/if_bridge.h>
#include <net/dsa.h>
@@ -947,33 +948,50 @@ static int b53_setup(struct dsa_switch *ds)
return ret;
}
-static void b53_adjust_link(struct dsa_switch *ds, int port,
- struct phy_device *phydev)
+static void b53_force_link(struct b53_device *dev, int port, int link)
{
- struct b53_device *dev = ds->priv;
- struct ethtool_eee *p = &dev->ports[port].eee;
- u8 rgmii_ctrl = 0, reg = 0, off;
-
- if (!phy_is_pseudo_fixed_link(phydev))
- return;
+ u8 reg, val, off;
/* Override the port settings */
if (port == dev->cpu_port) {
off = B53_PORT_OVERRIDE_CTRL;
- reg = PORT_OVERRIDE_EN;
+ val = PORT_OVERRIDE_EN;
} else {
off = B53_GMII_PORT_OVERRIDE_CTRL(port);
- reg = GMII_PO_EN;
+ val = GMII_PO_EN;
}
- /* Set the link UP */
- if (phydev->link)
+ b53_read8(dev, B53_CTRL_PAGE, off, ®);
+ reg |= val;
+ if (link)
reg |= PORT_OVERRIDE_LINK;
+ else
+ reg &= ~PORT_OVERRIDE_LINK;
+ b53_write8(dev, B53_CTRL_PAGE, off, reg);
+}
+
+static void b53_force_port_config(struct b53_device *dev, int port,
+ int speed, int duplex, int pause)
+{
+ u8 reg, val, off;
+
+ /* Override the port settings */
+ if (port == dev->cpu_port) {
+ off = B53_PORT_OVERRIDE_CTRL;
+ val = PORT_OVERRIDE_EN;
+ } else {
+ off = B53_GMII_PORT_OVERRIDE_CTRL(port);
+ val = GMII_PO_EN;
+ }
- if (phydev->duplex == DUPLEX_FULL)
+ b53_read8(dev, B53_CTRL_PAGE, off, ®);
+ reg |= val;
+ if (duplex == DUPLEX_FULL)
reg |= PORT_OVERRIDE_FULL_DUPLEX;
+ else
+ reg &= ~PORT_OVERRIDE_FULL_DUPLEX;
- switch (phydev->speed) {
+ switch (speed) {
case 2000:
reg |= PORT_OVERRIDE_SPEED_2000M;
/* fallthrough */
@@ -987,21 +1005,41 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
reg |= PORT_OVERRIDE_SPEED_10M;
break;
default:
- dev_err(ds->dev, "unknown speed: %d\n", phydev->speed);
+ dev_err(dev->dev, "unknown speed: %d\n", speed);
return;
}
+ if (pause & MLO_PAUSE_RX)
+ reg |= PORT_OVERRIDE_RX_FLOW;
+ if (pause & MLO_PAUSE_TX)
+ reg |= PORT_OVERRIDE_TX_FLOW;
+
+ b53_write8(dev, B53_CTRL_PAGE, off, reg);
+}
+
+static void b53_adjust_link(struct dsa_switch *ds, int port,
+ struct phy_device *phydev)
+{
+ struct b53_device *dev = ds->priv;
+ struct ethtool_eee *p = &dev->ports[port].eee;
+ u8 rgmii_ctrl = 0, reg = 0, off;
+ int pause;
+
+ if (!phy_is_pseudo_fixed_link(phydev))
+ return;
+
/* Enable flow control on BCM5301x's CPU port */
if (is5301x(dev) && port == dev->cpu_port)
- reg |= PORT_OVERRIDE_RX_FLOW | PORT_OVERRIDE_TX_FLOW;
+ pause = MLO_PAUSE_TXRX_MASK;
if (phydev->pause) {
if (phydev->asym_pause)
- reg |= PORT_OVERRIDE_TX_FLOW;
- reg |= PORT_OVERRIDE_RX_FLOW;
+ pause |= MLO_PAUSE_TX;
+ pause |= MLO_PAUSE_RX;
}
- b53_write8(dev, B53_CTRL_PAGE, off, reg);
+ b53_force_port_config(dev, port, phydev->speed, phydev->duplex, pause);
+ b53_force_link(dev, port, phydev->link);
if (is531x5(dev) && phy_interface_is_rgmii(phydev)) {
if (port == 8)
@@ -1061,16 +1099,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
}
} else if (is5301x(dev)) {
if (port != dev->cpu_port) {
- u8 po_reg = B53_GMII_PORT_OVERRIDE_CTRL(dev->cpu_port);
- u8 gmii_po;
-
- b53_read8(dev, B53_CTRL_PAGE, po_reg, &gmii_po);
- gmii_po |= GMII_PO_LINK |
- GMII_PO_RX_FLOW |
- GMII_PO_TX_FLOW |
- GMII_PO_EN |
- GMII_PO_SPEED_2000M;
- b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po);
+ b53_force_port_config(dev, dev->cpu_port, 2000,
+ DUPLEX_FULL, MLO_PAUSE_TXRX_MASK);
+ b53_force_link(dev, dev->cpu_port, 1);
}
}
--
2.17.1
^ permalink raw reply related
* [PATCH net-next 2/5] net: dsa: b53: Make SRAB driver manage port interrupts
From: Florian Fainelli @ 2018-09-04 22:11 UTC (permalink / raw)
To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180904221120.13018-1-f.fainelli@gmail.com>
Update the SRAB driver to manage per-port interrupts. Since we cannot
sleep during b53_io_ops, schedule a workqueue whenever we get a port
specific interrupt. We will later make use of this to call back into
PHYLINK when there is e.g: a link state change.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/b53/b53_srab.c | 108 +++++++++++++++++++++++++++++++++
1 file changed, 108 insertions(+)
diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index 91de2ba99ad1..411b84f61903 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/platform_data/b53.h>
#include <linux/of.h>
+#include <linux/workqueue.h>
#include "b53_priv.h"
@@ -47,6 +48,7 @@
/* command and status register of the SRAB */
#define B53_SRAB_CTRLS 0x40
+#define B53_SRAB_CTRLS_HOST_INTR BIT(1)
#define B53_SRAB_CTRLS_RCAREQ BIT(3)
#define B53_SRAB_CTRLS_RCAGNT BIT(4)
#define B53_SRAB_CTRLS_SW_INIT_DONE BIT(6)
@@ -60,8 +62,17 @@
#define B53_SRAB_P7_SLEEP_TIMER BIT(11)
#define B53_SRAB_IMP0_SLEEP_TIMER BIT(12)
+struct b53_srab_port_priv {
+ struct work_struct irq_work;
+ int irq;
+ bool irq_enabled;
+ struct b53_device *dev;
+ unsigned int num;
+};
+
struct b53_srab_priv {
void __iomem *regs;
+ struct b53_srab_port_priv port_intrs[B53_N_PORTS];
};
static int b53_srab_request_grant(struct b53_device *dev)
@@ -344,6 +355,50 @@ static int b53_srab_write64(struct b53_device *dev, u8 page, u8 reg,
return ret;
}
+static void b53_srab_port_defer(struct work_struct *work)
+{
+}
+
+static irqreturn_t b53_srab_port_isr(int irq, void *dev_id)
+{
+ struct b53_srab_port_priv *port = dev_id;
+ struct b53_device *dev = port->dev;
+ struct b53_srab_priv *priv = dev->priv;
+
+ /* Acknowledge the interrupt */
+ writel(BIT(port->num), priv->regs + B53_SRAB_INTR);
+
+ schedule_work(&port->irq_work);
+
+ return IRQ_HANDLED;
+}
+
+static int b53_srab_irq_enable(struct b53_device *dev, int port)
+{
+ struct b53_srab_priv *priv = dev->priv;
+ struct b53_srab_port_priv *p = &priv->port_intrs[port];
+ int ret;
+
+ ret = request_irq(p->irq, b53_srab_port_isr, 0,
+ dev_name(dev->dev), p);
+ if (!ret)
+ p->irq_enabled = true;
+
+ return ret;
+}
+
+static void b53_srab_irq_disable(struct b53_device *dev, int port)
+{
+ struct b53_srab_priv *priv = dev->priv;
+ struct b53_srab_port_priv *p = &priv->port_intrs[port];
+
+ if (p->irq_enabled) {
+ free_irq(p->irq, p);
+ cancel_work_sync(&p->irq_work);
+ p->irq_enabled = false;
+ }
+}
+
static const struct b53_io_ops b53_srab_ops = {
.read8 = b53_srab_read8,
.read16 = b53_srab_read16,
@@ -355,6 +410,8 @@ static const struct b53_io_ops b53_srab_ops = {
.write32 = b53_srab_write32,
.write48 = b53_srab_write48,
.write64 = b53_srab_write64,
+ .irq_enable = b53_srab_irq_enable,
+ .irq_disable = b53_srab_irq_disable,
};
static const struct of_device_id b53_srab_of_match[] = {
@@ -379,6 +436,53 @@ static const struct of_device_id b53_srab_of_match[] = {
};
MODULE_DEVICE_TABLE(of, b53_srab_of_match);
+static void b53_srab_intr_set(struct b53_srab_priv *priv, bool set)
+{
+ u32 reg;
+
+ reg = readl(priv->regs + B53_SRAB_CTRLS);
+ if (set)
+ reg |= B53_SRAB_CTRLS_HOST_INTR;
+ else
+ reg &= ~B53_SRAB_CTRLS_HOST_INTR;
+ writel(reg, priv->regs + B53_SRAB_CTRLS);
+}
+
+static void b53_srab_prepare_irq(struct platform_device *pdev)
+{
+ struct b53_device *dev = platform_get_drvdata(pdev);
+ struct b53_srab_priv *priv = dev->priv;
+ struct b53_srab_port_priv *port;
+ unsigned int i;
+ char *name;
+
+ /* Clear all pending interrupts */
+ writel(0xffffffff, priv->regs + B53_SRAB_INTR);
+
+ if (dev->pdata && dev->pdata->chip_id != BCM58XX_DEVICE_ID)
+ return;
+
+ for (i = 0; i < B53_N_PORTS; i++) {
+ port = &priv->port_intrs[i];
+
+ /* There is no port 6 */
+ if (i == 6)
+ continue;
+
+ name = kasprintf(GFP_KERNEL, "link_state_p%d", i);
+ if (!name)
+ return;
+
+ port->num = i;
+ port->dev = dev;
+ INIT_WORK(&port->irq_work, b53_srab_port_defer);
+ port->irq = platform_get_irq_byname(pdev, name);
+ kfree(name);
+ }
+
+ b53_srab_intr_set(priv, true);
+}
+
static int b53_srab_probe(struct platform_device *pdev)
{
struct b53_platform_data *pdata = pdev->dev.platform_data;
@@ -417,13 +521,17 @@ static int b53_srab_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, dev);
+ b53_srab_prepare_irq(pdev);
+
return b53_switch_register(dev);
}
static int b53_srab_remove(struct platform_device *pdev)
{
struct b53_device *dev = platform_get_drvdata(pdev);
+ struct b53_srab_priv *priv = dev->priv;
+ b53_srab_intr_set(priv, false);
if (dev)
b53_switch_remove(dev);
--
2.17.1
^ permalink raw reply related
* Re: [PATCH rdma-next v1 00/15] Flow actions to mutate packets
From: Jason Gunthorpe @ 2018-09-04 22:12 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Ariel Levkovich,
Mark Bloch, Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <20180828111854.14367-1-leon@kernel.org>
On Tue, Aug 28, 2018 at 02:18:39PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> >From Mark,
>
> This series exposes the ability to create flow actions which can
> mutate packet headers. We do that by exposing two new verbs:
> * modify header - can change existing packet headers. packet
> * reformat - can encapsulate or decapsulate a packet.
> Once created a flow action must be attached to a steering
> rule for it to take effect.
>
> The first 10 patches refactor mlx5_core code, rename internal structures
> to better reflect their operation and export needed functions so the
> RDMA side can allocate the action.
>
> The last 5 patches expose via the IOCTL infrastructure mlx5_ib methods
> which do the actual allocation of resources and return an handle to the
> user. A user of this API is expected to know how to work with the
> device's spec as the input to those function is HW depended.
>
> An example usage of the modify header action is routing, A user can
> create an action which edits the L2 header and decrease the TTL.
>
> An example usage of the packet reformat action is VXLAN encap/decap
> which is done by the HW.
>
> Changelog:
> v0 -> v1:
> * Patch 1: Addressed Saeed's comments and simplified the logic.
> * Patch 2: Changed due to changes in patch 1.
>
> Split the 27 patch series into 3, this is the first one
> which just lets the user create flow action.
> Other than that styling fixes mainly in the RDMA patches
> to make sure 80 chars limit isn't exceeded.
>
> RFC -> v0:
> * Patch 1 a new patch which refactors the logic
> when getting a flow namespace.
> * Patch 2 was split into two.
> * Patch 3: Fixed a typo in commit message
> * Patch 5: Updated commit message
> * Patch 7: Updated commit message
> Renamed:
> - MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT_ID to
> MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT
> - packet_reformat_id to reformat_id in struct mlx5_flow_act
> - packet_reformat_id to encap_id in struct mlx5_esw_flow_attr
> - packet_reformat_id to encap_id in struct mlx5e_encap_entry
> - PACKET_REFORMAT to REFORMAT when printing trace points
> * Patch 9: Updated commit message
> Updated function declaration in mlx5_core.h, could of lead
> to compile error on bisection.
> * Patch 11: Disallow egress rules insertion when in switchdev mode
> * Patch 12: A new patch to deal with passing enum values using
> the IOCTL infrastructure.
> * Patch 13: Use new enum value attribute when passing enum
> mlx5_ib_uapi_flow_table_type
> * Patch 15: Don't set encap flags on flow tables if in switchdev mode
> * Patch 17: Use new enum value attribute when passing enum
> mlx5_ib_uapi_flow_table_type and enum
> mlx5_ib_uapi_flow_action_packet_reformat_type
> * Patch 19: Allow creation of both
> MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL
> and MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2
> packet
> reformat actions.
> * Patch 20: A new patch which allows attaching packet reformat
> actions to flow tables on NIC RX.
>
> Thanks
>
> Mark Bloch (15):
> net/mlx5: Cleanup flow namespace getter switch logic
> net/mlx5: Add proper NIC TX steering flow tables support
> net/mlx5: Export modify header alloc/dealloc functions
> net/mlx5: Add support for more namespaces when allocating modify
> header
> net/mlx5: Break encap/decap into two separated flow table creation
> flags
> net/mlx5: Move header encap type to IFC header file
> {net, RDMA}/mlx5: Rename encap to reformat packet
> net/mlx5: Expose new packet reformat capabilities
> net/mlx5: Pass a namespace for packet reformat ID allocation
> net/mlx5: Export packet reformat alloc/dealloc functions
> RDMA/uverbs: Add UVERBS_ATTR_CONST_IN to the specs language
> RDMA/mlx5: Add a new flow action verb - modify header
> RDMA/uverbs: Add generic function to fill in flow action object
> RDMA/mlx5: Add new flow action verb - packet reformat
> RDMA/mlx5: Extend packet reformat verbs
>
> drivers/infiniband/core/uverbs_ioctl.c | 23 ++
> .../infiniband/core/uverbs_std_types_flow_action.c | 7 +-
> drivers/infiniband/hw/mlx5/devx.c | 6 +-
> drivers/infiniband/hw/mlx5/flow.c | 301 +++++++++++++++++++++
> drivers/infiniband/hw/mlx5/main.c | 3 +
> drivers/infiniband/hw/mlx5/mlx5_ib.h | 19 +-
> drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 8 +-
> .../mellanox/mlx5/core/diag/fs_tracepoint.h | 2 +-
> drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 51 ++--
> drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +-
> .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 9 +-
> drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 87 +++---
> drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 73 +++--
> .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 12 +-
> include/linux/mlx5/device.h | 6 +
> include/linux/mlx5/fs.h | 20 +-
> include/linux/mlx5/mlx5_ifc.h | 70 +++--
> include/rdma/uverbs_ioctl.h | 40 +++
> include/rdma/uverbs_std_types.h | 12 +
> include/uapi/rdma/mlx5_user_ioctl_cmds.h | 18 ++
> include/uapi/rdma/mlx5_user_ioctl_verbs.h | 12 +
> 21 files changed, 638 insertions(+), 143 deletions(-)
This looks OK to me, can you make the shared commit please?
Thanks,
Jason
^ permalink raw reply
* [PATCH net-next 1/5] net: dsa: b53: Add ability to enable/disable port interrupts
From: Florian Fainelli @ 2018-09-04 22:11 UTC (permalink / raw)
To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
In-Reply-To: <20180904221120.13018-1-f.fainelli@gmail.com>
Some switches expose individual interrupt line(s) for port specific
event(s), allow configuring these interrupts at an appropriate time
during port_enable/disable callbacks where all port specific resources
are known to be set-up and ready for use.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/b53/b53_common.c | 9 +++++++++
drivers/net/dsa/b53/b53_priv.h | 2 ++
2 files changed, 11 insertions(+)
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index d93c790bfbe8..85ed264bc163 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -502,8 +502,14 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
{
struct b53_device *dev = ds->priv;
unsigned int cpu_port = ds->ports[port].cpu_dp->index;
+ int ret = 0;
u16 pvlan;
+ if (dev->ops->irq_enable)
+ ret = dev->ops->irq_enable(dev, port);
+ if (ret)
+ return ret;
+
/* Clear the Rx and Tx disable bits and set to no spanning tree */
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), 0);
@@ -536,6 +542,9 @@ void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
b53_read8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), ®);
reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE;
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
+
+ if (dev->ops->irq_disable)
+ dev->ops->irq_disable(dev, port);
}
EXPORT_SYMBOL(b53_disable_port);
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index df149756c282..2980a5838f58 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -43,6 +43,8 @@ struct b53_io_ops {
int (*write64)(struct b53_device *dev, u8 page, u8 reg, u64 value);
int (*phy_read16)(struct b53_device *dev, int addr, int reg, u16 *value);
int (*phy_write16)(struct b53_device *dev, int addr, int reg, u16 value);
+ int (*irq_enable)(struct b53_device *dev, int port);
+ void (*irq_disable)(struct b53_device *dev, int port);
};
enum {
--
2.17.1
^ permalink raw reply related
* [PATCH net-next 0/5] net: dsa: b53: SerDes support
From: Florian Fainelli @ 2018-09-04 22:11 UTC (permalink / raw)
To: netdev; +Cc: Florian Fainelli, andrew, vivien.didelot, davem
Hi all,
This patch series adds support for the SerDes found on NorthStar Plus
(NSP) which allows us to use the SFP port on the BCM958625HR board (and
other similar designs).
Florian Fainelli (5):
net: dsa: b53: Add ability to enable/disable port interrupts
net: dsa: b53: Make SRAB driver manage port interrupts
net: dsa: b53: Add helper to set link parameters
net: dsa: b53: Add PHYLINK support
net: dsa: b53: Add SerDes support
drivers/net/dsa/b53/Kconfig | 7 +
drivers/net/dsa/b53/Makefile | 1 +
drivers/net/dsa/b53/b53_common.c | 243 +++++++++++++++++++++++++++----
drivers/net/dsa/b53/b53_priv.h | 36 +++++
drivers/net/dsa/b53/b53_serdes.c | 217 +++++++++++++++++++++++++++
drivers/net/dsa/b53/b53_serdes.h | 121 +++++++++++++++
drivers/net/dsa/b53/b53_srab.c | 217 +++++++++++++++++++++++++++
7 files changed, 813 insertions(+), 29 deletions(-)
create mode 100644 drivers/net/dsa/b53/b53_serdes.c
create mode 100644 drivers/net/dsa/b53/b53_serdes.h
--
2.17.1
^ permalink raw reply
* Re: [PATCH mlx5-next v1 05/15] net/mlx5: Break encap/decap into two separated flow table creation flags
From: Jason Gunthorpe @ 2018-09-04 22:02 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Ariel Levkovich,
Mark Bloch, Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <20180828111854.14367-6-leon@kernel.org>
On Tue, Aug 28, 2018 at 02:18:44PM +0300, Leon Romanovsky wrote:
> From: Mark Bloch <markb@mellanox.com>
>
> Today we are able to attach encap and decap actions only to the FDB. In
> preparation to enable those actions on the NIC flow tables, break the
> single flag into two. Those flags control whatever a decap or encap
> operations can be attached to the flow table created. For FDB, if
> encapsulation is required, we set both of them.
>
> Signed-off-by: Mark Bloch <markb@mellanox.com>
> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
> Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++-
> drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 7 ++++---
> include/linux/mlx5/fs.h | 3 ++-
> 3 files changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index f72b5c9dcfe9..ff21807a0c4b 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -529,7 +529,8 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
> esw_size >>= 1;
>
> if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
> - flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
> + flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP |
> + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
>
> fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
> esw_size,
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
> index 9ae777e56529..1698f325a21e 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
> @@ -152,7 +152,8 @@ static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
> struct mlx5_flow_table *next_ft,
> unsigned int *table_id, u32 flags)
> {
> - int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
> + int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP);
> + int en_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
Yuk, please don't use !!.
bool en_decap = flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
Jason
^ permalink raw reply
* Re: [PATCH rdma-next v1 12/15] RDMA/mlx5: Add a new flow action verb - modify header
From: Jason Gunthorpe @ 2018-09-04 21:58 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Ariel Levkovich,
Mark Bloch, Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <20180828111854.14367-13-leon@kernel.org>
On Tue, Aug 28, 2018 at 02:18:51PM +0300, Leon Romanovsky wrote:
> +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
> + struct ib_uverbs_file *file,
> + struct uverbs_attr_bundle *attrs)
> +{
> + struct ib_uobject *uobj = uverbs_attr_get_uobject(
> + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
> + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
> + enum mlx5_ib_uapi_flow_table_type ft_type;
> + struct ib_flow_action *action;
> + size_t num_actions;
> + void *in;
> + int len;
> + int ret;
> +
> + if (!mlx5_ib_modify_header_supported(mdev))
> + return -EOPNOTSUPP;
> +
> + in = uverbs_attr_get_alloced_ptr(attrs,
> + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
> + len = uverbs_attr_get_len(attrs,
> + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
> +
> + if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto))
> + return -EINVAL;
> +
> + ret = uverbs_get_const(&ft_type, attrs,
> + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
> + if (ret)
> + return -EINVAL;
This should be
if (ret)
return ret;
Every call to uverbs_get_const is wrong in this same way..
I can probably fix it if this is the only thing though..
Jason
^ permalink raw reply
* [Patch net v3] tipc: call start and done ops directly in __tipc_nl_compat_dumpit()
From: Cong Wang @ 2018-09-04 21:54 UTC (permalink / raw)
To: netdev; +Cc: tipc-discussion, Cong Wang, Jon Maloy, Ying Xue
__tipc_nl_compat_dumpit() uses a netlink_callback on stack,
so the only way to align it with other ->dumpit() call path
is calling tipc_dump_start() and tipc_dump_done() directly
inside it. Otherwise ->dumpit() would always get NULL from
cb->args[].
But tipc_dump_start() uses sock_net(cb->skb->sk) to retrieve
net pointer, the cb->skb here doesn't set skb->sk, the net pointer
is saved in msg->net instead, so introduce a helper function
__tipc_dump_start() to pass in msg->net.
Ying pointed out cb->args[0...3] are already used by other
callbacks on this call path, so we can't use cb->args[0] any
more, use cb->args[4] instead.
Fixes: 9a07efa9aea2 ("tipc: switch to rhashtable iterator")
Reported-and-tested-by: syzbot+e93a2c41f91b8e2c7d9b@syzkaller.appspotmail.com
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
v2: fix sock_net(cb->skb->sk)
v3: use cb->args[4] instead of cb->args[0]
net/tipc/netlink_compat.c | 2 ++
net/tipc/socket.c | 17 +++++++++++------
net/tipc/socket.h | 1 +
3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index a2f76743c73a..82f665728382 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -185,6 +185,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
return -ENOMEM;
buf->sk = msg->dst_sk;
+ __tipc_dump_start(&cb, msg->net);
do {
int rem;
@@ -216,6 +217,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
err = 0;
err_out:
+ tipc_dump_done(&cb);
kfree_skb(buf);
if (err == -EMSGSIZE) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index a0ff8bffc96b..3f03ddd0e35b 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3230,7 +3230,7 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
struct netlink_callback *cb,
struct tipc_sock *tsk))
{
- struct rhashtable_iter *iter = (void *)cb->args[0];
+ struct rhashtable_iter *iter = (void *)cb->args[4];
struct tipc_sock *tsk;
int err;
@@ -3266,8 +3266,14 @@ EXPORT_SYMBOL(tipc_nl_sk_walk);
int tipc_dump_start(struct netlink_callback *cb)
{
- struct rhashtable_iter *iter = (void *)cb->args[0];
- struct net *net = sock_net(cb->skb->sk);
+ return __tipc_dump_start(cb, sock_net(cb->skb->sk));
+}
+EXPORT_SYMBOL(tipc_dump_start);
+
+int __tipc_dump_start(struct netlink_callback *cb, struct net *net)
+{
+ /* tipc_nl_name_table_dump() uses cb->args[0...3]. */
+ struct rhashtable_iter *iter = (void *)cb->args[4];
struct tipc_net *tn = tipc_net(net);
if (!iter) {
@@ -3275,17 +3281,16 @@ int tipc_dump_start(struct netlink_callback *cb)
if (!iter)
return -ENOMEM;
- cb->args[0] = (long)iter;
+ cb->args[4] = (long)iter;
}
rhashtable_walk_enter(&tn->sk_rht, iter);
return 0;
}
-EXPORT_SYMBOL(tipc_dump_start);
int tipc_dump_done(struct netlink_callback *cb)
{
- struct rhashtable_iter *hti = (void *)cb->args[0];
+ struct rhashtable_iter *hti = (void *)cb->args[4];
rhashtable_walk_exit(hti);
kfree(hti);
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index d43032e26532..5e575f205afe 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -69,5 +69,6 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
struct netlink_callback *cb,
struct tipc_sock *tsk));
int tipc_dump_start(struct netlink_callback *cb);
+int __tipc_dump_start(struct netlink_callback *cb, struct net *net);
int tipc_dump_done(struct netlink_callback *cb);
#endif
--
2.14.4
^ permalink raw reply related
* Re: [PATCH v2 net-next 4/7] dt-bindings: net: Add lantiq,xrx200-net DT bindings
From: Hauke Mehrtens @ 2018-09-04 21:54 UTC (permalink / raw)
To: Florian Fainelli, davem
Cc: netdev, andrew, vivien.didelot, john, linux-mips, dev,
hauke.mehrtens, devicetree
In-Reply-To: <5866e89f-ac9a-8c6c-bf53-3b1206171e31@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 1736 bytes --]
On 09/03/2018 09:46 PM, Florian Fainelli wrote:
>
>
> On 9/1/2018 5:04 AM, Hauke Mehrtens wrote:
>> This adds the binding for the PMAC core between the CPU and the GSWIP
>> switch found on the xrx200 / VR9 Lantiq / Intel SoC.
>>
>> Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
>> Cc: devicetree@vger.kernel.org
>> ---
>> .../devicetree/bindings/net/lantiq,xrx200-net.txt | 21
>> +++++++++++++++++++++
>> 1 file changed, 21 insertions(+)
>> create mode 100644
>> Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
>>
>> diff --git
>> a/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
>> b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
>> new file mode 100644
>> index 000000000000..8a2fe5200cdc
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
>> @@ -0,0 +1,21 @@
>> +Lantiq xRX200 GSWIP PMAC Ethernet driver
>> +==================================
>> +
>> +Required properties:
>> +
>> +- compatible : "lantiq,xrx200-net" for the PMAC of the embedded
>> + : GSWIP in the xXR200
>> +- reg : memory range of the PMAC core inside of the GSWIP core
>> +- interrupts : TX and RX DMA interrupts. Use interrupt-names "tx" for
>> + : the TX interrupt and "rx" for the RX interrupt.
>
> You would likely want to document that the order should be strict, that
> is TX interrupt first and RX interrupt second, but other than that:
>
> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Currently this is fetched based on the name like this:
platform_get_irq_byname(pdev, "rx");
I do not care about the order, just interrupt-names must match.
Hauke
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH v2 net-next 5/7] net: lantiq: Add Lantiq / Intel VRX200 Ethernet driver
From: Hauke Mehrtens @ 2018-09-04 21:37 UTC (permalink / raw)
To: Florian Fainelli, davem
Cc: netdev, andrew, vivien.didelot, john, linux-mips, dev,
hauke.mehrtens, devicetree
In-Reply-To: <6d3bdb60-c993-9129-6e87-afb08ff5c113@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 15338 bytes --]
Hi Florian,
Thanks for the review.
On 09/03/2018 09:24 PM, Florian Fainelli wrote:
>
>
> On 9/1/2018 5:04 AM, Hauke Mehrtens wrote:
>> This drives the PMAC between the GSWIP Switch and the CPU in the VRX200
>> SoC. This is currently only the very basic version of the Ethernet
>> driver.
>>
>> When the DMA channel is activated we receive some packets which were
>> send to the SoC while it was still in U-Boot, these packets have the
>> wrong header. Resetting the IP cores did not work so we read out the
>> extra packets at the beginning and discard them.
>>
>> This also adapts the clock code in sysctrl.c to use the default name of
>> the device node so that the driver gets the correct clock. sysctrl.c
>> should be replaced with a proper common clock driver later.
>>
>> Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
>> ---
>> MAINTAINERS | 1 +
>> arch/mips/lantiq/xway/sysctrl.c | 6 +-
>> drivers/net/ethernet/Kconfig | 7 +
>> drivers/net/ethernet/Makefile | 1 +
>> drivers/net/ethernet/lantiq_xrx200.c | 591
>> +++++++++++++++++++++++++++++++++++
>> 5 files changed, 603 insertions(+), 3 deletions(-)
>> create mode 100644 drivers/net/ethernet/lantiq_xrx200.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 4b2ee65f6086..ffff912d31b5 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -8171,6 +8171,7 @@ M: Hauke Mehrtens <hauke@hauke-m.de>
>> L: netdev@vger.kernel.org
>> S: Maintained
>> F: net/dsa/tag_gswip.c
>> +F: drivers/net/ethernet/lantiq_xrx200.c
>> LANTIQ MIPS ARCHITECTURE
>> M: John Crispin <john@phrozen.org>
>> diff --git a/arch/mips/lantiq/xway/sysctrl.c
>> b/arch/mips/lantiq/xway/sysctrl.c
>> index e0af39b33e28..eeb89a37e27e 100644
>> --- a/arch/mips/lantiq/xway/sysctrl.c
>> +++ b/arch/mips/lantiq/xway/sysctrl.c
>> @@ -505,7 +505,7 @@ void __init ltq_soc_init(void)
>> clkdev_add_pmu("1a800000.pcie", "msi", 1, 1, PMU1_PCIE2_MSI);
>> clkdev_add_pmu("1a800000.pcie", "pdi", 1, 1, PMU1_PCIE2_PDI);
>> clkdev_add_pmu("1a800000.pcie", "ctl", 1, 1, PMU1_PCIE2_CTL);
>> - clkdev_add_pmu("1e108000.eth", NULL, 0, 0, PMU_SWITCH |
>> PMU_PPE_DP);
>> + clkdev_add_pmu("1e10b308.eth", NULL, 0, 0, PMU_SWITCH |
>> PMU_PPE_DP);
>> clkdev_add_pmu("1da00000.usif", "NULL", 1, 0, PMU_USIF);
>> clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU);
>> } else if (of_machine_is_compatible("lantiq,ar10")) {
>> @@ -513,7 +513,7 @@ void __init ltq_soc_init(void)
>> ltq_ar10_fpi_hz(), ltq_ar10_pp32_hz());
>> clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0);
>> clkdev_add_pmu("1e106000.usb", "otg", 1, 0, PMU_USB1);
>> - clkdev_add_pmu("1e108000.eth", NULL, 0, 0, PMU_SWITCH |
>> + clkdev_add_pmu("1e10b308.eth", NULL, 0, 0, PMU_SWITCH |
>> PMU_PPE_DP | PMU_PPE_TC);
>
> Should not that be part of patch 4 where you define the base register
> address?
hmm, I can also put this into patch number 4.
>
>> clkdev_add_pmu("1da00000.usif", "NULL", 1, 0, PMU_USIF);
>> clkdev_add_pmu("1f203020.gphy", NULL, 1, 0, PMU_GPHY);
>> @@ -536,7 +536,7 @@ void __init ltq_soc_init(void)
>> clkdev_add_pmu(NULL, "ahb", 1, 0, PMU_AHBM | PMU_AHBS);
>> clkdev_add_pmu("1da00000.usif", "NULL", 1, 0, PMU_USIF);
>> - clkdev_add_pmu("1e108000.eth", NULL, 0, 0,
>> + clkdev_add_pmu("1e10b308.eth", NULL, 0, 0,
>> PMU_SWITCH | PMU_PPE_DPLUS | PMU_PPE_DPLUM |
>> PMU_PPE_EMA | PMU_PPE_TC | PMU_PPE_SLL01 |
>> PMU_PPE_QSB | PMU_PPE_TOP);
>
> Likewise.
>
> [snip]
>
>> +static int xrx200_open(struct net_device *dev)
>> +{
>> + struct xrx200_priv *priv = netdev_priv(dev);
>> +
>> + ltq_dma_open(&priv->chan_tx.dma);
>> + ltq_dma_enable_irq(&priv->chan_tx.dma);
>> +
>> + napi_enable(&priv->chan_rx.napi);
>> + ltq_dma_open(&priv->chan_rx.dma);
>> + /* The boot loader does not always deactivate the receiving of
>> frames
>> + * on the ports and then some packets queue up in the PPE buffers.
>> + * They already passed the PMAC so they do not have the tags
>> + * configured here. Read the these packets here and drop them.
>> + * The HW should have written them into memory after 10us
>> + */
>> + udelay(10);
>
> You execute in process context with the ndo_open() callback (AFAIR),
> would usleep_range() work here?
The Documentation/timers/timers-howto.txt says for ~10us I should use
udaly also in non atomic context, I can try usleep_range() too.
>> + xrx200_flush_dma(&priv->chan_rx);
>> + ltq_dma_enable_irq(&priv->chan_rx.dma);
>> +
>> + netif_wake_queue(dev);
>> +
>> + return 0;
>> +}
>> +
>> +static int xrx200_close(struct net_device *dev)
>> +{
>> + struct xrx200_priv *priv = netdev_priv(dev);
>> +
>> + netif_stop_queue(dev);
>> +
>> + napi_disable(&priv->chan_rx.napi);
>> + ltq_dma_close(&priv->chan_rx.dma);
>> +
>> + ltq_dma_close(&priv->chan_tx.dma);
>> +
>> + return 0;
>> +}
>> +
>> +static int xrx200_alloc_skb(struct xrx200_chan *ch)
>> +{
>> + int ret = 0;
>> +
>> +#define DMA_PAD (NET_IP_ALIGN + NET_SKB_PAD)
>> + ch->skb[ch->dma.desc] = dev_alloc_skb(XRX200_DMA_DATA_LEN +
>> DMA_PAD);
>> + if (!ch->skb[ch->dma.desc]) {
>> + ret = -ENOMEM;
>> + goto skip;
>> + }
>
> Would not netdev_alloc_skb() do what you want already?
Ok, I am now using netdev_alloc_skb_ip_align()
>> +
>> + skb_reserve(ch->skb[ch->dma.desc], NET_SKB_PAD);
>> + ch->dma.desc_base[ch->dma.desc].addr = dma_map_single(ch->priv->dev,
>> + ch->skb[ch->dma.desc]->data, XRX200_DMA_DATA_LEN,
>> + DMA_FROM_DEVICE);
>> + if (unlikely(dma_mapping_error(ch->priv->dev,
>> + ch->dma.desc_base[ch->dma.desc].addr))) {
>> + dev_kfree_skb_any(ch->skb[ch->dma.desc]);
>> + ret = -ENOMEM;
>> + goto skip;
>> + }
>> +
>> + ch->dma.desc_base[ch->dma.desc].addr =
>> + CPHYSADDR(ch->skb[ch->dma.desc]->data);
>> + skb_reserve(ch->skb[ch->dma.desc], NET_IP_ALIGN);
>> +
>> +skip:
>> + ch->dma.desc_base[ch->dma.desc].ctl =
>> + LTQ_DMA_OWN | LTQ_DMA_RX_OFFSET(NET_IP_ALIGN) |
>> + XRX200_DMA_DATA_LEN;
>> +
>> + return ret;
>> +}
>> +
>> +static int xrx200_hw_receive(struct xrx200_chan *ch)
>> +{
>> + struct xrx200_priv *priv = ch->priv;
>> + struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc];
>> + struct sk_buff *skb = ch->skb[ch->dma.desc];
>> + int len = (desc->ctl & LTQ_DMA_SIZE_MASK);
>> + int ret;
>> +
>> + ret = xrx200_alloc_skb(ch);
>> +
>> + ch->dma.desc++;
>> + ch->dma.desc %= LTQ_DESC_NUM;
>> +
>> + if (ret) {
>> + netdev_err(priv->net_dev,
>> + "failed to allocate new rx buffer\n");
>> + return ret;
>> + }
>> +
>> + skb_put(skb, len);
>> + skb->dev = priv->net_dev;
>
> eth_type_trans() does the skb->dev assignment already, this is not
> necessary.
removed
>> + skb->protocol = eth_type_trans(skb, priv->net_dev);
>> + netif_receive_skb(skb);
>> + priv->stats.rx_packets++;
>> + priv->stats.rx_bytes += len;
>
> Does the length reported by the hardware include the Ethernet Frame
> Check Sequence (FCS)? If so, you need to remove it here, since you are
> not supposed to pass it up the stack unless NETIF_F_RXFCS* is turned on.
Yes the FCS is included, I removed it.
>
> [snip]
>
>> +static void xrx200_tx_housekeeping(unsigned long ptr)
>> +{
>> + struct xrx200_chan *ch = (struct xrx200_chan *)ptr;
>> + int pkts = 0;
>> + int bytes = 0;
>> +
>> + ltq_dma_ack_irq(&ch->dma);
>> + while ((ch->dma.desc_base[ch->tx_free].ctl &
>> + (LTQ_DMA_OWN | LTQ_DMA_C)) == LTQ_DMA_C) {
>> + struct sk_buff *skb = ch->skb[ch->tx_free];
>> +
>> + pkts++;
>> + bytes += skb->len;
>> + ch->skb[ch->tx_free] = NULL;
>> + dev_kfree_skb(skb);
>
> Consider using dev_consume_skb() to be drop monitor friendly,
> dev_kfree_skb() indicates the SKB was freed upon error, this is not the
> case here.
Will call consume_skb() here.
>
>> + memset(&ch->dma.desc_base[ch->tx_free], 0,
>> + sizeof(struct ltq_dma_desc));
>
> Humm, don't you need a write barrier here to make sure the HW view's of
> the descriptor is consistent with the CPU's view?
I do not think so, but it could be that I miss some side affects. This
is the TX queue free, this will be used next by the driver xmit function
when the next packet in this descriptor is written to the HW. I think
we can even leave this memset out as this descriptor is already given to
the driver.
>> + ch->tx_free++;
>> + ch->tx_free %= LTQ_DESC_NUM;
>> + }
>> + ltq_dma_enable_irq(&ch->dma);
>> +
>> + netdev_completed_queue(ch->priv->net_dev, pkts, bytes);
>> +
>> + if (!pkts)
>> + return;
>> +
>> + netif_wake_queue(ch->priv->net_dev);
>
> Can you do this in NAPI context, even if that means creating a specific
> TX NAPI object instead of doing this in tasklet context?
Should I put the TX freeing into the RX NAPI handler or create a own
NAPI handler for TX freeing only?
>> +}
>> +
>> +static struct net_device_stats *xrx200_get_stats(struct net_device *dev)
>> +{
>> + struct xrx200_priv *priv = netdev_priv(dev);
>> +
>> + return &priv->stats;
>
> As Andrew pointed out, consider using dev->stats, or better yet,
> implement 64-bit statistics.
I am already using dev->stats.
>> +}
>> +
>> +static int xrx200_start_xmit(struct sk_buff *skb, struct net_device
>> *dev)
>> +{
>> + struct xrx200_priv *priv = netdev_priv(dev);
>> + struct xrx200_chan *ch;
>> + struct ltq_dma_desc *desc;
>> + u32 byte_offset;
>> + dma_addr_t mapping;
>> + int len;
>> +
>> + ch = &priv->chan_tx;
>> +
>> + desc = &ch->dma.desc_base[ch->dma.desc];
>> +
>> + skb->dev = dev;
>> + len = skb->len < ETH_ZLEN ? ETH_ZLEN : skb->len;
>
> Consider using skb_put_padto() which would do that automatically for you.
Will use skb_put_padto()
>> +
>> + /* dma needs to start on a 16 byte aligned address */
>> + byte_offset = CPHYSADDR(skb->data) % 16;
>
> That really should not be necessary, the stack should already be handing
> you off packets that are aligned to the max between the L1 cache line
> size and 64 bytes. Also, CPHYSADDR is a MIPSism, getting rid of it would
> help with the portability and building the driver on other architectures.
I will do this based on the mapping which is returned from
dma_map_single(). The byte offset is anyway needed based on the HW
address. The DMA controller can access the memory at 16 bytes offsets,
but we can provide an extra offset in the DMA descriptor.
>> +
>> + if ((desc->ctl & (LTQ_DMA_OWN | LTQ_DMA_C)) ||
>> ch->skb[ch->dma.desc]) {
>> + netdev_err(dev, "tx ring full\n");
>> + netif_stop_queue(dev);
>> + return NETDEV_TX_BUSY;
>> + }
>> +
>> + ch->skb[ch->dma.desc] = skb;
>> +
>> + netif_trans_update(dev);
>
> This should not be necessary the stack does that already AFAIR.
removed
>> +
>> + mapping = dma_map_single(priv->dev, skb->data, len, DMA_TO_DEVICE);
>> + if (unlikely(dma_mapping_error(priv->dev, mapping)))
>> + goto err_drop;
>> +
>> + desc->addr = mapping - byte_offset;
>> + /* Make sure the address is written before we give it to HW */
>> + wmb();
>> + desc->ctl = LTQ_DMA_OWN | LTQ_DMA_SOP | LTQ_DMA_EOP |
>> + LTQ_DMA_TX_OFFSET(byte_offset) | (len & LTQ_DMA_SIZE_MASK);
>> + ch->dma.desc++;
>> + ch->dma.desc %= LTQ_DESC_NUM;
>> + if (ch->dma.desc == ch->tx_free)
>> + netif_stop_queue(dev);
>> +
>> + netdev_sent_queue(dev, skb->len);
>
> As soon as you write to the descriptor, the packet is handed to HW and
> you could thereoteically have it completed before you even get to access
> skb->len here since your TX completion interrupt could preempt this
> function, that would mean use after free, so consider using 'len' here.
Ok, I will use len.
>> + priv->stats.tx_packets++;
>> + priv->stats.tx_bytes += len;
>
> Updating sucessful TX completion statistics should occur in your TX
> completion handler: xrx200_tx_housekeeping() because you could have a
> stuck TX path, so knowing whether the TX IRQ fired and cleaned up your
> packets is helpful to troubleshoot problems.
Ok, I wil move this.
>> +
>> + return NETDEV_TX_OK;
>> +
>> +err_drop:
>> + dev_kfree_skb(skb);
>> + priv->stats.tx_dropped++;
>> + priv->stats.tx_errors++;
>> + return NETDEV_TX_OK;
>> +}
>> +
>> +static const struct net_device_ops xrx200_netdev_ops = {
>> + .ndo_open = xrx200_open,
>> + .ndo_stop = xrx200_close,
>> + .ndo_start_xmit = xrx200_start_xmit,
>> + .ndo_set_mac_address = eth_mac_addr,
>> + .ndo_validate_addr = eth_validate_addr,
>> + .ndo_change_mtu = eth_change_mtu,
>> + .ndo_get_stats = xrx200_get_stats,
>> +};
>> +
>> +static irqreturn_t xrx200_dma_irq_tx(int irq, void *ptr)
>> +{
>> + struct xrx200_priv *priv = ptr;
>> + struct xrx200_chan *ch = &priv->chan_tx;
>> +
>> + ltq_dma_disable_irq(&ch->dma);
>> + ltq_dma_ack_irq(&ch->dma);
>> +
>> + tasklet_schedule(&ch->tasklet);
>
> Can you use NAPI instead (similar to what was suggested before)?
>
> [snip]
>
>> + /* enable clock gate */
>> + err = clk_prepare_enable(priv->clk);
>> + if (err)
>> + goto err_uninit_dma;
>
> Since there is no guarantee that a network device will be used up until
> some point, you should consider defering the clock enabling into the
> ndo_open() callback to save some possible power. Likewise with resources
> that require memory allocations, you should defer them to as as late as
> possible.
Ok I will move this.
Hauke
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: phys_port_id in switchdev mode?
From: Or Gerlitz @ 2018-09-04 20:37 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Florian Fainelli, Simon Horman, Andy Gospodarek,
mchan@broadcom.com, Jiri Pirko, Alexander Duyck, Frederick Botha,
nick viljoen, Linux Netdev List
In-Reply-To: <20180904122057.46fce83a@cakuba>
On Tue, Sep 4, 2018 at 1:20 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Mon, 3 Sep 2018 12:40:22 +0300, Or Gerlitz wrote:
>> On Tue, Aug 28, 2018 at 9:05 PM, Jakub Kicinski wrote:
>> > Hi!
>>
>> Hi Jakub and sorry for the late reply, this crazigly hot summer refuses to die,
>>
>> Note I replied couple of minutes ago but it didn't get to the list, so
>> lets take it from this one:
>>
>> > I wonder if we can use phys_port_id in switchdev to group together
>> > interfaces of a single PCI PF? Here is the problem:
>> >
>> > With a mix of PF and VF interfaces it gets increasingly difficult to
>> > figure out which one corresponds to which PF. We can identify which
>> > *representor* is which, by means of phys_port_name and devlink
>> > flavours. But if the actual VF/PF interfaces are also present on the
>> > same host, it gets confusing when one tries to identify the PF they
>> > came from. Generally one has to resort of matching between PCI DBDF of
>> > the PF and VFs or read relevant info out of ethtool -i.
>> >
>> > In multi host scenario this is particularly painful, as there seems to
>> > be no immediately obvious way to match PCI interface ID of a card (0,
>> > 1, 2, 3, 4...) to the DBDF we have connected.
>> >
>> > Another angle to this is legacy SR-IOV NDOs. User space picks a netdev
>> > from /sys/bus/pci/$VF_DBDF/physfn/net/ to run the NDOs on in somehow
>> > random manner, which means we have to provide those for all devices with
>> > link to the PF (all reprs). And we have to link them (a) because it's
>> > right (tm) and (b) to get correct naming.
>>
>> wait, as you commented in later, not only the mellanox vf reprs but rather also
>> the nfp vf reprs are not linked to the PF, because ip link output
>> grows quadratically.
>
> Right, correct. If we set phys_port_id libvirt will reliably pick the
> correct netdev to run NDOs on (PF/PF repr) so we can remove them from
> the other netdevs and therefore limit the size of ip link show output.
just to make sure, this is suggested/future not existing flow of libvirt?
> Ugh, you're right! Libvirt is our primary target here. IIUC we need
> phys_port_id on the actual VF and then *a* netdev linked to physfn in
> sysfs which will have the legacy NDOs.
>
> We can't set the phys_port_id on the VF reprs because then we're back
> to the problem of ip link output growing. Perhaps we shouldn't set it
> on PF repr either?
>
> Let's make a table (assuming bare metal cloud scenario where Host0 is
> controlling the network, while Host1 is the actual server):
yeah, this would be a super-set the non-smartnic case where
we have only one host.
[...]
> With this libvirt on Host0 should easily find the actual PF0 netdev to
> run the NDO on, if it wants to use VFs:
> - libvrit finds act VF0/0 to plug into the VM;
> - reads its phys_port_id -> "PF0 SN";
> - finds netdev with "PF0 SN" linked to physfn -> "act PF0";
> - runs NDOs on "act PF0" for PF0's VF correctly.
What you describe here doesn't seem to be networking
configuration, as it deals only with VFs and PF but not with reprs,
and hence AFAIK runs on host host1
[...]
> Should Host0 in bare metal cloud have access to SR-IOV NDOs of Host1?
I need to think on that
^ permalink raw reply
* Re: [PATCH RFC net-next] net: Poptrie based routing table lookup
From: Md. Islam @ 2018-09-04 20:34 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Netdev, David Miller, David Ahern, Alexey Kuznetsov,
alexei.starovoitov, Stephen Hemminger, makita.toshiaki, panda,
yasuhiro.ohara, Eric Dumazet, john fastabend
In-Reply-To: <CAFgPn1AFUKgGdMArXtfCYQfHxO6nzOYcaPFgN-8ref4HBrMcuQ@mail.gmail.com>
On Tue, Sep 4, 2018 at 12:14 PM, Md. Islam <mislam4@kent.edu> wrote:
>
> On Tue, Sep 4, 2018, 6:53 AM Jesper Dangaard Brouer <brouer@redhat.com>
> wrote:
>>
>> Hi Md. Islam,
>>
>> People will start to ignore you, when you don't interact appropriately
>> with the community, and you ignore their advice, especially when it is
>> about how to interact with the community[1].
>>
>> You have not addressed any of my feedback on your patch in [1].
>> [1]
>> http://www.mail-archive.com/search?l=mid&q=20180827173334.16ff0673@redhat.com
>
>
> Jesper,
>
> I actually addressed all the feedbacks in the previous patch except TOS,
> FIB_matrics, and etc. This is because I don't think they are relevant in
> this usecase. Please let me know if I wrong.
>
> Thanks
Jesper
Sorry, I missed your review in the first place. I will take a look and
resubmit the patch.
Thanks
>>
>>
>>
>> --
>> Best regards,
>> Jesper Dangaard Brouer
>> MSc.CS, Principal Kernel Engineer at Red Hat
>> LinkedIn: http://www.linkedin.com/in/brouer
>>
>> p.s. also top-posting is bad, but I suspect you will not read my
>> response if I don't top-post.
>>
>>
>> On Tue, 4 Sep 2018 01:02:30 -0400 "Md. Islam" <mislam4@kent.edu> wrote:
>>
>> > This patch implements Poptrie based routing table
>> > lookup/insert/delete/flush. Currently many carrier routers use kernel
>> > bypass frameworks such as DPDK and VPP to implement the data plane.
>> > XDP along with this patch will enable Linux to work as such a router.
>> > Currently it supports up to 255 ports. Many real word backbone routers
>> > have up to 233 ports (to the best of my knowledge), so it seems to be
>> > sufficient at this moment.
>> >
>> > I also have attached a draft paper to explain it works (poptrie.pdf).
>> > Please set CONFIG_FIB_POPTRIE=y (default n) before testing the patch.
>> > Note that, poptrie_lookup() is not being called from anywhere. It will
>> > be used by XDP forwarding.
>> >
>> >
>> > From 3dc9683298ed896dd3080733503c35d68f05370e Mon Sep 17 00:00:00 2001
>> > From: tamimcse <tamim@csebuet.org>
>> > Date: Mon, 3 Sep 2018 23:56:43 -0400
>> > Subject: [PATCH] Poptrie based routing table lookup
>> >
>> > Signed-off-by: tamimcse <tamim@csebuet.org>
>> > ---
>> > include/net/ip_fib.h | 42 +++++
>> > net/ipv4/Kconfig | 4 +
>> > net/ipv4/Makefile | 1 +
>> > net/ipv4/fib_poptrie.c | 483
>> > +++++++++++++++++++++++++++++++++++++++++++++++++
>> > net/ipv4/fib_trie.c | 12 ++
>> > 5 files changed, 542 insertions(+)
>> > create mode 100644 net/ipv4/fib_poptrie.c
>>
>> First of order of business: You need to conform to the kernels coding
>> standards!
>>
>> https://www.kernel.org/doc/html/v4.18/process/coding-style.html
>>
>> There is a script avail to check this called: scripts/checkpatch.pl
>> It summary says:
>> total: 139 errors, 238 warnings, 6 checks, 372 lines checked
>> (Not good, more error+warnings than lines...)
>>
>> Please fix up those... else people will not even read you code!
>>
>
^ permalink raw reply
* Re: [RFC/PATCH] net: nixge: Add PHYLINK support
From: Andrew Lunn @ 2018-09-05 1:01 UTC (permalink / raw)
To: Moritz Fischer
Cc: netdev, davem, f.fainelli, alex.williams, moritz.fischer,
linux-kernel
In-Reply-To: <20180905001535.19168-1-mdf@kernel.org>
> 3) I'm again not sure about the 'select PHYLINK', wouldn't
> wanna break the build again...
Hi Moritz
I think it is safe. PHYLINK has no stated dependencies on OF. But i
suspect it currently is pretty useless without OF.
> @@ -1286,7 +1329,13 @@ static int nixge_probe(struct platform_device *pdev)
> priv->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD;
> priv->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD;
>
> - err = nixge_mdio_setup(priv, pdev->dev.of_node);
> + mn = of_get_child_by_name(pdev->dev.of_node, "mdio");
> + if (!mn) {
> + dev_warn(&pdev->dev, "No \"mdio\" subnode found, defaulting to legacy\n");
> + mn = pdev->dev.of_node;
> + }
> +
> + err = nixge_mdio_setup(priv, mn);
I would suggest making this a patch of its own.
Also, do you need the legacy behaviour? If there are no boards out in
the wild which this will break, just make the change.
Please also update the device tree binding documentation.
Andrew
^ permalink raw reply
* Motorcycle Owners List
From: Audrey Tyler @ 2018-09-04 19:59 UTC (permalink / raw)
To: netdev
Hi,
Would you are interested in acquiring an email list of "Motorcycle Owners" from USA.
We also having data of Harley Davidson Owners, Car Owners List, BMW Owners List, RV Owners, Pick Up Truck Owners, Boat Owners, RV Owners List and many more...
Each record we will provide you with: Contact (First and Last name), Mailing Address and Emails Address.
Please let me know your thoughts towards procuring these Lists.
Best Regards,
Audrey Tyler
Research Analyst
^ permalink raw reply
* Re: [RFC/PATCH] net: nixge: Add PHYLINK support
From: Florian Fainelli @ 2018-09-05 0:27 UTC (permalink / raw)
To: Moritz Fischer, netdev
Cc: davem, andrew, alex.williams, moritz.fischer, linux-kernel
In-Reply-To: <20180905001535.19168-1-mdf@kernel.org>
On 09/04/2018 05:15 PM, Moritz Fischer wrote:
> Add basic PHYLINK support to driver.
>
> Suggested-by: Andrew Lunn <andrew@lunn.ch>
> Signed-off-by: Moritz Fischer <mdf@kernel.org>
> ---
>
> Hi all,
>
> as Andrew suggested in order to enable SFP as
> well as fixed-link support add PHYLINK support.
>
> A couple of questions are still open (hence the RFC):
>
> 1) It seems odd to implement PHYLINK callbacks that
> are all empty? If so, should we have generic empty
> ones in drivers/net/phy/phylink.c like we have for
> genphys?
Yes it is odd, the validate callback most certainly should not be empty,
neither should the mac_config and mac_link_{up,down}, but, with some
luck, you can get things to just work, typically with MDIO PHYs, since a
large amount of what they can do is discoverable.
If you had an existing adjust_link callback from PHYLIB, it's really
about breaking it down such that the MAC configuration of
speed/duplex/pause happens in mac_config, and the link setting (if
necessary), happens in mac_link_{up,down}, and that's pretty much it for
MLO_AN_PHY cases.
>
> 2) If this is ok, then I'll go ahead rework this with
> a DT binding update to deprecate the non-'mdio'-subnode
> case (since there are no in-tree users we might just
> change the binding)?
>
> 3) I'm again not sure about the 'select PHYLINK', wouldn't
> wanna break the build again...
>
> Thanks again for your time!
>
> Moritz
>
> ---
> drivers/net/ethernet/ni/Kconfig | 1 +
> drivers/net/ethernet/ni/nixge.c | 115 +++++++++++++++++++++++---------
> 2 files changed, 83 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/net/ethernet/ni/Kconfig b/drivers/net/ethernet/ni/Kconfig
> index c73978474c4b..80cd72948551 100644
> --- a/drivers/net/ethernet/ni/Kconfig
> +++ b/drivers/net/ethernet/ni/Kconfig
> @@ -21,6 +21,7 @@ config NI_XGE_MANAGEMENT_ENET
> depends on HAS_IOMEM && HAS_DMA
> select PHYLIB
> select OF_MDIO if OF
> + select PHYLINK
> help
> Simple LAN device for debug or management purposes. Can
> support either 10G or 1G PHYs via SFP+ ports.
> diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
> index 74cf52e3fb09..a0e790d07b1c 100644
> --- a/drivers/net/ethernet/ni/nixge.c
> +++ b/drivers/net/ethernet/ni/nixge.c
> @@ -11,6 +11,7 @@
> #include <linux/of_mdio.h>
> #include <linux/of_net.h>
> #include <linux/of_platform.h>
> +#include <linux/phylink.h>
> #include <linux/of_irq.h>
> #include <linux/skbuff.h>
> #include <linux/phy.h>
> @@ -165,7 +166,7 @@ struct nixge_priv {
> struct device *dev;
>
> /* Connection to PHY device */
> - struct device_node *phy_node;
> + struct phylink *phylink;
> phy_interface_t phy_mode;
>
> int link;
> @@ -416,20 +417,6 @@ static void nixge_device_reset(struct net_device *ndev)
> netif_trans_update(ndev);
> }
>
> -static void nixge_handle_link_change(struct net_device *ndev)
> -{
> - struct nixge_priv *priv = netdev_priv(ndev);
> - struct phy_device *phydev = ndev->phydev;
> -
> - if (phydev->link != priv->link || phydev->speed != priv->speed ||
> - phydev->duplex != priv->duplex) {
> - priv->link = phydev->link;
> - priv->speed = phydev->speed;
> - priv->duplex = phydev->duplex;
> - phy_print_status(phydev);
> - }
> -}
> -
> static void nixge_tx_skb_unmap(struct nixge_priv *priv,
> struct nixge_tx_skb *tx_skb)
> {
> @@ -859,17 +846,15 @@ static void nixge_dma_err_handler(unsigned long data)
> static int nixge_open(struct net_device *ndev)
> {
> struct nixge_priv *priv = netdev_priv(ndev);
> - struct phy_device *phy;
> int ret;
>
> nixge_device_reset(ndev);
>
> - phy = of_phy_connect(ndev, priv->phy_node,
> - &nixge_handle_link_change, 0, priv->phy_mode);
> - if (!phy)
> - return -ENODEV;
> + ret = phylink_of_phy_connect(priv->phylink, priv->dev->of_node, 0);
> + if (ret < 0)
> + return ret;
>
> - phy_start(phy);
> + phylink_start(priv->phylink);
>
> /* Enable tasklets for Axi DMA error handling */
> tasklet_init(&priv->dma_err_tasklet, nixge_dma_err_handler,
> @@ -893,8 +878,7 @@ static int nixge_open(struct net_device *ndev)
> err_rx_irq:
> free_irq(priv->tx_irq, ndev);
> err_tx_irq:
> - phy_stop(phy);
> - phy_disconnect(phy);
> + phylink_disconnect_phy(priv->phylink);
> tasklet_kill(&priv->dma_err_tasklet);
> netdev_err(ndev, "request_irq() failed\n");
> return ret;
> @@ -908,9 +892,9 @@ static int nixge_stop(struct net_device *ndev)
> netif_stop_queue(ndev);
> napi_disable(&priv->napi);
>
> - if (ndev->phydev) {
> - phy_stop(ndev->phydev);
> - phy_disconnect(ndev->phydev);
> + if (priv->phylink) {
> + phylink_stop(priv->phylink);
> + phylink_disconnect_phy(priv->phylink);
> }
>
> cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET);
> @@ -1076,13 +1060,31 @@ static int nixge_ethtools_set_phys_id(struct net_device *ndev,
> return 0;
> }
>
> +static int
> +nixge_ethtool_set_link_ksettings(struct net_device *ndev,
> + const struct ethtool_link_ksettings *cmd)
> +{
> + struct nixge_priv *priv = netdev_priv(ndev);
> +
> + return phylink_ethtool_ksettings_set(priv->phylink, cmd);
> +}
> +
> +static int
> +nixge_ethtool_get_link_ksettings(struct net_device *ndev,
> + struct ethtool_link_ksettings *cmd)
> +{
> + struct nixge_priv *priv = netdev_priv(ndev);
> +
> + return phylink_ethtool_ksettings_get(priv->phylink, cmd);
> +}
> +
> static const struct ethtool_ops nixge_ethtool_ops = {
> .get_drvinfo = nixge_ethtools_get_drvinfo,
> .get_coalesce = nixge_ethtools_get_coalesce,
> .set_coalesce = nixge_ethtools_set_coalesce,
> .set_phys_id = nixge_ethtools_set_phys_id,
> - .get_link_ksettings = phy_ethtool_get_link_ksettings,
> - .set_link_ksettings = phy_ethtool_set_link_ksettings,
> + .get_link_ksettings = nixge_ethtool_get_link_ksettings,
> + .set_link_ksettings = nixge_ethtool_set_link_ksettings,
> .get_link = ethtool_op_get_link,
> };
>
> @@ -1225,11 +1227,52 @@ static void *nixge_get_nvmem_address(struct device *dev)
> return mac;
> }
>
> +static void nixge_validate(struct net_device *ndev, unsigned long *supported,
> + struct phylink_link_state *state)
> +{
> +}
> +
> +static int nixge_mac_link_state(struct net_device *ndev,
> + struct phylink_link_state *state)
> +{
> + return 0;
> +}
> +
> +static void nixge_mac_config(struct net_device *ndev, unsigned int mode,
> + const struct phylink_link_state *state)
> +{
> +}
> +
> +static void nixge_mac_an_restart(struct net_device *ndev)
> +{
> +}
> +
> +static void nixge_mac_link_down(struct net_device *ndev, unsigned int mode,
> + phy_interface_t interface)
> +{
> +}
> +
> +static void nixge_mac_link_up(struct net_device *ndev, unsigned int mode,
> + phy_interface_t interface,
> + struct phy_device *phy)
> +{
> +}
> +
> +static const struct phylink_mac_ops nixge_phylink_ops = {
> + .validate = nixge_validate,
> + .mac_link_state = nixge_mac_link_state,
> + .mac_an_restart = nixge_mac_an_restart,
> + .mac_config = nixge_mac_config,
> + .mac_link_down = nixge_mac_link_down,
> + .mac_link_up = nixge_mac_link_up,
> +};
> +
> static int nixge_probe(struct platform_device *pdev)
> {
> struct nixge_priv *priv;
> struct net_device *ndev;
> struct resource *dmares;
> + struct device_node *mn;
> const u8 *mac_addr;
> int err;
>
> @@ -1286,7 +1329,13 @@ static int nixge_probe(struct platform_device *pdev)
> priv->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD;
> priv->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD;
>
> - err = nixge_mdio_setup(priv, pdev->dev.of_node);
> + mn = of_get_child_by_name(pdev->dev.of_node, "mdio");
> + if (!mn) {
> + dev_warn(&pdev->dev, "No \"mdio\" subnode found, defaulting to legacy\n");
> + mn = pdev->dev.of_node;
> + }
> +
> + err = nixge_mdio_setup(priv, mn);
> if (err) {
> netdev_err(ndev, "error registering mdio bus");
> goto free_netdev;
> @@ -1299,10 +1348,10 @@ static int nixge_probe(struct platform_device *pdev)
> goto unregister_mdio;
> }
>
> - priv->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
> - if (!priv->phy_node) {
> - netdev_err(ndev, "not find \"phy-handle\" property\n");
> - err = -EINVAL;
> + priv->phylink = phylink_create(ndev, pdev->dev.fwnode, priv->phy_mode,
> + &nixge_phylink_ops);
> + if (IS_ERR(priv->phylink)) {
> + err = PTR_ERR(priv->phylink);
> goto unregister_mdio;
> }
>
>
--
Florian
^ permalink raw reply
* [PATCH net-next v2 7/9] rtnetlink: s/IFLA_IF_NETNSID/IFLA_TARGET_NETNSID/g
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>
IFLA_TARGET_NETNSID is the new alias for IFLA_IF_NETNSID. This commit
replaces all occurrences of IFLA_IF_NETNSID with the new alias to
indicate that this identifier is the preferred one.
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Cc: Jiri Benc <jbenc@redhat.com>
---
v1->v2:
- patch added
v0->v1:
- patch not present
---
net/core/rtnetlink.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b36dab7507a0..67d7898db346 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1012,7 +1012,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_NEW_NETNSID */
+ nla_total_size(4) /* IFLA_NEW_IFINDEX */
+ nla_total_size(1) /* IFLA_PROTO_DOWN */
- + nla_total_size(4) /* IFLA_IF_NETNSID */
+ + nla_total_size(4) /* IFLA_TARGET_NETNSID */
+ nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ nla_total_size(4) /* IFLA_MIN_MTU */
@@ -1594,7 +1594,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
ifm->ifi_flags = dev_get_flags(dev);
ifm->ifi_change = change;
- if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+ if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
@@ -1733,7 +1733,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_XDP] = { .type = NLA_NESTED },
[IFLA_EVENT] = { .type = NLA_U32 },
[IFLA_GROUP] = { .type = NLA_U32 },
- [IFLA_IF_NETNSID] = { .type = NLA_S32 },
+ [IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
[IFLA_MIN_MTU] = { .type = NLA_U32 },
@@ -1900,8 +1900,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
ifla_policy, NULL) >= 0) {
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
if (IS_ERR(tgt_net)) {
tgt_net = net;
@@ -1989,7 +1989,7 @@ EXPORT_SYMBOL(rtnl_link_get_net);
*
* 1. IFLA_NET_NS_PID
* 2. IFLA_NET_NS_FD
- * 3. IFLA_IF_NETNSID
+ * 3. IFLA_TARGET_NETNSID
*/
static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
struct nlattr *tb[])
@@ -1999,10 +1999,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
return rtnl_link_get_net(src_net, tb);
- if (!tb[IFLA_IF_NETNSID])
+ if (!tb[IFLA_TARGET_NETNSID])
return get_net(src_net);
- net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+ net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID]));
if (!net)
return ERR_PTR(-EINVAL);
@@ -2043,13 +2043,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[],
return -EOPNOTSUPP;
}
- if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+ if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID]))
goto invalid_attr;
return 0;
@@ -2325,7 +2325,7 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
return err;
- if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
if (IS_ERR(net)) {
@@ -2768,8 +2768,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFLA_IFNAME])
nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
@@ -3178,8 +3178,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 5/9] rtnetlink: move type calculation out of loop
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>
I don't see how the type - which is one of
RTM_{GETADDR,GETROUTE,GETNETCONF} - can change. So do the message type
calculation once before entering the for loop.
Signed-off-by: Christian Brauner <christian@brauner.io>
---
v1->v2:
- unchanged
v0->v1:
- unchanged
---
net/core/rtnetlink.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 30645d9a9801..b36dab7507a0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3265,13 +3265,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
int s_idx = cb->family;
+ int type = cb->nlh->nlmsg_type - RTM_BASE;
if (s_idx == 0)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
struct rtnl_link **tab;
- int type = cb->nlh->nlmsg_type-RTM_BASE;
struct rtnl_link *link;
rtnl_dumpit_func dumpit;
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 3/9] ipv4: enable IFA_TARGET_NETNSID for RTM_GETADDR
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>
- Backwards Compatibility:
If userspace wants to determine whether ipv4 RTM_GETADDR requests
support the new IFA_TARGET_NETNSID property it should verify that the
reply includes the IFA_TARGET_NETNSID property. If it does not
userspace should assume that IFA_TARGET_NETNSID is not supported for
ipv4 RTM_GETADDR requests on this kernel.
- From what I gather from current userspace tools that make use of
RTM_GETADDR requests some of them pass down struct ifinfomsg when they
should actually pass down struct ifaddrmsg. To not break existing
tools that pass down the wrong struct we will do the same as for
RTM_GETLINK | NLM_F_DUMP requests and not error out when the
nlmsg_parse() fails.
- Security:
Callers must have CAP_NET_ADMIN in the owning user namespace of the
target network namespace.
Signed-off-by: Christian Brauner <christian@brauner.io>
---
v1->v2:
- rename from IFA_IF_NETNSID to IFA_TARGET_NETNSID
v0->v1:
- unchanged
---
net/ipv4/devinet.c | 38 ++++++++++++++++++++++++++++++--------
1 file changed, 30 insertions(+), 8 deletions(-)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ea4bd8a52422..5cb849300b81 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -1584,7 +1585,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
}
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+ u32 portid, u32 seq, int event, unsigned int flags,
+ int netnsid)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
@@ -1601,6 +1603,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, netnsid))
+ goto nla_put_failure;
+
if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
preferred = ifa->ifa_preferred_lft;
valid = ifa->ifa_valid_lft;
@@ -1648,6 +1653,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct net *tgt_net = net;
+ int netnsid = -1;
int h, s_h;
int idx, s_idx;
int ip_idx, s_ip_idx;
@@ -1660,12 +1668,23 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = idx = cb->args[1];
s_ip_idx = ip_idx = cb->args[2];
+ if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX,
+ ifa_ipv4_policy, NULL) >= 0) {
+ if (tb[IFA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
+
+ tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+ }
+
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
- head = &net->dev_index_head[h];
+ head = &tgt_net->dev_index_head[h];
rcu_read_lock();
- cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
- net->dev_base_seq;
+ cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+ tgt_net->dev_base_seq;
hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
@@ -1680,9 +1699,10 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (ip_idx < s_ip_idx)
continue;
if (inet_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) < 0) {
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR, NLM_F_MULTI,
+ netnsid) < 0) {
rcu_read_unlock();
goto done;
}
@@ -1698,6 +1718,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[0] = h;
cb->args[1] = idx;
cb->args[2] = ip_idx;
+ if (netnsid >= 0)
+ put_net(tgt_net);
return skb->len;
}
@@ -1715,7 +1737,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
if (!skb)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0, -1);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 2/9] if_addr: add IFA_TARGET_NETNSID
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>
This adds a new IFA_TARGET_NETNSID property to be used by address
families such as PF_INET and PF_INET6.
The IFA_TARGET_NETNSID property can be used to send a network namespace
identifier as part of a request. If a IFA_TARGET_NETNSID property is
identified it will be used to retrieve the target network namespace in
which the request is to be made.
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Jiri Benc <jbenc@redhat.com>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
v1->v2:
- rename from IFA_IF_NETNSID to IFA_TARGET_NETNSID
v0->v1:
- unchanged
Note, I did not change the property name to IFA_TARGET_NSID as there
was no clear agreement what would be preferred. My personal preference
is to keep the IFA_IF_NETNSID name because it aligns naturally with
the IFLA_IF_NETNSID property for RTM_*LINK requests. Jiri seems to
prefer this name too.
However, if there is agreement that another property name makes more
sense I'm happy to send a v2 that changes this.
---
include/uapi/linux/if_addr.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index ebaf5701c9db..dfcf3ce0097f 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -34,6 +34,7 @@ enum {
IFA_MULTICAST,
IFA_FLAGS,
IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */
+ IFA_TARGET_NETNSID,
__IFA_MAX,
};
--
2.17.1
^ permalink raw reply related
* Re: [PATCH net] net: phy: sfp: Handle unimplemented hwmon limits and alarms
From: David Miller @ 2018-09-04 19:23 UTC (permalink / raw)
To: andrew; +Cc: netdev, rmk+kernel, f.fainelli
In-Reply-To: <1536027836-19913-1-git-send-email-andrew@lunn.ch>
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 4 Sep 2018 04:23:56 +0200
> Not all SFPs implement the registers containing sensor limits and
> alarms. Luckily, there is a bit indicating if they are implemented or
> not. Add checking for this bit, when deciding if the hwmon attributes
> should be visible.
>
> Fixes: 1323061a018a ("net: phy: sfp: Add HWMON support for module sensors")
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Applied, thanks Andrew.
^ permalink raw reply
* Re: [PATCH net-next v2] net: sched: action_ife: take reference to meta module
From: David Miller @ 2018-09-04 19:21 UTC (permalink / raw)
To: vladbu; +Cc: netdev, xiyou.wangcong, jhs, jiri
In-Reply-To: <1536011082-2043-1-git-send-email-vladbu@mellanox.com>
From: Vlad Buslov <vladbu@mellanox.com>
Date: Tue, 4 Sep 2018 00:44:42 +0300
> Recent refactoring of add_metainfo() caused use_all_metadata() to add
> metainfo to ife action metalist without taking reference to module. This
> causes warning in module_put called from ife action cleanup function.
>
> Implement add_metainfo_and_get_ops() function that returns with reference
> to module taken if metainfo was added successfully, and call it from
> use_all_metadata(), instead of calling __add_metainfo() directly.
>
> Example warning:
...
> Fixes: 5ffe57da29b3 ("act_ife: fix a potential deadlock")
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> ---
>
> Changes V1->V2:
> - fold constants into helper function
Applied to 'net'.
^ permalink raw reply
* Re: [Patch net] act_ife: fix a potential use-after-free
From: David Miller @ 2018-09-04 19:19 UTC (permalink / raw)
To: xiyou.wangcong; +Cc: netdev, jhs
In-Reply-To: <20180903180815.32220-1-xiyou.wangcong@gmail.com>
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 3 Sep 2018 11:08:15 -0700
> Immediately after module_put(), user could delete this
> module, so e->ops could be already freed before we call
> e->ops->release().
>
> Fix this by moving module_put() after ops->release().
>
> Fixes: ef6980b6becb ("introduce IFE action")
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Applied and queued up for -stable, thanks Cong.
^ permalink raw reply
* Re: [PATCH net] net/mlx5: Fix SQ offset in QPs with small RQ
From: David Miller @ 2018-09-04 19:18 UTC (permalink / raw)
To: tariqt; +Cc: netdev, eranbe, saeedm, alaa
In-Reply-To: <1535987184-16417-1-git-send-email-tariqt@mellanox.com>
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 3 Sep 2018 18:06:24 +0300
> Correct the formula for calculating the RQ page remainder,
> which should be in byte granularity. The result will be
> non-zero only for RQs smaller than PAGE_SIZE, as an RQ size
> is a power of 2.
>
> Divide this by the SQ stride (MLX5_SEND_WQE_BB) to get the
> SQ offset in strides granularity.
>
> Fixes: d7037ad73daa ("net/mlx5: Fix QP fragmented buffer allocation")
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> Hi Dave,
> Please queue for -stable v4.18.
Applied and queued up for -stable, thanks.
^ permalink raw reply
* Re: [PATCH] neighbour: confirm neigh entries when ARP packet is received
From: David Miller @ 2018-09-04 18:57 UTC (permalink / raw)
To: ihrachys
Cc: vasilykh, roopa, adobriyan, jwestfall, stephen, anarsoul,
keescook, w.bumiller, edumazet, netdev
In-Reply-To: <CAKwN9=A6YEU5QWf9cQjHwv6L_ssOH5K_rapE6hoZtF_A4gokuQ@mail.gmail.com>
From: Ihar Hrachyshka <ihrachys@redhat.com>
Date: Tue, 4 Sep 2018 11:31:23 -0700
> Of course, I also agree that the comment will need some adjustment to
> reflect the fact that now a single timestamp is being updated. Perhaps
> while at it, Vasily could also explicitly describe in a comment which
> scenario the "if" branch check is supposed to cover. (I should have
> done it myself, mea culpa.)
Yes, that would help a lot.
^ permalink raw reply
* Re: [Patch net] tipc: call start and done ops directly in __tipc_nl_compat_dumpit()
From: Cong Wang @ 2018-09-04 18:41 UTC (permalink / raw)
To: Ying Xue; +Cc: Linux Kernel Network Developers, tipc-discussion, Jon Maloy
In-Reply-To: <941068fa-85c9-7e5b-f769-23800ca407fa@windriver.com>
On Tue, Sep 4, 2018 at 4:45 AM Ying Xue <ying.xue@windriver.com> wrote:
>
>
> On 09/04/2018 10:12 AM, Cong Wang wrote:
> > __tipc_nl_compat_dumpit() uses a netlink_callback on stack,
> > so the only way to align it with other ->dumpit() call path
> > is calling tipc_dump_start() and tipc_dump_done() directly
> > inside it. Otherwise ->dumpit() would always get NULL from
> > cb->args[0].
>
> Thank you for your fix Cong!
>
> Your solution is fine with me.
>
> When we align __tipc_nl_compat_dumpit() with ->dumpit() functions
> defined in tipc_genl_v2_ops[], cb->args[0] is used to save a
> rhashtable_iter object allocated in tipc_dump_start(), and the object
> will be retrieved with cb->args[0] in tipc_dump_done() and will be freed.
>
> But unfortunately cb->args[0] has been used to other purposes in
> tipc_nl_bearer_dump(), tipc_nl_node_dump_link(),
> tipc_nl_name_table_dump(), tipc_nl_node_dump() and tipc_nl_net_dump().
> It means cb->args[0] saved in __tipc_dump_start() will be overwritten in
> these ->dumpit() functions. As a consequence, not only the
> rhashtable_iter object allocated in tipc_dump_start() cannot be properly
> released in tipc_dump_done(), but also more kernel panics might be
> triggered in tipc_dump_done().
Ah, good catch!
The max utilization of cb->args is tipc_nl_name_table_dump():
net/tipc/name_table.c: cb->args[0] = last_type;
net/tipc/name_table.c: cb->args[1] = last_lower;
net/tipc/name_table.c: cb->args[2] = last_key;
net/tipc/name_table.c: cb->args[3] = done;
Looks like I should just use cb->args[4] for rhashtable iterator,
as we still have some room in cb->args[].
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox