Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 1/4] enetc: Clean up local mdio bus allocation
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564479919-18835-1-git-send-email-claudiu.manoil@nxp.com>

What's needed is basically a pointer to the mdio registers.
This is one way to store it inside bus->priv allocated space,
without upsetting sparse.
Reworked accessors to avoid __iomem casting.
Used devm_* variant to further clean up the init error /
remove paths.

Fixes following sparse warning:
 warning: incorrect type in assignment (different address spaces)
    expected void *priv
    got struct enetc_mdio_regs [noderef] <asn:2>*[assigned] regs

Fixes: ebfcb23d62ab ("enetc: Add ENETC PF level external MDIO support")

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v1 - added this patch
v2 - reworked accessors as per Andrew Lunn's request
v3 - cleaned up commit message
v4 - none

 .../net/ethernet/freescale/enetc/enetc_mdio.c | 94 +++++++++----------
 1 file changed, 46 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
index 77b9cd10ba2b..05094601ece8 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
@@ -8,16 +8,22 @@
 
 #include "enetc_pf.h"
 
-struct enetc_mdio_regs {
-	u32	mdio_cfg;	/* MDIO configuration and status */
-	u32	mdio_ctl;	/* MDIO control */
-	u32	mdio_data;	/* MDIO data */
-	u32	mdio_addr;	/* MDIO address */
+#define	ENETC_MDIO_REG_OFFSET	0x1c00
+#define	ENETC_MDIO_CFG	0x0	/* MDIO configuration and status */
+#define	ENETC_MDIO_CTL	0x4	/* MDIO control */
+#define	ENETC_MDIO_DATA	0x8	/* MDIO data */
+#define	ENETC_MDIO_ADDR	0xc	/* MDIO address */
+
+#define enetc_mdio_rd(hw, off) \
+	enetc_port_rd(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET)
+#define enetc_mdio_wr(hw, off, val) \
+	enetc_port_wr(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET, val)
+#define enetc_mdio_rd_reg(off)	enetc_mdio_rd(hw, off)
+
+struct enetc_mdio_priv {
+	struct enetc_hw *hw;
 };
 
-#define bus_to_enetc_regs(bus)	(struct enetc_mdio_regs __iomem *)((bus)->priv)
-
-#define ENETC_MDIO_REG_OFFSET	0x1c00
 #define ENETC_MDC_DIV		258
 
 #define MDIO_CFG_CLKDIV(x)	((((x) >> 1) & 0xff) << 8)
@@ -33,18 +39,19 @@ struct enetc_mdio_regs {
 #define MDIO_DATA(x)		((x) & 0xffff)
 
 #define TIMEOUT	1000
-static int enetc_mdio_wait_complete(struct enetc_mdio_regs __iomem *regs)
+static int enetc_mdio_wait_complete(struct enetc_hw *hw)
 {
 	u32 val;
 
-	return readx_poll_timeout(enetc_rd_reg, &regs->mdio_cfg, val,
+	return readx_poll_timeout(enetc_mdio_rd_reg, MDIO_CFG, val,
 				  !(val & MDIO_CFG_BSY), 10, 10 * TIMEOUT);
 }
 
 static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 			    u16 value)
 {
-	struct enetc_mdio_regs __iomem *regs = bus_to_enetc_regs(bus);
+	struct enetc_mdio_priv *mdio_priv = bus->priv;
+	struct enetc_hw *hw = mdio_priv->hw;
 	u32 mdio_ctl, mdio_cfg;
 	u16 dev_addr;
 	int ret;
@@ -59,29 +66,29 @@ static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 		mdio_cfg &= ~MDIO_CFG_ENC45;
 	}
 
-	enetc_wr_reg(&regs->mdio_cfg, mdio_cfg);
+	enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* set port and dev addr */
 	mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr);
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl);
 
 	/* set the register address */
 	if (regnum & MII_ADDR_C45) {
-		enetc_wr_reg(&regs->mdio_addr, regnum & 0xffff);
+		enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff);
 
-		ret = enetc_mdio_wait_complete(regs);
+		ret = enetc_mdio_wait_complete(hw);
 		if (ret)
 			return ret;
 	}
 
 	/* write the value */
-	enetc_wr_reg(&regs->mdio_data, MDIO_DATA(value));
+	enetc_mdio_wr(hw, MDIO_DATA, MDIO_DATA(value));
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
@@ -90,7 +97,8 @@ static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 
 static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 {
-	struct enetc_mdio_regs __iomem *regs = bus_to_enetc_regs(bus);
+	struct enetc_mdio_priv *mdio_priv = bus->priv;
+	struct enetc_hw *hw = mdio_priv->hw;
 	u32 mdio_ctl, mdio_cfg;
 	u16 dev_addr, value;
 	int ret;
@@ -104,41 +112,41 @@ static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 		mdio_cfg &= ~MDIO_CFG_ENC45;
 	}
 
-	enetc_wr_reg(&regs->mdio_cfg, mdio_cfg);
+	enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* set port and device addr */
 	mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr);
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl);
 
 	/* set the register address */
 	if (regnum & MII_ADDR_C45) {
-		enetc_wr_reg(&regs->mdio_addr, regnum & 0xffff);
+		enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff);
 
-		ret = enetc_mdio_wait_complete(regs);
+		ret = enetc_mdio_wait_complete(hw);
 		if (ret)
 			return ret;
 	}
 
 	/* initiate the read */
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl | MDIO_CTL_READ);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl | MDIO_CTL_READ);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* return all Fs if nothing was there */
-	if (enetc_rd_reg(&regs->mdio_cfg) & MDIO_CFG_RD_ER) {
+	if (enetc_mdio_rd(hw, MDIO_CFG) & MDIO_CFG_RD_ER) {
 		dev_dbg(&bus->dev,
 			"Error while reading PHY%d reg at %d.%hhu\n",
 			phy_id, dev_addr, regnum);
 		return 0xffff;
 	}
 
-	value = enetc_rd_reg(&regs->mdio_data) & 0xffff;
+	value = enetc_mdio_rd(hw, MDIO_DATA) & 0xffff;
 
 	return value;
 }
@@ -146,12 +154,12 @@ static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 int enetc_mdio_probe(struct enetc_pf *pf)
 {
 	struct device *dev = &pf->si->pdev->dev;
-	struct enetc_mdio_regs __iomem *regs;
+	struct enetc_mdio_priv *mdio_priv;
 	struct device_node *np;
 	struct mii_bus *bus;
-	int ret;
+	int err;
 
-	bus = mdiobus_alloc_size(sizeof(regs));
+	bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv));
 	if (!bus)
 		return -ENOMEM;
 
@@ -159,41 +167,31 @@ int enetc_mdio_probe(struct enetc_pf *pf)
 	bus->read = enetc_mdio_read;
 	bus->write = enetc_mdio_write;
 	bus->parent = dev;
+	mdio_priv = bus->priv;
+	mdio_priv->hw = &pf->si->hw;
 	snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
 
-	/* store the enetc mdio base address for this bus */
-	regs = pf->si->hw.port + ENETC_MDIO_REG_OFFSET;
-	bus->priv = regs;
-
 	np = of_get_child_by_name(dev->of_node, "mdio");
 	if (!np) {
 		dev_err(dev, "MDIO node missing\n");
-		ret = -EINVAL;
-		goto err_registration;
+		return -EINVAL;
 	}
 
-	ret = of_mdiobus_register(bus, np);
-	if (ret) {
+	err = of_mdiobus_register(bus, np);
+	if (err) {
 		of_node_put(np);
 		dev_err(dev, "cannot register MDIO bus\n");
-		goto err_registration;
+		return err;
 	}
 
 	of_node_put(np);
 	pf->mdio = bus;
 
 	return 0;
-
-err_registration:
-	mdiobus_free(bus);
-
-	return ret;
 }
 
 void enetc_mdio_remove(struct enetc_pf *pf)
 {
-	if (pf->mdio) {
+	if (pf->mdio)
 		mdiobus_unregister(pf->mdio);
-		mdiobus_free(pf->mdio);
-	}
 }
-- 
2.17.1


^ permalink raw reply related

* RE: [PATCH] powerpc/kmcent2: update the ethernet devices' phy properties
From: Madalin-cristian Bucur @ 2019-07-30  9:44 UTC (permalink / raw)
  To: Scott Wood, Valentin Longchamp, linuxppc-dev@lists.ozlabs.org,
	galak@kernel.crashing.org
  Cc: netdev@vger.kernel.org
In-Reply-To: <2243421e574c72c5e75d27cc0122338e2e0bde63.camel@buserror.net>

> -----Original Message-----
> From: Scott Wood <oss@buserror.net>
> Sent: Sunday, July 28, 2019 10:27 PM
> To: Valentin Longchamp <valentin@longchamp.me>; linuxppc-
> dev@lists.ozlabs.org; galak@kernel.crashing.org
> Cc: Madalin-cristian Bucur <madalin.bucur@nxp.com>
> Subject: Re: [PATCH] powerpc/kmcent2: update the ethernet devices' phy
> properties
> 
> On Sun, 2019-07-28 at 18:01 +0200, Valentin Longchamp wrote:
> > Hi Scott, Kumar,
> >
> > Looking at this patch I have realised that I had already submitted it
> > to the mailing list nearly 2 years ago:
> >
> > https://patchwork.ozlabs.org/patch/842944/
> >
> > Could you please make sure that this one gets merged in the next
> > window, so that I avoid forgetting such a patch a 2nd time ?
> >
> > Thanks a lot
> 
> I added it to my patchwork todo list; thanks for the reminder.
> 
> > Le dim. 14 juil. 2019 à 22:05, Valentin Longchamp
> > <valentin@longchamp.me> a écrit :
> > >
> > > Change all phy-connection-type properties to phy-mode that are better
> > > supported by the fman driver.
> > >
> > > Use the more readable fixed-link node for the 2 sgmii links.
> > >
> > > Change the RGMII link to rgmii-id as the clock delays are added by the
> > > phy.
> > >
> > > Signed-off-by: Valentin Longchamp <valentin@longchamp.me>
> 
> I don't see any other uses of phy-mode in arch/powerpc/boot/dts/fsl, and I see
> lots of phy-connection-type with fman.  Madalin, does this patch look OK?
> 
> -Scott

Hi,

we are using "phy-connection-type" not "phy-mode" for the NXP (former Freescale)
DPAA platforms. While the two seem to be interchangeable ("phy-mode" seems to be
more recent, looking at the device tree bindings), the driver code in Linux seems
to use one or the other, not both so one should stick with the variant the driver
is using. To make things more complex, there may be dependencies in bootloaders,
I see code in u-boot using only "phy-connection-type" or only "phy-mode".

I'd leave "phy-connection-type" as is.

Madalin

> > > ---
> > >  arch/powerpc/boot/dts/fsl/kmcent2.dts | 16 +++++++++++-----
> > >  1 file changed, 11 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > index 48b7f9797124..c3e0741cafb1 100644
> > > --- a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > +++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > @@ -210,13 +210,19 @@
> > >
> > >                 fman@400000 {
> > >                         ethernet@e0000 {
> > > -                               fixed-link = <0 1 1000 0 0>;
> > > -                               phy-connection-type = "sgmii";
> > > +                               phy-mode = "sgmii";
> > > +                               fixed-link {
> > > +                                       speed = <1000>;
> > > +                                       full-duplex;
> > > +                               };
> > >                         };
> > >
> > >                         ethernet@e2000 {
> > > -                               fixed-link = <1 1 1000 0 0>;
> > > -                               phy-connection-type = "sgmii";
> > > +                               phy-mode = "sgmii";
> > > +                               fixed-link {
> > > +                                       speed = <1000>;
> > > +                                       full-duplex;
> > > +                               };
> > >                         };
> > >
> > >                         ethernet@e4000 {
> > > @@ -229,7 +235,7 @@
> > >
> > >                         ethernet@e8000 {
> > >                                 phy-handle = <&front_phy>;
> > > -                               phy-connection-type = "rgmii";
> > > +                               phy-mode = "rgmii-id";
> > >                         };
> > >
> > >                         mdio0: mdio@fc000 {
> > > --
> > > 2.17.1
> > >
> >
> >


^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-30  9:21 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: Ido Schimmel, Horatiu Vultur, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <13f66ebe-4173-82d7-604b-08e9d33d9aff@cumulusnetworks.com>

The 07/30/2019 11:58, Nikolay Aleksandrov wrote:
> On 30/07/2019 11:30, Allan W. Nielsen wrote:
> > The 07/30/2019 10:06, Ido Schimmel wrote:
> >> On Tue, Jul 30, 2019 at 08:27:22AM +0200, Allan W. Nielsen wrote:
> >>> The 07/29/2019 20:51, Ido Schimmel wrote:
> >>>> Can you please clarify what you're trying to achieve? I just read the
> >>>> thread again and my impression is that you're trying to locally receive
> >>>> packets with a certain link layer multicast address.
> >>> Yes. The thread is also a bit confusing because we half way through realized
> >>> that we misunderstood how the multicast packets should be handled (sorry about
> >>> that). To begin with we had a driver where multicast packets was only copied to
> >>> the CPU if someone needed it. Andrew and Nikolay made us aware that this is not
> >>> how other drivers are doing it, so we changed the driver to include the CPU in
> >>> the default multicast flood-mask.
> >> OK, so what prevents you from removing all other ports from the
> >> flood-mask and letting the CPU handle the flooding? Then you can install
> >> software tc filters to limit the flooding.
> > I do not have the bandwidth to forward the multicast traffic in the CPU.
> > 
> > It will also cause enormous latency on the forwarding of L2 multicast packets.
> > 
> >>> This changes the objective a bit. To begin with we needed to get more packets to
> >>> the CPU (which could have been done using tc ingress rules and a trap action).
> >>>
> >>> Now after we changed the driver, we realized that we need something to limit the
> >>> flooding of certain L2 multicast packets. This is the new problem we are trying
> >>> to solve!
> >>>
> >>> Example: Say we have a bridge with 4 slave interfaces, then we want to install a
> >>> forwarding rule saying that packets to a given L2-multicast MAC address, should
> >>> only be flooded to 2 of the 4 ports.
> >>>
> >>> (instead of adding rules to get certain packets to the CPU, we are now adding
> >>> other rules to prevent other packets from going to the CPU and other ports where
> >>> they are not needed/wanted).
> >>>
> >>> This is exactly the same thing as IGMP snooping does dynamically, but only for
> >>> IP multicast.
> >>>
> >>> The "bridge mdb" allow users to manually/static add/del a port to a multicast
> >>> group, but still it operates on IP multicast address (not L2 multicast
> >>> addresses).
> >>>
> >>>> Nik suggested SIOCADDMULTI.
> >>> It is not clear to me how this should be used to limit the flooding, maybe we
> >>> can make some hacks, but as far as I understand the intend of this is maintain
> >>> the list of addresses an interface should receive. I'm not sure this should
> >>> influence how for forwarding decisions are being made.
> >>>
> >>>> and I suggested a tc filter to get the packet to the CPU.
> >>> The TC solution is a good solution to the original problem where wanted to copy
> >>> more frames to the CPU. But we were convinced that this is not the right
> >>> approach, and that the CPU by default should receive all multicast packets, and
> >>> we should instead try to find a way to limit the flooding of certain frames as
> >>> an optimization.
> >>
> >> This can still work. In Linux, ingress tc filters are executed before the
> >> bridge's Rx handler. The same happens in every sane HW. Ingress ACL is
> >> performed before L2 forwarding. Assuming you have eth0-eth3 bridged and
> >> you want to prevent packets with DMAC 01:21:6C:00:00:01 from egressing
> >> eth2:
> >>
> >> # tc filter add dev eth0 ingress pref 1 flower skip_sw \
> >> 	dst_mac 01:21:6C:00:00:01 action trap
> >> # tc filter add dev eth2 egress pref 1 flower skip_hw \
> >> 	dst_mac 01:21:6C:00:00:01 action drop
> >>
> >> The first filter is only present in HW ('skip_sw') and should result in
> >> your HW passing you the sole copy of the packet.
> > Agree.
> > 
> >> The second filter is only present in SW ('skip_hw', not using HW egress
> >> ACL that you don't have) and drops the packet after it was flooded by
> >> the SW bridge.
> > Agree.
> > 
> >> As I mentioned earlier, you can install the filter once in your HW and
> >> share it between different ports using a shared block. This means you
> >> only consume one TCAM entry.
> >>
> >> Note that this allows you to keep flooding all other multicast packets
> >> in HW.
> > Yes, but the frames we want to limit the flood-mask on are the exact frames
> > which occurs at a very high rate, and where latency is important.
> > 
> > I really do not consider it as an option to forward this in SW, when it is
> > something that can easily be offloaded in HW.
> > 
> >>>> If you now want to limit the ports to which this packet is flooded, then
> >>>> you can use tc filters in *software*:
> >>>>
> >>>> # tc qdisc add dev eth2 clsact
> >>>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
> >>>> 	dst_mac 01:21:6C:00:00:01 action drop
> >>> Yes. This can work in the SW bridge.
> >>>
> >>>> If you want to forward the packet in hardware and locally receive it,
> >>>> you can chain several mirred action and then a trap action.
> >>> I'm not I fully understand how this should be done, but it does sound like it
> >>> becomes quite complicated. Also, as far as I understand it will mean that we
> >>> will be using TCAM/ACL resources to do something that could have been done with
> >>> a simple MAC entry.
> >>>
> >>>> Both options avoid HW egress ACLs which your design does not support.
> >>> True, but what is wrong with expanding the functionality of the normal
> >>> forwarding/MAC operations to allow multiple destinations?
> >>>
> >>> It is not an uncommon feature (I just browsed the manual of some common L2
> >>> switches and they all has this feature).
> >>>
> >>> It seems to fit nicely into the existing user-interface:
> >>>
> >>> bridge fdb add    01:21:6C:00:00:01 port eth0
> >>> bridge fdb append 01:21:6C:00:00:01 port eth1
> >>
> >> Wouldn't it be better to instead extend the MDB entries so that they are
> >> either keyed by IP or MAC? I believe FDB should remain as unicast-only.
> > 
> > You might be right, it was not clear to me which of the two would fit the
> > purpose best.
> > 
> > From a user-space iproute2 perspective I prefer using the "bridge fdb" command
> > as it already supports the needed syntax, and I do not think it will be too
> > pretty if we squeeze this into the "bridge mdb" command syntax.
> > 
> 
> MDB is a much better fit as Ido already suggested. FDB should remain unicast
> and mixing them is not a good idea, we already have a good ucast/mcast separation
> and we'd like to keep it that way.
Okay. We will explore that option.


> > But that does not mean that it need to go into the FDB database in the
> > implementation.
> > 
> > Last evening when I looked at it again, I was considering keeping the
> > net_bridge_fdb_entry structure as is, and add a new hashtable with the
> > following:
> > 
> > struct net_bridge_fdbmc_entry {
> > 	struct rhash_head		rhnode;
> > 	struct net_bridge_fdbmc_ports   *dst;
> > 
> > 	struct net_bridge_fdb_key	key;
> > 	struct hlist_node		fdb_node;
> > 	unsigned char			offloaded:1;
> > 
> > 	struct rcu_head			rcu;
> > };
> > 
> 
> What would the notification for this look like ?
Not sure. But we will change the direction and use the MDB structures instead.

> > If we go with this approach then we can look at the MAC address and see if it is
> > a unicast which will cause a lookup in the fdb, l3-multicast (33:33:* or
> > 01:00:5e:*) which will cause a lookup in the mdb, or finally a fdbmc which will
> > need to do a lookup in this new hashtable.
> 
> That sounds wrong, you will change the current default behaviour of flooding these
> packets. This will have to be well hidden behind a new option and enabled only on user
> request.
It will only affect users who install a static L2-multicast entry. If no entry
is found, it will default to flooding, which will be the same as before.

> > Alternative it would be like this:
> > 
> > struct net_bridge_fdb_entry {
> > 	struct rhash_head		rhnode;
> > 	union net_bridge_port_or_list	*dst;
> > 
> > 	struct net_bridge_fdb_key	key;
> > 	struct hlist_node		fdb_node;
> > 	unsigned char			is_local:1,
> > 					is_static:1,
> > 					is_sticky:1,
> > 					added_by_user:1,
> > 					added_by_external_learn:1,
> > 					offloaded:1;
> > 					multi_dst:1;
> > 
> > 	/* write-heavy members should not affect lookups */
> > 	unsigned long			updated ____cacheline_aligned_in_smp;
> > 	unsigned long			used;
> > 
> > 	struct rcu_head			rcu;
> > };
> > 
> > Both solutions should require fairly few changes, and should not cause any
> > measurable performance hit.
> > 
> 
> You'll have to convert these bits to use the proper atomic bitops if you go with
> the second solution. That has to be done even today, but the second case would
> make it a must.
Good to know.

Just for my understanding, is this because this is the "current" guide lines on
how things should be done, or is this because the multi_dst as a special need.

The multi_dst flag will never be changed in the life-cycle of the structure, and
the structure is protected by rcu. If this is causeing a raise, then I do not
see it.

> > Making it fit into the net_bridge_mdb_entry seems to be harder.
> > 
> 
> But it is the correct abstraction from bridge POV, so please stop trying to change
> the FDB code and try to keep to the multicast code.
We are planning on letting the net_bridge_port_or_list union use the
net_bridge_port_group structure, which will mean that we can re-use the
br_multicast_flood function (if we change the signatire to accept the ports
instead of the entry).

> >> As a bonus, existing drivers could benefit from it, as MDB entries are already
> >> notified by MAC.
> > Not sure I follow. When FDB entries are added, it also generates notification
> > events.
> > 
> 
> Could you please show fdb event with multiple ports ?
We will get to that. Maybe this is an argument for converting to mdb. We have
not looked into the details of this yet.

> >>> It seems that it can be added to the existing implementation with out adding
> >>> significant complexity.
> >>>
> >>> It will be easy to offload in HW.
> >>>
> >>> I do not believe that it will be a performance issue, if this is a concern then
> >>> we may have to do a bit of benchmarking, or we can make it a configuration
> >>> option.
> >>>
> >>> Long story short, we (Horatiu and I) learned a lot from the discussion here, and
> >>> I think we should try do a new patch with the learning we got. Then it is easier
> >>> to see what it actually means to the exiting code, complexity, exiting drivers,
> >>> performance, default behavioral, backwards compatibly, and other valid concerns.
> >>>
> >>> If the patch is no good, and cannot be fixed, then we will go back and look
> >>> further into alternative solutions.
> >> Overall, I tend to agree with Nik. I think your use case is too specific
> >> to justify the amount of changes you want to make in the bridge driver.
> >> We also provided other alternatives. That being said, you're more than
> >> welcome to send the patches and we can continue the discussion then.
> > Okay, good to know. I'm not sure I agree that the alternative solutions really
> > solves the issue this is trying to solve, nor do I agree that this is specific
> > to our needs.
> > 
> > But lets take a look at a new patch, and see what is the amount of changes we
> > are talking about. Without having the patch it is really hard to know for sure.
> Please keep in mind that this case is the exception, not the norm, thus it should
> not under any circumstance affect the standard deployments.
Understood - no surprises.

-- 
/Allan

^ permalink raw reply

* Re: net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
From: Su Yanjun @ 2019-07-30  9:39 UTC (permalink / raw)
  To: Mark Smith, netdev
In-Reply-To: <CAO42Z2yN=FfueKAjb0KjY8-CdiKuvkJDr2iJdJR4XdKM90HJRg@mail.gmail.com>


在 2019/7/30 16:15, Mark Smith 写道:
> Hi,
>
> I'm not subscribed to the Linux netdev mailing list, so I can't
> directly reply to the patch email.
>
> This patch is not the correct solution to this issue.
>
> Per RFC 4291 "IP Version 6 Addressing Architecture", all IPv6
> interfaces are required to have Link-Local addresses, so therefore
> there should always be a link-local address available to use as the
> source address for an ND NS.

In linux implementation, one interface may have no link local address if 
kernel config

*addr_gen_mode* is set to IN6_ADDR_GEN_MODE_NONE. My patch is to fix 
this problem.

And what you say is related to the lo interface.  I'm not sure whether 
the lo interface needs a ll adreess.

IMO the ll address is used to get l2 address by sending ND ns. The lo is 
very special.

Thanks

Su

>
> "2.1. Addressing Model"
>
> ...
>
> "All interfaces are required to have at least one Link-Local unicast
>     address (see Section 2.8 for additional required addresses)."
>
> I have submitted a more specific bug regarding no Link-Local
> address/prefix on the Linux kernel loopback interface through RedHat
> bugzilla as I use Fedora 30, however it doesn't seem to have been
> looked at yet.
>
> "Loopback network interface does not have a Link Local address,
> contrary to RFC 4291"
> https://bugzilla.redhat.com/show_bug.cgi?id=1706709
>
>
> Thanks very much,
> Mark.
>
>



^ permalink raw reply

* Re: [PATCH v4 0/5] vsock/virtio: optimizations to increase the throughput
From: Stefano Garzarella @ 2019-07-30  9:40 UTC (permalink / raw)
  To: Michael S. Tsirkin, Stefan Hajnoczi, David S. Miller
  Cc: netdev, linux-kernel, virtualization, Jason Wang, kvm
In-Reply-To: <20190729095743-mutt-send-email-mst@kernel.org>

On Mon, Jul 29, 2019 at 09:59:23AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 17, 2019 at 01:30:25PM +0200, Stefano Garzarella wrote:
> > This series tries to increase the throughput of virtio-vsock with slight
> > changes.
> > While I was testing the v2 of this series I discovered an huge use of memory,
> > so I added patch 1 to mitigate this issue. I put it in this series in order
> > to better track the performance trends.
> 
> Series:
> 
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Can this go into net-next?
> 

I think so.
Michael, Stefan thanks to ack the series!

Should I resend it with net-next tag?

Thanks,
Stefano

^ permalink raw reply

* RE: [PATCH net-next 3/3] net: stmmac: Introducing support for Page Pool
From: Jose Abreu @ 2019-07-30  9:39 UTC (permalink / raw)
  To: Jon Hunter, Jose Abreu, Robin Murphy,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org, Catalin Marinas,
	Will Deacon
  Cc: Joao Pinto, Alexandre Torgue, Maxime Ripard, Chen-Yu Tsai,
	Maxime Coquelin, linux-tegra, Giuseppe Cavallaro,
	David S . Miller
In-Reply-To: <8a60361f-b914-93ef-0d80-92ae4ad8b808@nvidia.com>

[-- Attachment #1: Type: text/plain, Size: 2703 bytes --]

From: Jon Hunter <jonathanh@nvidia.com>
Date: Jul/29/2019, 22:33:04 (UTC+00:00)

> 
> On 29/07/2019 15:08, Jose Abreu wrote:
> 
> ...
> 
> >>> Hi Catalin and Will,
> >>>
> >>> Sorry to add you in such a long thread but we are seeing a DMA issue
> >>> with stmmac driver in an ARM64 platform with IOMMU enabled.
> >>>
> >>> The issue seems to be solved when buffers allocation for DMA based
> >>> transfers are *not* mapped with the DMA_ATTR_SKIP_CPU_SYNC flag *OR*
> >>> when IOMMU is disabled.
> >>>
> >>> Notice that after transfer is done we do use
> >>> dma_sync_single_for_{cpu,device} and then we reuse *the same* page for
> >>> another transfer.
> >>>
> >>> Can you please comment on whether DMA_ATTR_SKIP_CPU_SYNC can not be used
> >>> in ARM64 platforms with IOMMU ?
> >>
> >> In terms of what they do, there should be no difference on arm64 between:
> >>
> >> dma_map_page(..., dir);
> >> ...
> >> dma_unmap_page(..., dir);
> >>
> >> and:
> >>
> >> dma_map_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);
> >> dma_sync_single_for_device(..., dir);
> >> ...
> >> dma_sync_single_for_cpu(..., dir);
> >> dma_unmap_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);
> >>
> >> provided that the first sync covers the whole buffer and any subsequent 
> >> ones cover at least the parts of the buffer which may have changed. Plus 
> >> for coherent hardware it's entirely moot either way.
> > 
> > Thanks for confirming. That's indeed what stmmac is doing when buffer is 
> > received by syncing the packet size to CPU.
> > 
> >>
> >> Given Jon's previous findings, I would lean towards the idea that 
> >> performing the extra (redundant) cache maintenance plus barrier in 
> >> dma_unmap is mostly just perturbing timing in the same way as the debug 
> >> print which also made things seem OK.
> > 
> > Mikko said that Tegra186 is not coherent so we have to explicit flush 
> > pipeline but I don't understand why sync_single() is not doing it ...
> > 
> > Jon, can you please remove *all* debug prints, hacks, etc ... and test 
> > this one in attach with plain -net tree ?
> 
> So far I have just been testing on the mainline kernel branch. The issue
> still persists after applying this on mainline. I can test on the -net
> tree, but I am not sure that will make a difference.
> 
> Cheers
> Jon
> 
> -- 
> nvpublic

I looked at netsec implementation and I noticed that we are syncing the 
old buffer for device instead of the new one. netsec syncs the buffer 
for device immediately after the allocation which may be what we have to 
do. Maybe the attached patch can make things work for you ?

---
Thanks,
Jose Miguel Abreu

[-- Attachment #2: 0001-net-stmmac-Sync-RX-Buffer-upon-allocation.patch --]
[-- Type: application/octet-stream, Size: 2465 bytes --]

From 3601e3ae4357d48b3294f42781d0f19095d1b00e Mon Sep 17 00:00:00 2001
Message-Id: <3601e3ae4357d48b3294f42781d0f19095d1b00e.1564479382.git.joabreu@synopsys.com>
From: Jose Abreu <joabreu@synopsys.com>
Date: Tue, 30 Jul 2019 11:36:13 +0200
Subject: [PATCH net] net: stmmac: Sync RX Buffer upon allocation

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 98b1a5c6d537..9a4a56ad35cd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3271,9 +3271,11 @@ static inline int stmmac_rx_threshold_count(struct stmmac_rx_queue *rx_q)
 static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 {
 	struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
-	int dirty = stmmac_rx_dirty(priv, queue);
+	int len, dirty = stmmac_rx_dirty(priv, queue);
 	unsigned int entry = rx_q->dirty_rx;
 
+	len = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE) * PAGE_SIZE;
+
 	while (dirty-- > 0) {
 		struct stmmac_rx_buffer *buf = &rx_q->buf_pool[entry];
 		struct dma_desc *p;
@@ -3291,6 +3293,13 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 		}
 
 		buf->addr = page_pool_get_dma_addr(buf->page);
+
+		/* Sync whole allocation to device. This will invalidate old
+		 * data.
+		 */
+		dma_sync_single_for_device(priv->device, buf->addr, len,
+					   DMA_FROM_DEVICE);
+
 		stmmac_set_desc_addr(priv, p, buf->addr);
 		stmmac_refill_desc3(priv, rx_q, p);
 
@@ -3425,8 +3434,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			skb_copy_to_linear_data(skb, page_address(buf->page),
 						frame_len);
 			skb_put(skb, frame_len);
-			dma_sync_single_for_device(priv->device, buf->addr,
-						   frame_len, DMA_FROM_DEVICE);
 
 			if (netif_msg_pktdata(priv)) {
 				netdev_dbg(priv->dev, "frame received (%dbytes)",
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] net: ceph: Fix a possible null-pointer dereference in ceph_crypto_key_destroy()
From: Ilya Dryomov @ 2019-07-30  9:41 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: Jeff Layton, Sage Weil, David S. Miller, Ceph Development, netdev,
	LKML
In-Reply-To: <20190724094306.1866-1-baijiaju1990@gmail.com>

On Wed, Jul 24, 2019 at 11:43 AM Jia-Ju Bai <baijiaju1990@gmail.com> wrote:
>
> In set_secret(), key->tfm is assigned to NULL on line 55, and then
> ceph_crypto_key_destroy(key) is executed.
>
> ceph_crypto_key_destroy(key)
>     crypto_free_sync_skcipher(key->tfm)
>         crypto_skcipher_tfm(tfm)
>             return &tfm->base;
>
> Thus, a possible null-pointer dereference may occur.
>
> To fix this bug, key->tfm is checked before calling
> crypto_free_sync_skcipher().
>
> This bug is found by a static analysis tool STCheck written by us.
>
> Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
> ---
>  net/ceph/crypto.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
> index 5d6724cee38f..ac28463bcfd8 100644
> --- a/net/ceph/crypto.c
> +++ b/net/ceph/crypto.c
> @@ -136,7 +136,8 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
>         if (key) {
>                 kfree(key->key);
>                 key->key = NULL;
> -               crypto_free_sync_skcipher(key->tfm);
> +               if (key->tfm)
> +                       crypto_free_sync_skcipher(key->tfm);
>                 key->tfm = NULL;
>         }
>  }

Hi Jia-Ju,

Yeah, looks like the only reason this continued to work after
69d6302b65a8 ("libceph: Remove VLA usage of skcipher") is because
crypto_sync_skcipher is a trivial wrapper around crypto_skcipher
added just for type checking AFAICT.

struct crypto_sync_skcipher {
    struct crypto_skcipher base;
};

Before that ceph_crypto_key_destroy() used crypto_free_skcipher(),
which is safe to call on a NULL tfm.

Applied with a slight modification -- I moved key->tfm = NULL under
the new if and amended the changelog.

https://github.com/ceph/ceph-client/commit/b3d79916ff99074d289d66f1643b423ae0008c50

Thanks,

                Ilya

^ permalink raw reply

* KMSAN: uninit-value in read_eprom_word
From: syzbot @ 2019-07-30  9:38 UTC (permalink / raw)
  To: davem, glider, linux-kernel, linux-usb, netdev, petkan,
	syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    3351e2b9 usb-fuzzer: main usb gadget fuzzer driver
git tree:       kmsan
console output: https://syzkaller.appspot.com/x/log.txt?x=13105d85a00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=40511ad0c5945201
dashboard link: https://syzkaller.appspot.com/bug?extid=3499a83b2d062ae409d4
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1257755ea00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1327e5a5a00000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3499a83b2d062ae409d4@syzkaller.appspotmail.com

usb 1-1: Using ep0 maxpacket: 8
usb 1-1: config 0 has an invalid interface number: 150 but max is 0
usb 1-1: config 0 has no interface number 0
usb 1-1: New USB device found, idVendor=050d, idProduct=0122,  
bcdDevice=c1.69
usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
usb 1-1: config 0 descriptor??
==================================================================
BUG: KMSAN: uninit-value in read_eprom_word+0x947/0xdd0  
drivers/net/usb/pegasus.c:298
CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.2.0-rc4+ #5
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Workqueue: usb_hub_wq hub_event
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x191/0x1f0 lib/dump_stack.c:113
  kmsan_report+0x162/0x2d0 mm/kmsan/kmsan.c:611
  __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:304
  read_eprom_word+0x947/0xdd0 drivers/net/usb/pegasus.c:298
  get_interrupt_interval drivers/net/usb/pegasus.c:758 [inline]
  pegasus_probe+0xf2b/0x4be0 drivers/net/usb/pegasus.c:1192
  usb_probe_interface+0xd19/0x1310 drivers/usb/core/driver.c:361
  really_probe+0x1344/0x1d90 drivers/base/dd.c:513
  driver_probe_device+0x1ba/0x510 drivers/base/dd.c:670
  __device_attach_driver+0x5b8/0x790 drivers/base/dd.c:777
  bus_for_each_drv+0x28e/0x3b0 drivers/base/bus.c:454
  __device_attach+0x489/0x750 drivers/base/dd.c:843
  device_initial_probe+0x4a/0x60 drivers/base/dd.c:890
  bus_probe_device+0x131/0x390 drivers/base/bus.c:514
  device_add+0x25b5/0x2df0 drivers/base/core.c:2111
  usb_set_configuration+0x309f/0x3710 drivers/usb/core/message.c:2027
  generic_probe+0xe7/0x280 drivers/usb/core/generic.c:210
  usb_probe_device+0x146/0x200 drivers/usb/core/driver.c:266
  really_probe+0x1344/0x1d90 drivers/base/dd.c:513
  driver_probe_device+0x1ba/0x510 drivers/base/dd.c:670
  __device_attach_driver+0x5b8/0x790 drivers/base/dd.c:777
  bus_for_each_drv+0x28e/0x3b0 drivers/base/bus.c:454
  __device_attach+0x489/0x750 drivers/base/dd.c:843
  device_initial_probe+0x4a/0x60 drivers/base/dd.c:890
  bus_probe_device+0x131/0x390 drivers/base/bus.c:514
  device_add+0x25b5/0x2df0 drivers/base/core.c:2111
  usb_new_device+0x23e5/0x2fb0 drivers/usb/core/hub.c:2534
  hub_port_connect drivers/usb/core/hub.c:5089 [inline]
  hub_port_connect_change drivers/usb/core/hub.c:5204 [inline]
  port_event drivers/usb/core/hub.c:5350 [inline]
  hub_event+0x5853/0x7320 drivers/usb/core/hub.c:5432
  process_one_work+0x1572/0x1f00 kernel/workqueue.c:2269
  worker_thread+0x111b/0x2460 kernel/workqueue.c:2415
  kthread+0x4b5/0x4f0 kernel/kthread.c:256
  ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:355

Local variable description: ----data.addr.i13@read_eprom_word
Variable was created at:
  set_register drivers/net/usb/pegasus.c:174 [inline]
  read_eprom_word+0x498/0xdd0 drivers/net/usb/pegasus.c:294
  get_interrupt_interval drivers/net/usb/pegasus.c:758 [inline]
  pegasus_probe+0xf2b/0x4be0 drivers/net/usb/pegasus.c:1192
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH] rtw88: pci: Use general byte arrays as the elements of RX ring
From: Stanislaw Gruszka @ 2019-07-30  9:35 UTC (permalink / raw)
  To: Jian-Hong Pan
  Cc: Yan-Hsuan Chuang, Kalle Valo, David S . Miller, David Laight,
	linux-wireless, netdev, linux-kernel, linux, stable
In-Reply-To: <20190725080925.6575-1-jian-hong@endlessm.com>

On Thu, Jul 25, 2019 at 04:09:26PM +0800, Jian-Hong Pan wrote:
> Each skb as the element in RX ring was expected with sized buffer 8216
> (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size is
> 16640 bytes for alignment after allocated, x86_64 for example. And, the

rtw88 advertise IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454, so maximum AMSDU
packet can be approximately 12kB. This might be accidental, but having
16kB skb's allow to handle such big AMSDUs. If you shrink buf size,
you can probably override memory after buffer end.

> difference will be enlarged 512 times (RTK_MAX_RX_DESC_NUM).
> To prevent that much wasted memory, this patch follows David's
> suggestion [1] and uses general buffer arrays, instead of skbs as the
> elements in RX ring.
> 
> [1] https://www.spinics.net/lists/linux-wireless/msg187870.html
> 
> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> Cc: <stable@vger.kernel.org>

This does not fix any serious problem, it actually most likely 
introduce memory corruption problem described above. Should not
be targeted to stable anyway.

> -			dev_kfree_skb_any(skb);
> +			devm_kfree(rtwdev->dev, buf);

For what this is needed? devm_ allocations are used exactly to avoid
manual freeing.

> +		len = pkt_stat.pkt_len + pkt_offset;
> +		skb = dev_alloc_skb(len);
> +		if (WARN_ONCE(!skb, "rx routine starvation\n"))
>  			goto next_rp;
>  
>  		/* put the DMA data including rx_desc from phy to new skb */
> -		skb_put_data(new, skb->data, new_len);
> +		skb_put_data(skb, rx_desc, len);

Coping big packets it quite inefficient. What drivers usually do is 
copy only for small packets and for big ones allocate new rx buf
(drop packet alloc if fail) and pas old buf to network stack via
skb_add_rx_frag(). See iwlmvm as example.

Stanislaw

^ permalink raw reply

* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Stefano Garzarella @ 2019-07-30  9:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190729143622-mutt-send-email-mst@kernel.org>

On Mon, Jul 29, 2019 at 03:10:15PM -0400, Michael S. Tsirkin wrote:
> On Mon, Jul 29, 2019 at 06:50:56PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > > with a fixed size (4 KB).
> > > > > > > 
> > > > > > > The maximum amount of memory used by each socket should be
> > > > > > > controlled by the credit mechanism.
> > > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > > to avoid starvation of other sockets.
> > > > > > > 
> > > > > > > This patch mitigates this issue copying the payload of small
> > > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > > order to avoid wasting memory.
> > > > > > > 
> > > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > > 
> > > > > > This is good enough for net-next, but for net I think we
> > > > > > should figure out how to address the issue completely.
> > > > > > Can we make the accounting precise? What happens to
> > > > > > performance if we do?
> > > > > > 
> > > > > 
> > > > > In order to do more precise accounting maybe we can use the buffer size,
> > > > > instead of payload size when we update the credit available.
> > > > > In this way, the credit available for each socket will reflect the memory
> > > > > actually used.
> > > > > 
> > > > > I should check better, because I'm not sure what happen if the peer sees
> > > > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > > > buffer).
> > > > > 
> > > > > The other option is to copy each packet in a new buffer like I did in
> > > > > the v2 [2], but this forces us to make a copy for each packet that does
> > > > > not fill the entire buffer, perhaps too expensive.
> > > > > 
> > > > > [2] https://patchwork.kernel.org/patch/10938741/
> > > > > 
> > > > > 
> > > > > Thanks,
> > > > > Stefano
> > > > 
> > > > Interesting. You are right, and at some level the protocol forces copies.
> > > > 
> > > > We could try to detect that the actual memory is getting close to
> > > > admin limits and force copies on queued packets after the fact.
> > > > Is that practical?
> > > 
> > > Yes, I think it is doable!
> > > We can decrease the credit available with the buffer size queued, and
> > > when the buffer size of packet to queue is bigger than the credit
> > > available, we can copy it.
> > > 
> > > > 
> > > > And yes we can extend the credit accounting to include buffer size.
> > > > That's a protocol change but maybe it makes sense.
> > > 
> > > Since we send to the other peer the credit available, maybe this
> > > change can be backwards compatible (I'll check better this).
> > 
> > What I said was wrong.
> > 
> > We send a counter (increased when the user consumes the packets) and the
> > "buf_alloc" (the max memory allowed) to the other peer.
> > It makes a difference between a local counter (increased when the
> > packets are sent) and the remote counter to calculate the credit available:
> > 
> >     u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
> >     {
> >     	u32 ret;
> > 
> >     	spin_lock_bh(&vvs->tx_lock);
> >     	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
> >     	if (ret > credit)
> >     		ret = credit;
> >     	vvs->tx_cnt += ret;
> >     	spin_unlock_bh(&vvs->tx_lock);
> > 
> >     	return ret;
> >     }
> > 
> > Maybe I can play with "buf_alloc" to take care of bytes queued but not
> > used.
> > 
> > Thanks,
> > Stefano
> 
> Right. And the idea behind it all was that if we send a credit
> to remote then we have space for it.

Yes.

> I think the basic idea was that if we have actual allocated
> memory and can copy data there, then we send the credit to
> remote.
> 
> Of course that means an extra copy every packet.
> So as an optimization, it seems that we just assume
> that we will be able to allocate a new buffer.

Yes, we refill the virtqueue when half of the buffers were used.

> 
> First this is not the best we can do. We can actually do
> allocate memory in the socket before sending credit.

In this case, IIUC we should allocate an entire buffer (4KB),
so we can reuse it if the packet is big.

> If packet is small then we copy it there.
> If packet is big then we queue the packet,
> take the buffer out of socket and add it to the virtqueue.
> 
> Second question is what to do about medium sized packets.
> Packet is 1K but buffer is 4K, what do we do?
> And here I wonder - why don't we add the 3K buffer
> to the vq?

This would allow us to have an accurate credit account.

The problem here is the compatibility. Before this series virtio-vsock
and vhost-vsock modules had the RX buffer size hard-coded
(VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
of 4K, there might be issues.

Maybe it is the time to add add 'features' to virtio-vsock device.

Thanks,
Stefano

^ permalink raw reply

* Re: [PATCH 0/5] staging: fsl-dpaa2/ethsw: add the .ndo_fdb_dump callback
From: Ioana Ciornei @ 2019-07-30  9:13 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: gregkh@linuxfoundation.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, davem@davemloft.net, f.fainelli@gmail.com,
	jiri@mellanox.com
In-Reply-To: <20190729163506.GJ4110@lunn.ch>

On 7/29/19 7:35 PM, Andrew Lunn wrote:
> On Mon, Jul 29, 2019 at 07:11:47PM +0300, Ioana Ciornei wrote:
>> This patch set adds some features and small fixes in the
>> FDB table manipulation area.
>>
>> First of all, we implement the .ndo_fdb_dump netdev callback so that all
>> offloaded FDB entries, either static or learnt, are available to the user.
>> This is necessary because the DPAA2 switch does not emit interrupts when a
>> new FDB is learnt or deleted, thus we are not able to keep the software
>> bridge state and the HW in sync by calling the switchdev notifiers.
>>
>> The patch set also adds the .ndo_fdb_[add|del] callbacks in order to
>> facilitate adding FDB entries not associated with any master device.
>>
>> One interesting thing that I observed is that when adding an FDB entry
>> associated with a bridge (ie using the 'master' keywork appended to the
>> bridge command) and then dumping the FDB entries, there will be duplicates
>> of the same entry: one listed by the bridge device and one by the
>> driver's .ndo_fdb_dump).
>> It raises the question whether this is the expected behavior or not.
> 
> DSA devices are the same, they don't provide an interrupt when a new
> entry is added by the hardware. So we can have two entries, or just
> the SW bridge entry, or just the HW entry, depending on ageing.
>

This also happens when dealing with static entries (not just dynamic 
ones that can be affected by ageing). All in all, the basic actions of 
adding/deleting entries and then dumping them works. It was just a 
question about switchdev's architecture.


>> Another concern is regarding the correct/desired machanism for drivers to
>> signal errors back to switchdev on adding or deleting an FDB entry.
>> In the switchdev documentation, there is a TODO in the place of this topic.
> 
> It used to be a two state prepare/commit transaction, but that was
> changed a while back.
> 
> Maybe the DSA core code can give you ideas?
>

I looked in the DSA core before sending these patches out and it's doing 
the exact same thing as ethsw - even though it notifies switchdev if the 
entry could be offloaded (ie no error) all entries will still be present 
in the 'bridge fdb' output. In the SWITCHDEV_FDB_DEL_TO_DEVICE case, it 
seems that it just closes the netdev without any further action.

On the other hand, the mlxsw_spectrum also calls the notifiers when an 
offloaded entry is deleted (on SWITCHDEV_FDB_DEL_TO_DEVICE). This seems 
like a reasonable thing to do, maybe in another patch set.

Ioana C


^ permalink raw reply

* RE: [PATCH] net/socket: fix GCC8+ Wpacked-not-aligned warnings
From: David Laight @ 2019-07-30  9:01 UTC (permalink / raw)
  To: 'Qian Cai', davem@davemloft.net
  Cc: vyasevich@gmail.com, nhorman@tuxdriver.com,
	marcelo.leitner@gmail.com, linux-sctp@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1564431838-23051-1-git-send-email-cai@lca.pw>

From: Qian Cai
> Sent: 29 July 2019 21:24
..
> To fix this, "struct sockaddr_storage" needs to be aligned to 4-byte as
> it is only used in those packed sctp structure which is part of UAPI,
> and "struct __kernel_sockaddr_storage" is used in some other
> places of UAPI that need not to change alignments in order to not
> breaking userspace.
> 
> One option is use typedef between "sockaddr_storage" and
> "__kernel_sockaddr_storage" but it needs to change 35 or 370 lines of
> codes. The other option is to duplicate this simple 2-field structure to
> have a separate "struct sockaddr_storage" so it can use a different
> alignment than "__kernel_sockaddr_storage". Also the structure seems
> stable enough, so it will be low-maintenance to update two structures in
> the future in case of changes.

Does it all work if the 8 byte alignment is implicit, not explicit?
For instance if unnamed union and struct are used eg:

struct sockaddr_storage {
	union {
		void * __ptr;  /* Force alignment */
		struct {
			__kernel_sa_family_t	ss_family;		/* address family */
			/* Following field(s) are implementation specific */
			char	__data[_K_SS_MAXSIZE - sizeof(unsigned short)];
					/* space to achieve desired size, */
					/* _SS_MAXSIZE value minus size of ss_family */
		};
	};
};

I suspect unnamed unions and structs have to be allowed by the compiler.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* [patch iproute2-next v2 2/2] devlink: add support for network namespace change
From: Jiri Pirko @ 2019-07-30  8:59 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 devlink/devlink.c            | 54 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/devlink.h |  4 +++
 man/man8/devlink-dev.8       | 12 ++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 9242cc05ad0c..a39bd8771d8b 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -235,6 +235,7 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_HEALTH_REPORTER_NAME	BIT(27)
 #define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(27)
 #define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(28)
+#define DL_OPT_NETNS	BIT(29)
 
 struct dl_opts {
 	uint32_t present; /* flags of present items */
@@ -271,6 +272,8 @@ struct dl_opts {
 	const char *reporter_name;
 	uint64_t reporter_graceful_period;
 	bool reporter_auto_recover;
+	bool netns_is_pid;
+	uint32_t netns;
 };
 
 struct dl {
@@ -1331,6 +1334,22 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 			if (err)
 				return err;
 			o_found |= DL_OPT_HEALTH_REPORTER_AUTO_RECOVER;
+		} else if (dl_argv_match(dl, "netns") &&
+			(o_all & DL_OPT_NETNS)) {
+			const char *netns_str;
+
+			dl_arg_inc(dl);
+			err = dl_argv_str(dl, &netns_str);
+			if (err)
+				return err;
+			opts->netns = netns_get_fd(netns_str);
+			if (opts->netns < 0) {
+				err = dl_argv_uint32_t(dl, &opts->netns);
+				if (err)
+					return err;
+				opts->netns_is_pid = true;
+			}
+			o_found |= DL_OPT_NETNS;
 		} else {
 			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
 			return -EINVAL;
@@ -1444,7 +1463,11 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
 	if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)
 		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
 				opts->reporter_auto_recover);
-
+	if (opts->present & DL_OPT_NETNS)
+		mnl_attr_put_u32(nlh,
+				 opts->netns_is_pid ? DEVLINK_ATTR_NETNS_PID :
+						      DEVLINK_ATTR_NETNS_FD,
+				 opts->netns);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -1499,6 +1522,7 @@ static bool dl_dump_filter(struct dl *dl, struct nlattr **tb)
 static void cmd_dev_help(void)
 {
 	pr_err("Usage: devlink dev show [ DEV ]\n");
+	pr_err("       devlink dev set DEV netns { PID | NAME | ID }\n");
 	pr_err("       devlink dev eswitch set DEV [ mode { legacy | switchdev } ]\n");
 	pr_err("                               [ inline-mode { none | link | network | transport } ]\n");
 	pr_err("                               [ encap { disable | enable } ]\n");
@@ -2551,6 +2575,31 @@ static int cmd_dev_show(struct dl *dl)
 	return err;
 }
 
+static void cmd_dev_set_help(void)
+{
+	pr_err("Usage: devlink dev set DEV netns { PID | NAME | ID }\n");
+}
+
+static int cmd_dev_set(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	if (dl_argv_match(dl, "help") || dl_no_arg(dl)) {
+		cmd_dev_set_help();
+		return 0;
+	}
+
+	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_SET,
+			       NLM_F_REQUEST | NLM_F_ACK);
+
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE, DL_OPT_NETNS);
+	if (err)
+		return err;
+
+	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+}
+
 static void cmd_dev_reload_help(void)
 {
 	pr_err("Usage: devlink dev reload [ DEV ]\n");
@@ -2747,6 +2796,9 @@ static int cmd_dev(struct dl *dl)
 		   dl_argv_match(dl, "list") || dl_no_arg(dl)) {
 		dl_arg_inc(dl);
 		return cmd_dev_show(dl);
+	} else if (dl_argv_match(dl, "set")) {
+		dl_arg_inc(dl);
+		return cmd_dev_set(dl);
 	} else if (dl_argv_match(dl, "eswitch")) {
 		dl_arg_inc(dl);
 		return cmd_dev_eswitch(dl);
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index fc195cbd66f4..bc1869993e20 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -348,6 +348,10 @@ enum devlink_attr {
 	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
 	DEVLINK_ATTR_PORT_PCI_VF_NUMBER,	/* u16 */
 
+	DEVLINK_ATTR_NETNS_FD,			/* u32 */
+	DEVLINK_ATTR_NETNS_PID,			/* u32 */
+	DEVLINK_ATTR_NETNS_ID,			/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8
index 1804463b2321..0e1a5523fa7b 100644
--- a/man/man8/devlink-dev.8
+++ b/man/man8/devlink-dev.8
@@ -25,6 +25,13 @@ devlink-dev \- devlink device configuration
 .ti -8
 .B devlink dev help
 
+.ti -8
+.BR "devlink dev set"
+.IR DEV
+.RI "[ "
+.BI "netns { " PID " | " NAME " | " ID " }
+.RI "]"
+
 .ti -8
 .BR "devlink dev eswitch set"
 .IR DEV
@@ -92,6 +99,11 @@ Format is:
 .in +2
 BUS_NAME/BUS_ADDRESS
 
+.SS devlink dev set  - sets devlink device attributes
+
+.TP
+.BI "netns { " PID " | " NAME " | " ID " }
+
 .SS devlink dev eswitch show - display devlink device eswitch attributes
 .SS devlink dev eswitch set  - sets devlink device eswitch attributes
 
-- 
2.21.0


^ permalink raw reply related

* [patch iproute2-next v2 1/2] devlink: introduce cmdline option to switch to a different namespace
From: Jiri Pirko @ 2019-07-30  8:59 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 devlink/devlink.c  | 12 ++++++++++--
 man/man8/devlink.8 |  4 ++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index d8197ea3a478..9242cc05ad0c 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -32,6 +32,7 @@
 #include "mnlg.h"
 #include "json_writer.h"
 #include "utils.h"
+#include "namespace.h"
 
 #define ESWITCH_MODE_LEGACY "legacy"
 #define ESWITCH_MODE_SWITCHDEV "switchdev"
@@ -6332,7 +6333,7 @@ static int cmd_health(struct dl *dl)
 static void help(void)
 {
 	pr_err("Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }\n"
-	       "       devlink [ -f[orce] ] -b[atch] filename\n"
+	       "       devlink [ -f[orce] ] -b[atch] filename -N[etns] netnsname\n"
 	       "where  OBJECT := { dev | port | sb | monitor | dpipe | resource | region | health }\n"
 	       "       OPTIONS := { -V[ersion] | -n[o-nice-names] | -j[son] | -p[retty] | -v[erbose] }\n");
 }
@@ -6478,6 +6479,7 @@ int main(int argc, char **argv)
 		{ "json",		no_argument,		NULL, 'j' },
 		{ "pretty",		no_argument,		NULL, 'p' },
 		{ "verbose",		no_argument,		NULL, 'v' },
+		{ "Netns",		required_argument,	NULL, 'N' },
 		{ NULL, 0, NULL, 0 }
 	};
 	const char *batch_file = NULL;
@@ -6493,7 +6495,7 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
-	while ((opt = getopt_long(argc, argv, "Vfb:njpv",
+	while ((opt = getopt_long(argc, argv, "Vfb:njpvN:",
 				  long_options, NULL)) >= 0) {
 
 		switch (opt) {
@@ -6519,6 +6521,12 @@ int main(int argc, char **argv)
 		case 'v':
 			dl->verbose = true;
 			break;
+		case 'N':
+			if (netns_switch(optarg)) {
+				ret = EXIT_FAILURE;
+				goto dl_free;
+			}
+			break;
 		default:
 			pr_err("Unknown option.\n");
 			help();
diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
index 13d4dcd908b3..9fc9b034eefe 100644
--- a/man/man8/devlink.8
+++ b/man/man8/devlink.8
@@ -51,6 +51,10 @@ When combined with -j generate a pretty JSON output.
 .BR "\-v" , " --verbose"
 Turn on verbose output.
 
+.TP
+.BR "\-N", " \-Netns " <NETNSNAME>
+Switches to the specified network namespace.
+
 .SS
 .I OBJECT
 
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Nikolay Aleksandrov @ 2019-07-30  8:58 UTC (permalink / raw)
  To: Allan W. Nielsen, Ido Schimmel
  Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <20190730083027.biuzy7h5dbq7pik3@lx-anielsen.microsemi.net>

On 30/07/2019 11:30, Allan W. Nielsen wrote:
> The 07/30/2019 10:06, Ido Schimmel wrote:
>> On Tue, Jul 30, 2019 at 08:27:22AM +0200, Allan W. Nielsen wrote:
>>> The 07/29/2019 20:51, Ido Schimmel wrote:
>>>> Can you please clarify what you're trying to achieve? I just read the
>>>> thread again and my impression is that you're trying to locally receive
>>>> packets with a certain link layer multicast address.
>>> Yes. The thread is also a bit confusing because we half way through realized
>>> that we misunderstood how the multicast packets should be handled (sorry about
>>> that). To begin with we had a driver where multicast packets was only copied to
>>> the CPU if someone needed it. Andrew and Nikolay made us aware that this is not
>>> how other drivers are doing it, so we changed the driver to include the CPU in
>>> the default multicast flood-mask.
>> OK, so what prevents you from removing all other ports from the
>> flood-mask and letting the CPU handle the flooding? Then you can install
>> software tc filters to limit the flooding.
> I do not have the bandwidth to forward the multicast traffic in the CPU.
> 
> It will also cause enormous latency on the forwarding of L2 multicast packets.
> 
>>> This changes the objective a bit. To begin with we needed to get more packets to
>>> the CPU (which could have been done using tc ingress rules and a trap action).
>>>
>>> Now after we changed the driver, we realized that we need something to limit the
>>> flooding of certain L2 multicast packets. This is the new problem we are trying
>>> to solve!
>>>
>>> Example: Say we have a bridge with 4 slave interfaces, then we want to install a
>>> forwarding rule saying that packets to a given L2-multicast MAC address, should
>>> only be flooded to 2 of the 4 ports.
>>>
>>> (instead of adding rules to get certain packets to the CPU, we are now adding
>>> other rules to prevent other packets from going to the CPU and other ports where
>>> they are not needed/wanted).
>>>
>>> This is exactly the same thing as IGMP snooping does dynamically, but only for
>>> IP multicast.
>>>
>>> The "bridge mdb" allow users to manually/static add/del a port to a multicast
>>> group, but still it operates on IP multicast address (not L2 multicast
>>> addresses).
>>>
>>>> Nik suggested SIOCADDMULTI.
>>> It is not clear to me how this should be used to limit the flooding, maybe we
>>> can make some hacks, but as far as I understand the intend of this is maintain
>>> the list of addresses an interface should receive. I'm not sure this should
>>> influence how for forwarding decisions are being made.
>>>
>>>> and I suggested a tc filter to get the packet to the CPU.
>>> The TC solution is a good solution to the original problem where wanted to copy
>>> more frames to the CPU. But we were convinced that this is not the right
>>> approach, and that the CPU by default should receive all multicast packets, and
>>> we should instead try to find a way to limit the flooding of certain frames as
>>> an optimization.
>>
>> This can still work. In Linux, ingress tc filters are executed before the
>> bridge's Rx handler. The same happens in every sane HW. Ingress ACL is
>> performed before L2 forwarding. Assuming you have eth0-eth3 bridged and
>> you want to prevent packets with DMAC 01:21:6C:00:00:01 from egressing
>> eth2:
>>
>> # tc filter add dev eth0 ingress pref 1 flower skip_sw \
>> 	dst_mac 01:21:6C:00:00:01 action trap
>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
>> 	dst_mac 01:21:6C:00:00:01 action drop
>>
>> The first filter is only present in HW ('skip_sw') and should result in
>> your HW passing you the sole copy of the packet.
> Agree.
> 
>> The second filter is only present in SW ('skip_hw', not using HW egress
>> ACL that you don't have) and drops the packet after it was flooded by
>> the SW bridge.
> Agree.
> 
>> As I mentioned earlier, you can install the filter once in your HW and
>> share it between different ports using a shared block. This means you
>> only consume one TCAM entry.
>>
>> Note that this allows you to keep flooding all other multicast packets
>> in HW.
> Yes, but the frames we want to limit the flood-mask on are the exact frames
> which occurs at a very high rate, and where latency is important.
> 
> I really do not consider it as an option to forward this in SW, when it is
> something that can easily be offloaded in HW.
> 
>>>> If you now want to limit the ports to which this packet is flooded, then
>>>> you can use tc filters in *software*:
>>>>
>>>> # tc qdisc add dev eth2 clsact
>>>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
>>>> 	dst_mac 01:21:6C:00:00:01 action drop
>>> Yes. This can work in the SW bridge.
>>>
>>>> If you want to forward the packet in hardware and locally receive it,
>>>> you can chain several mirred action and then a trap action.
>>> I'm not I fully understand how this should be done, but it does sound like it
>>> becomes quite complicated. Also, as far as I understand it will mean that we
>>> will be using TCAM/ACL resources to do something that could have been done with
>>> a simple MAC entry.
>>>
>>>> Both options avoid HW egress ACLs which your design does not support.
>>> True, but what is wrong with expanding the functionality of the normal
>>> forwarding/MAC operations to allow multiple destinations?
>>>
>>> It is not an uncommon feature (I just browsed the manual of some common L2
>>> switches and they all has this feature).
>>>
>>> It seems to fit nicely into the existing user-interface:
>>>
>>> bridge fdb add    01:21:6C:00:00:01 port eth0
>>> bridge fdb append 01:21:6C:00:00:01 port eth1
>>
>> Wouldn't it be better to instead extend the MDB entries so that they are
>> either keyed by IP or MAC? I believe FDB should remain as unicast-only.
> 
> You might be right, it was not clear to me which of the two would fit the
> purpose best.
> 
> From a user-space iproute2 perspective I prefer using the "bridge fdb" command
> as it already supports the needed syntax, and I do not think it will be too
> pretty if we squeeze this into the "bridge mdb" command syntax.
> 

MDB is a much better fit as Ido already suggested. FDB should remain unicast
and mixing them is not a good idea, we already have a good ucast/mcast separation
and we'd like to keep it that way.

> But that does not mean that it need to go into the FDB database in the
> implementation.
> 
> Last evening when I looked at it again, I was considering keeping the
> net_bridge_fdb_entry structure as is, and add a new hashtable with the
> following:
> 
> struct net_bridge_fdbmc_entry {
> 	struct rhash_head		rhnode;
> 	struct net_bridge_fdbmc_ports   *dst;
> 
> 	struct net_bridge_fdb_key	key;
> 	struct hlist_node		fdb_node;
> 	unsigned char			offloaded:1;
> 
> 	struct rcu_head			rcu;
> };
> 

What would the notification for this look like ?

> If we go with this approach then we can look at the MAC address and see if it is
> a unicast which will cause a lookup in the fdb, l3-multicast (33:33:* or
> 01:00:5e:*) which will cause a lookup in the mdb, or finally a fdbmc which will
> need to do a lookup in this new hashtable.

That sounds wrong, you will change the current default behaviour of flooding these
packets. This will have to be well hidden behind a new option and enabled only on user
request.

> 
> Alternative it would be like this:
> 
> struct net_bridge_fdb_entry {
> 	struct rhash_head		rhnode;
> 	union net_bridge_port_or_list	*dst;
> 
> 	struct net_bridge_fdb_key	key;
> 	struct hlist_node		fdb_node;
> 	unsigned char			is_local:1,
> 					is_static:1,
> 					is_sticky:1,
> 					added_by_user:1,
> 					added_by_external_learn:1,
> 					offloaded:1;
> 					multi_dst:1;
> 
> 	/* write-heavy members should not affect lookups */
> 	unsigned long			updated ____cacheline_aligned_in_smp;
> 	unsigned long			used;
> 
> 	struct rcu_head			rcu;
> };
> 
> Both solutions should require fairly few changes, and should not cause any
> measurable performance hit.
> 

You'll have to convert these bits to use the proper atomic bitops if you go with
the second solution. That has to be done even today, but the second case would
make it a must.

> Making it fit into the net_bridge_mdb_entry seems to be harder.
> 

But it is the correct abstraction from bridge POV, so please stop trying to change
the FDB code and try to keep to the multicast code.

>> As a bonus, existing drivers could benefit from it, as MDB entries are already
>> notified by MAC.
> Not sure I follow. When FDB entries are added, it also generates notification
> events.
> 

Could you please show fdb event with multiple ports ?

>>> It seems that it can be added to the existing implementation with out adding
>>> significant complexity.
>>>
>>> It will be easy to offload in HW.
>>>
>>> I do not believe that it will be a performance issue, if this is a concern then
>>> we may have to do a bit of benchmarking, or we can make it a configuration
>>> option.
>>>
>>> Long story short, we (Horatiu and I) learned a lot from the discussion here, and
>>> I think we should try do a new patch with the learning we got. Then it is easier
>>> to see what it actually means to the exiting code, complexity, exiting drivers,
>>> performance, default behavioral, backwards compatibly, and other valid concerns.
>>>
>>> If the patch is no good, and cannot be fixed, then we will go back and look
>>> further into alternative solutions.
>> Overall, I tend to agree with Nik. I think your use case is too specific
>> to justify the amount of changes you want to make in the bridge driver.
>> We also provided other alternatives. That being said, you're more than
>> welcome to send the patches and we can continue the discussion then.
> Okay, good to know. I'm not sure I agree that the alternative solutions really
> solves the issue this is trying to solve, nor do I agree that this is specific
> to our needs.
> 
> But lets take a look at a new patch, and see what is the amount of changes we
> are talking about. Without having the patch it is really hard to know for sure.
> 

Please keep in mind that this case is the exception, not the norm, thus it should
not under any circumstance affect the standard deployments.

^ permalink raw reply

* [patch net-next v2 3/3] netdevsim: create devlink and netdev instances in namespace
From: Jiri Pirko @ 2019-07-30  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

When user does create new netdevsim instance using sysfs bus file,
create the devlink instance and related netdev instance in the namespace
of the caller.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- remove net_namespace.h include and forward decralared net struct
- add comment to initial_net pointer
---
 drivers/net/netdevsim/bus.c       |  1 +
 drivers/net/netdevsim/dev.c       | 17 +++++++++++------
 drivers/net/netdevsim/netdev.c    |  4 +++-
 drivers/net/netdevsim/netdevsim.h |  8 +++++++-
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/netdevsim/bus.c b/drivers/net/netdevsim/bus.c
index 1a0ff3d7747b..6aeed0c600f8 100644
--- a/drivers/net/netdevsim/bus.c
+++ b/drivers/net/netdevsim/bus.c
@@ -283,6 +283,7 @@ nsim_bus_dev_new(unsigned int id, unsigned int port_count)
 	nsim_bus_dev->dev.bus = &nsim_bus;
 	nsim_bus_dev->dev.type = &nsim_bus_dev_type;
 	nsim_bus_dev->port_count = port_count;
+	nsim_bus_dev->initial_net = current->nsproxy->net_ns;
 
 	err = device_register(&nsim_bus_dev->dev);
 	if (err)
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index c5c417a3c0ce..685dd21f5500 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -268,7 +268,8 @@ static const struct devlink_ops nsim_dev_devlink_ops = {
 };
 
 static struct nsim_dev *
-nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
+nsim_dev_create(struct net *net, struct nsim_bus_dev *nsim_bus_dev,
+		unsigned int port_count)
 {
 	struct nsim_dev *nsim_dev;
 	struct devlink *devlink;
@@ -277,6 +278,7 @@ nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
 	devlink = devlink_alloc(&nsim_dev_devlink_ops, sizeof(*nsim_dev));
 	if (!devlink)
 		return ERR_PTR(-ENOMEM);
+	devlink_net_set(devlink, net);
 	nsim_dev = devlink_priv(devlink);
 	nsim_dev->nsim_bus_dev = nsim_bus_dev;
 	nsim_dev->switch_id.id_len = sizeof(nsim_dev->switch_id.id);
@@ -335,7 +337,7 @@ static void nsim_dev_destroy(struct nsim_dev *nsim_dev)
 	devlink_free(devlink);
 }
 
-static int __nsim_dev_port_add(struct nsim_dev *nsim_dev,
+static int __nsim_dev_port_add(struct net *net, struct nsim_dev *nsim_dev,
 			       unsigned int port_index)
 {
 	struct nsim_dev_port *nsim_dev_port;
@@ -361,7 +363,7 @@ static int __nsim_dev_port_add(struct nsim_dev *nsim_dev,
 	if (err)
 		goto err_dl_port_unregister;
 
-	nsim_dev_port->ns = nsim_create(nsim_dev, nsim_dev_port);
+	nsim_dev_port->ns = nsim_create(net, nsim_dev, nsim_dev_port);
 	if (IS_ERR(nsim_dev_port->ns)) {
 		err = PTR_ERR(nsim_dev_port->ns);
 		goto err_port_debugfs_exit;
@@ -404,17 +406,19 @@ static void nsim_dev_port_del_all(struct nsim_dev *nsim_dev)
 
 int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 {
+	struct net *initial_net = nsim_bus_dev->initial_net;
 	struct nsim_dev *nsim_dev;
 	int i;
 	int err;
 
-	nsim_dev = nsim_dev_create(nsim_bus_dev, nsim_bus_dev->port_count);
+	nsim_dev = nsim_dev_create(initial_net, nsim_bus_dev,
+				   nsim_bus_dev->port_count);
 	if (IS_ERR(nsim_dev))
 		return PTR_ERR(nsim_dev);
 	dev_set_drvdata(&nsim_bus_dev->dev, nsim_dev);
 
 	for (i = 0; i < nsim_bus_dev->port_count; i++) {
-		err = __nsim_dev_port_add(nsim_dev, i);
+		err = __nsim_dev_port_add(initial_net, nsim_dev, i);
 		if (err)
 			goto err_port_del_all;
 	}
@@ -449,13 +453,14 @@ int nsim_dev_port_add(struct nsim_bus_dev *nsim_bus_dev,
 		      unsigned int port_index)
 {
 	struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev);
+	struct net *net = devlink_net(priv_to_devlink(nsim_dev));
 	int err;
 
 	mutex_lock(&nsim_dev->port_list_lock);
 	if (__nsim_dev_port_lookup(nsim_dev, port_index))
 		err = -EEXIST;
 	else
-		err = __nsim_dev_port_add(nsim_dev, port_index);
+		err = __nsim_dev_port_add(net, nsim_dev, port_index);
 	mutex_unlock(&nsim_dev->port_list_lock);
 	return err;
 }
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 0740940f41b1..25c7de7a4a31 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -280,7 +280,8 @@ static void nsim_setup(struct net_device *dev)
 }
 
 struct netdevsim *
-nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
+nsim_create(struct net *net, struct nsim_dev *nsim_dev,
+	    struct nsim_dev_port *nsim_dev_port)
 {
 	struct net_device *dev;
 	struct netdevsim *ns;
@@ -290,6 +291,7 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	dev_net_set(dev, net);
 	ns = netdev_priv(dev);
 	ns->netdev = dev;
 	ns->nsim_dev = nsim_dev;
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 79c05af2a7c0..9563acb61b03 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -74,8 +74,11 @@ struct netdevsim {
 	struct nsim_ipsec ipsec;
 };
 
+struct net;
+
 struct netdevsim *
-nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port);
+nsim_create(struct net *net, struct nsim_dev *nsim_dev,
+	    struct nsim_dev_port *nsim_dev_port);
 void nsim_destroy(struct netdevsim *ns);
 
 #ifdef CONFIG_BPF_SYSCALL
@@ -213,6 +216,9 @@ struct nsim_bus_dev {
 	struct device dev;
 	struct list_head list;
 	unsigned int port_count;
+	struct net *initial_net; /* Purpose of this is to carry net pointer
+				  * during the probe time only.
+				  */
 	unsigned int num_vfs;
 	struct nsim_vf_config *vfconfigs;
 };
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 1/3] net: devlink: allow to change namespaces
From: Jiri Pirko @ 2019-07-30  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

All devlink instances are created in init_net and stay there for a
lifetime. Allow user to be able to move devlink instances into
namespaces.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- change the check for multiple attributes
- add warnon in case there is no attribute passed
---
 include/uapi/linux/devlink.h |   4 ++
 net/core/devlink.c           | 113 ++++++++++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index ffc993256527..95f0a1edab99 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -348,6 +348,10 @@ enum devlink_attr {
 	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
 	DEVLINK_ATTR_PORT_PCI_VF_NUMBER,	/* u16 */
 
+	DEVLINK_ATTR_NETNS_FD,			/* u32 */
+	DEVLINK_ATTR_NETNS_PID,			/* u32 */
+	DEVLINK_ATTR_NETNS_ID,			/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4f40aeace902..e1cbfd90f788 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -439,8 +439,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
 {
 	struct devlink *devlink;
 
-	devlink = devlink_get_from_info(info);
-	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+	/* When devlink changes netns, it would not be found
+	 * by devlink_get_from_info(). So try if it is stored first.
+	 */
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+		devlink = info->user_ptr[0];
+	} else {
+		devlink = devlink_get_from_info(info);
+		WARN_ON(IS_ERR(devlink));
+	}
+	if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
 		mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
@@ -645,6 +653,71 @@ static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
 	return genlmsg_reply(msg, info);
 }
 
+static struct net *devlink_netns_get(struct sk_buff *skb,
+				     struct devlink *devlink,
+				     struct genl_info *info)
+{
+	struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+	struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+	struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+	struct net *net;
+
+	if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
+		NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (netns_pid_attr) {
+		net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
+	} else if (netns_fd_attr) {
+		net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
+	} else if (netns_id_attr) {
+		net = get_net_ns_by_id(sock_net(skb->sk),
+				       nla_get_u32(netns_id_attr));
+		if (!net)
+			net = ERR_PTR(-EINVAL);
+	} else {
+		WARN_ON(1);
+		net = ERR_PTR(-EINVAL);
+	}
+	if (IS_ERR(net)) {
+		NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+		return ERR_PTR(-EINVAL);
+	}
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+		put_net(net);
+		return ERR_PTR(-EPERM);
+	}
+	return net;
+}
+
+static void devlink_netns_change(struct devlink *devlink, struct net *net)
+{
+	if (net_eq(devlink_net(devlink), net))
+		return;
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+	devlink_net_set(devlink, net);
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static int devlink_nl_cmd_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+		struct net *net;
+
+		net = devlink_netns_get(skb, devlink, info);
+		if (IS_ERR(net))
+			return PTR_ERR(net);
+		devlink_netns_change(devlink, net);
+		put_net(net);
+	}
+	return 0;
+}
+
 static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
 				     struct netlink_callback *cb)
 {
@@ -5184,6 +5257,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -5195,6 +5271,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 	{
 		.cmd = DEVLINK_CMD_PORT_GET,
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -6955,9 +7038,33 @@ int devlink_compat_switch_id_get(struct net_device *dev,
 	return 0;
 }
 
+static void __net_exit devlink_pernet_exit(struct net *net)
+{
+	struct devlink *devlink;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list)
+		if (net_eq(devlink_net(devlink), net))
+			devlink_netns_change(devlink, &init_net);
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct pernet_operations __net_initdata devlink_pernet_ops = {
+	.exit = devlink_pernet_exit,
+};
+
 static int __init devlink_init(void)
 {
-	return genl_register_family(&devlink_nl_family);
+	int err;
+
+	err = genl_register_family(&devlink_nl_family);
+	if (err)
+		goto out;
+	err = register_pernet_device(&devlink_pernet_ops);
+
+out:
+	WARN_ON(err);
+	return err;
 }
 
 subsys_initcall(devlink_init);
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 2/3] net: devlink: export devlink net set/get helpers
From: Jiri Pirko @ 2019-07-30  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Allow drivers to set/get net struct for devlink instance. Set is only
allowed for newly allocated devlink instance.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h |  3 +++
 net/core/devlink.c    | 18 ++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index bc36f942a7d5..98b89eabd73a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -35,6 +35,7 @@ struct devlink {
 	struct device *dev;
 	possible_net_t _net;
 	struct mutex lock;
+	bool registered;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -591,6 +592,8 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 
 struct ib_device;
 
+struct net *devlink_net(const struct devlink *devlink);
+void devlink_net_set(struct devlink *devlink, struct net *net);
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index e1cbfd90f788..fc364bdb9cf2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,16 +92,25 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-static struct net *devlink_net(const struct devlink *devlink)
+struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
 }
+EXPORT_SYMBOL_GPL(devlink_net);
 
-static void devlink_net_set(struct devlink *devlink, struct net *net)
+static void __devlink_net_set(struct devlink *devlink, struct net *net)
 {
 	write_pnet(&devlink->_net, net);
 }
 
+void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	if (WARN_ON(devlink->registered))
+		return;
+	__devlink_net_set(devlink, net);
+}
+EXPORT_SYMBOL_GPL(devlink_net_set);
+
 static struct devlink *devlink_get_from_attrs(struct net *net,
 					      struct nlattr **attrs)
 {
@@ -696,7 +705,7 @@ static void devlink_netns_change(struct devlink *devlink, struct net *net)
 	if (net_eq(devlink_net(devlink), net))
 		return;
 	devlink_notify(devlink, DEVLINK_CMD_DEL);
-	devlink_net_set(devlink, net);
+	__devlink_net_set(devlink, net);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
 }
 
@@ -5603,7 +5612,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	if (!devlink)
 		return NULL;
 	devlink->ops = ops;
-	devlink_net_set(devlink, &init_net);
+	__devlink_net_set(devlink, &init_net);
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
@@ -5627,6 +5636,7 @@ int devlink_register(struct devlink *devlink, struct device *dev)
 {
 	mutex_lock(&devlink_mutex);
 	devlink->dev = dev;
+	devlink->registered = true;
 	list_add_tail(&devlink->list, &devlink_list);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
 	mutex_unlock(&devlink_mutex);
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 0/3] net: devlink: Finish network namespace support
From: Jiri Pirko @ 2019-07-30  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

Devlink from the beginning counts with network namespaces, but the
instances has been fixed to init_net. The first patch allows user
to move existing devlink instances into namespaces:

$ devlink dev
netdevsim/netdevsim1
$ ip netns add ns1
$ devlink dev set netdevsim/netdevsim1 netns ns1
$ devlink -N ns1 dev
netdevsim/netdevsim1

The last patch allows user to create new netdevsim instance directly
inside network namespace of a caller.

Jiri Pirko (3):
  net: devlink: allow to change namespaces
  net: devlink: export devlink net set/get helpers
  netdevsim: create devlink and netdev instances in namespace

 drivers/net/netdevsim/bus.c       |   1 +
 drivers/net/netdevsim/dev.c       |  17 ++--
 drivers/net/netdevsim/netdev.c    |   4 +-
 drivers/net/netdevsim/netdevsim.h |   8 +-
 include/net/devlink.h             |   3 +
 include/uapi/linux/devlink.h      |   4 +
 net/core/devlink.c                | 129 ++++++++++++++++++++++++++++--
 7 files changed, 152 insertions(+), 14 deletions(-)

-- 
2.21.0


^ permalink raw reply

* Re: [PATCH v2 0/2] mmc: core: Fix Marvell WiFi reset by adding SDIO API to replug card
From: Andreas Fenkart @ 2019-07-30  8:46 UTC (permalink / raw)
  To: Douglas Anderson
  Cc: Ulf Hansson, Kalle Valo, Adrian Hunter, Ganapathi Bhat,
	linux-wireless, Brian Norris, Amitkumar Karwar, linux-rockchip,
	Wolfram Sang, Nishant Sarmukadam, netdev, Avri Altman, linux-mmc,
	David Miller, Xinming Hu, linux-kernel, Thomas Gleixner,
	Kate Stewart
In-Reply-To: <20190722193939.125578-1-dianders@chromium.org>

Hi Douglas,

Am Mo., 22. Juli 2019 um 21:41 Uhr schrieb Douglas Anderson
<dianders@chromium.org>:
>
> As talked about in the thread at:
>
> http://lkml.kernel.org/r/CAD=FV=X7P2F1k_zwHc0mbtfk55-rucTz_GoDH=PL6zWqKYcpuw@mail.gmail.com
>
> ...when the Marvell WiFi card tries to reset itself it kills
> Bluetooth.  It was observed that we could re-init the card properly by
> unbinding / rebinding the host controller.  It was also observed that
> in the downstream Chrome OS codebase the solution used was
> mmc_remove_host() / mmc_add_host(), which is similar to the solution
> in this series.
>
> So far I've only done testing of this series using the reset test
> source that can be simulated via sysfs.  Specifically I ran this test:
>
> for i in $(seq 1000); do
>   echo "LOOP $i --------"
>   echo 1 > /sys/kernel/debug/mwifiex/mlan0/reset
>
>   while true; do
>     if ! ping -w15 -c1 "${GW}" >/dev/null 2>&1; then
>       fail=$(( fail + 1 ))
>       echo "Fail WiFi ${fail}"
>       if [[ ${fail} == 3 ]]; then
>         exit 1
>       fi
>     else
>       fail=0
>       break
>     fi
>   done
>
>   hciconfig hci0 down
>   sleep 1
>   if ! hciconfig hci0 up; then
>     echo "Fail BT"
>     exit 1
>   fi
> done
>
> I ran this several times and got several hundred iterations each
> before a failure.  When I saw failures:
>
> * Once I saw a "Fail BT"; manually resetting the card again fixed it.
>   I didn't give it time to see if it would have detected this
>   automatically.
> * Once I saw the ping fail because (for some reason) my device only
>   got an IPv6 address from my router and the IPv4 ping failed.  I
>   changed my script to use 'ping6' to see if that would help.
> * Once I saw the ping fail because the higher level network stack
>   ("shill" in my case) seemed to crash.  A few minutes later the
>   system recovered itself automatically.  https://crbug.com/984593 if
>   you want more details.
> * Sometimes while I was testing I saw "Fail WiFi 1" indicating a
>   transitory failure.  Usually this was an association failure, but in
>   one case I saw the device do "Firmware wakeup failed" after I
>   triggered the reset.  This caused the driver to trigger a re-reset
>   of itself which eventually recovered things.  This was good because
>   it was an actual test of the normal reset flow (not the one
>   triggered via sysfs).

This error triggers something. I remember that when I was working on
suspend-to-ram feature, we had problems to wake up the firmware
reliable. I found this patch in one of my old 3.13 tree

    the missing bit -- ugly hack to force cmd52 before cmd53.
---
 drivers/mmc/host/omap_hsmmc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c
index fb24a006080f..710d0bdf39e5 100644
--- a/drivers/mmc/host/omap_hsmmc.c
+++ b/drivers/mmc/host/omap_hsmmc.c
@@ -2372,6 +2372,12 @@ static int omap_hsmmc_suspend(struct device *dev)
        if (host->flags & HSMMC_SWAKEUP_QUIRK)
                disable_irq(host->gpio_sdio_irq);

+       /*
+        * force a polling cycle after resume.
+        * will issue cmd52, not cmd53 straight away
+        */
+       omap_hsmmc_enable_sdio_irq(host->mmc, false);
+
        if (host->dbclk)
                clk_disable_unprepare(host->dbclk);

>
> Changes in v2:
> - s/routnine/routine (Brian Norris, Matthias Kaehlcke).
> - s/contining/containing (Matthias Kaehlcke).
> - Add Matthias Reviewed-by tag.
> - Removed clear_bit() calls and old comment (Brian Norris).
> - Explicit CC of Andreas Fenkart.
> - Explicit CC of Brian Norris.
> - Add "Fixes" pointing at the commit Brian talked about.
> - Add Brian's Reviewed-by tag.
>
> Douglas Anderson (2):
>   mmc: core: Add sdio_trigger_replug() API
>   mwifiex: Make use of the new sdio_trigger_replug() API to reset
>
>  drivers/mmc/core/core.c                     | 28 +++++++++++++++++++--
>  drivers/mmc/core/sdio_io.c                  | 20 +++++++++++++++
>  drivers/net/wireless/marvell/mwifiex/sdio.c | 16 +-----------
>  include/linux/mmc/host.h                    | 15 ++++++++++-
>  include/linux/mmc/sdio_func.h               |  2 ++
>  5 files changed, 63 insertions(+), 18 deletions(-)
>
> --
> 2.22.0.657.g960e92d24f-goog
>

^ permalink raw reply related

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-30  8:30 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Nikolay Aleksandrov, Horatiu Vultur, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <20190730070626.GA508@splinter>

The 07/30/2019 10:06, Ido Schimmel wrote:
> On Tue, Jul 30, 2019 at 08:27:22AM +0200, Allan W. Nielsen wrote:
> > The 07/29/2019 20:51, Ido Schimmel wrote:
> > > Can you please clarify what you're trying to achieve? I just read the
> > > thread again and my impression is that you're trying to locally receive
> > > packets with a certain link layer multicast address.
> > Yes. The thread is also a bit confusing because we half way through realized
> > that we misunderstood how the multicast packets should be handled (sorry about
> > that). To begin with we had a driver where multicast packets was only copied to
> > the CPU if someone needed it. Andrew and Nikolay made us aware that this is not
> > how other drivers are doing it, so we changed the driver to include the CPU in
> > the default multicast flood-mask.
> OK, so what prevents you from removing all other ports from the
> flood-mask and letting the CPU handle the flooding? Then you can install
> software tc filters to limit the flooding.
I do not have the bandwidth to forward the multicast traffic in the CPU.

It will also cause enormous latency on the forwarding of L2 multicast packets.

> > This changes the objective a bit. To begin with we needed to get more packets to
> > the CPU (which could have been done using tc ingress rules and a trap action).
> > 
> > Now after we changed the driver, we realized that we need something to limit the
> > flooding of certain L2 multicast packets. This is the new problem we are trying
> > to solve!
> > 
> > Example: Say we have a bridge with 4 slave interfaces, then we want to install a
> > forwarding rule saying that packets to a given L2-multicast MAC address, should
> > only be flooded to 2 of the 4 ports.
> > 
> > (instead of adding rules to get certain packets to the CPU, we are now adding
> > other rules to prevent other packets from going to the CPU and other ports where
> > they are not needed/wanted).
> > 
> > This is exactly the same thing as IGMP snooping does dynamically, but only for
> > IP multicast.
> > 
> > The "bridge mdb" allow users to manually/static add/del a port to a multicast
> > group, but still it operates on IP multicast address (not L2 multicast
> > addresses).
> > 
> > > Nik suggested SIOCADDMULTI.
> > It is not clear to me how this should be used to limit the flooding, maybe we
> > can make some hacks, but as far as I understand the intend of this is maintain
> > the list of addresses an interface should receive. I'm not sure this should
> > influence how for forwarding decisions are being made.
> > 
> > > and I suggested a tc filter to get the packet to the CPU.
> > The TC solution is a good solution to the original problem where wanted to copy
> > more frames to the CPU. But we were convinced that this is not the right
> > approach, and that the CPU by default should receive all multicast packets, and
> > we should instead try to find a way to limit the flooding of certain frames as
> > an optimization.
> 
> This can still work. In Linux, ingress tc filters are executed before the
> bridge's Rx handler. The same happens in every sane HW. Ingress ACL is
> performed before L2 forwarding. Assuming you have eth0-eth3 bridged and
> you want to prevent packets with DMAC 01:21:6C:00:00:01 from egressing
> eth2:
> 
> # tc filter add dev eth0 ingress pref 1 flower skip_sw \
> 	dst_mac 01:21:6C:00:00:01 action trap
> # tc filter add dev eth2 egress pref 1 flower skip_hw \
> 	dst_mac 01:21:6C:00:00:01 action drop
> 
> The first filter is only present in HW ('skip_sw') and should result in
> your HW passing you the sole copy of the packet.
Agree.

> The second filter is only present in SW ('skip_hw', not using HW egress
> ACL that you don't have) and drops the packet after it was flooded by
> the SW bridge.
Agree.

> As I mentioned earlier, you can install the filter once in your HW and
> share it between different ports using a shared block. This means you
> only consume one TCAM entry.
> 
> Note that this allows you to keep flooding all other multicast packets
> in HW.
Yes, but the frames we want to limit the flood-mask on are the exact frames
which occurs at a very high rate, and where latency is important.

I really do not consider it as an option to forward this in SW, when it is
something that can easily be offloaded in HW.

> > > If you now want to limit the ports to which this packet is flooded, then
> > > you can use tc filters in *software*:
> > > 
> > > # tc qdisc add dev eth2 clsact
> > > # tc filter add dev eth2 egress pref 1 flower skip_hw \
> > > 	dst_mac 01:21:6C:00:00:01 action drop
> > Yes. This can work in the SW bridge.
> > 
> > > If you want to forward the packet in hardware and locally receive it,
> > > you can chain several mirred action and then a trap action.
> > I'm not I fully understand how this should be done, but it does sound like it
> > becomes quite complicated. Also, as far as I understand it will mean that we
> > will be using TCAM/ACL resources to do something that could have been done with
> > a simple MAC entry.
> > 
> > > Both options avoid HW egress ACLs which your design does not support.
> > True, but what is wrong with expanding the functionality of the normal
> > forwarding/MAC operations to allow multiple destinations?
> > 
> > It is not an uncommon feature (I just browsed the manual of some common L2
> > switches and they all has this feature).
> > 
> > It seems to fit nicely into the existing user-interface:
> > 
> > bridge fdb add    01:21:6C:00:00:01 port eth0
> > bridge fdb append 01:21:6C:00:00:01 port eth1
> 
> Wouldn't it be better to instead extend the MDB entries so that they are
> either keyed by IP or MAC? I believe FDB should remain as unicast-only.

You might be right, it was not clear to me which of the two would fit the
purpose best.

From a user-space iproute2 perspective I prefer using the "bridge fdb" command
as it already supports the needed syntax, and I do not think it will be too
pretty if we squeeze this into the "bridge mdb" command syntax.

But that does not mean that it need to go into the FDB database in the
implementation.

Last evening when I looked at it again, I was considering keeping the
net_bridge_fdb_entry structure as is, and add a new hashtable with the
following:

struct net_bridge_fdbmc_entry {
	struct rhash_head		rhnode;
	struct net_bridge_fdbmc_ports   *dst;

	struct net_bridge_fdb_key	key;
	struct hlist_node		fdb_node;
	unsigned char			offloaded:1;

	struct rcu_head			rcu;
};

If we go with this approach then we can look at the MAC address and see if it is
a unicast which will cause a lookup in the fdb, l3-multicast (33:33:* or
01:00:5e:*) which will cause a lookup in the mdb, or finally a fdbmc which will
need to do a lookup in this new hashtable.

Alternative it would be like this:

struct net_bridge_fdb_entry {
	struct rhash_head		rhnode;
	union net_bridge_port_or_list	*dst;

	struct net_bridge_fdb_key	key;
	struct hlist_node		fdb_node;
	unsigned char			is_local:1,
					is_static:1,
					is_sticky:1,
					added_by_user:1,
					added_by_external_learn:1,
					offloaded:1;
					multi_dst:1;

	/* write-heavy members should not affect lookups */
	unsigned long			updated ____cacheline_aligned_in_smp;
	unsigned long			used;

	struct rcu_head			rcu;
};

Both solutions should require fairly few changes, and should not cause any
measurable performance hit.

Making it fit into the net_bridge_mdb_entry seems to be harder.

> As a bonus, existing drivers could benefit from it, as MDB entries are already
> notified by MAC.
Not sure I follow. When FDB entries are added, it also generates notification
events.

> > It seems that it can be added to the existing implementation with out adding
> > significant complexity.
> > 
> > It will be easy to offload in HW.
> > 
> > I do not believe that it will be a performance issue, if this is a concern then
> > we may have to do a bit of benchmarking, or we can make it a configuration
> > option.
> > 
> > Long story short, we (Horatiu and I) learned a lot from the discussion here, and
> > I think we should try do a new patch with the learning we got. Then it is easier
> > to see what it actually means to the exiting code, complexity, exiting drivers,
> > performance, default behavioral, backwards compatibly, and other valid concerns.
> > 
> > If the patch is no good, and cannot be fixed, then we will go back and look
> > further into alternative solutions.
> Overall, I tend to agree with Nik. I think your use case is too specific
> to justify the amount of changes you want to make in the bridge driver.
> We also provided other alternatives. That being said, you're more than
> welcome to send the patches and we can continue the discussion then.
Okay, good to know. I'm not sure I agree that the alternative solutions really
solves the issue this is trying to solve, nor do I agree that this is specific
to our needs.

But lets take a look at a new patch, and see what is the amount of changes we
are talking about. Without having the patch it is really hard to know for sure.

-- 
/Allan


^ permalink raw reply

* Re: [PATCH] drivers: net: wireless: rsi: return explicit error values
From: Kalle Valo @ 2019-07-30  8:29 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: linux-kernel, amitkarwar, siva8118, linux-wireless, netdev
In-Reply-To: <1564413872-10720-1-git-send-email-info@metux.net>

"Enrico Weigelt, metux IT consult" <info@metux.net> writes:

> From: Enrico Weigelt <info@metux.net>
>
> Explicitly return constants instead of variable (and rely on
> it to be explicitly initialized), if the value is supposed
> to be fixed anyways. Align it with the rest of the driver,
> which does it the same way.
>
> Signed-off-by: Enrico Weigelt <info@metux.net>
> ---
>  drivers/net/wireless/rsi/rsi_91x_sdio.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)

The prefix should be just "rsi:", I'll fix that.

-- 
Kalle Valo

^ permalink raw reply

* net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
From: Mark Smith @ 2019-07-30  8:15 UTC (permalink / raw)
  To: suyj.fnst, netdev

Hi,

I'm not subscribed to the Linux netdev mailing list, so I can't
directly reply to the patch email.

This patch is not the correct solution to this issue.

Per RFC 4291 "IP Version 6 Addressing Architecture", all IPv6
interfaces are required to have Link-Local addresses, so therefore
there should always be a link-local address available to use as the
source address for an ND NS.

"2.1. Addressing Model"

...

"All interfaces are required to have at least one Link-Local unicast
   address (see Section 2.8 for additional required addresses)."

I have submitted a more specific bug regarding no Link-Local
address/prefix on the Linux kernel loopback interface through RedHat
bugzilla as I use Fedora 30, however it doesn't seem to have been
looked at yet.

"Loopback network interface does not have a Link Local address,
contrary to RFC 4291"
https://bugzilla.redhat.com/show_bug.cgi?id=1706709


Thanks very much,
Mark.

^ permalink raw reply

* [PATCH v2] net: phy: phy_led_triggers: Fix a possible null-pointer dereference in phy_led_trigger_change_speed()
From: Jia-Ju Bai @ 2019-07-30  8:08 UTC (permalink / raw)
  To: andrew, f.fainelli, hkallweit1, davem; +Cc: netdev, linux-kernel, Jia-Ju Bai

In phy_led_trigger_change_speed(), there is an if statement on line 48
to check whether phy->last_triggered is NULL: 
    if (!phy->last_triggered)

When phy->last_triggered is NULL, it is used on line 52:
    led_trigger_event(&phy->last_triggered->trigger, LED_OFF);

Thus, a possible null-pointer dereference may occur.

To fix this bug, led_trigger_event(&phy->last_triggered->trigger,
LED_OFF) is called when phy->last_triggered is not NULL.

This bug is found by a static analysis tool STCheck written by
the OSLAB group in Tsinghua University.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
---
v2:
* Add the organization of the tool's authors.
  Thank David and Andrew for helpful advice.

---
 drivers/net/phy/phy_led_triggers.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c
index b86a4b2116f8..59a94e07e7c5 100644
--- a/drivers/net/phy/phy_led_triggers.c
+++ b/drivers/net/phy/phy_led_triggers.c
@@ -48,8 +48,9 @@ void phy_led_trigger_change_speed(struct phy_device *phy)
 		if (!phy->last_triggered)
 			led_trigger_event(&phy->led_link_trigger->trigger,
 					  LED_FULL);
+		else
+			led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
 
-		led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
 		led_trigger_event(&plt->trigger, LED_FULL);
 		phy->last_triggered = plt;
 	}
-- 
2.17.0


^ permalink raw reply related

* Re: [PATCH] net: phy: phy_led_triggers: Fix a possible null-pointer dereference in phy_led_trigger_change_speed()
From: Jia-Ju Bai @ 2019-07-30  8:03 UTC (permalink / raw)
  To: David Miller, andrew; +Cc: f.fainelli, hkallweit1, netdev, linux-kernel
In-Reply-To: <20190729.204113.316505378355498068.davem@davemloft.net>



On 2019/7/30 11:41, David Miller wrote:
> From: Andrew Lunn <andrew@lunn.ch>
> Date: Tue, 30 Jul 2019 05:32:29 +0200
>
>> On Tue, Jul 30, 2019 at 10:25:36AM +0800, Jia-Ju Bai wrote:
>>>
>>> On 2019/7/29 21:45, Andrew Lunn wrote:
>>>> On Mon, Jul 29, 2019 at 05:24:24PM +0800, Jia-Ju Bai wrote:
>>>>> In phy_led_trigger_change_speed(), there is an if statement on line 48
>>>>> to check whether phy->last_triggered is NULL:
>>>>>      if (!phy->last_triggered)
>>>>>
>>>>> When phy->last_triggered is NULL, it is used on line 52:
>>>>>      led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
>>>>>
>>>>> Thus, a possible null-pointer dereference may occur.
>>>>>
>>>>> To fix this bug, led_trigger_event(&phy->last_triggered->trigger,
>>>>> LED_OFF) is called when phy->last_triggered is not NULL.
>>>>>
>>>>> This bug is found by a static analysis tool STCheck written by us.
>>>> Who is 'us'?
>>> Me and my colleague...
>> Well, we can leave it very vague, giving no idea who 'us' is. But
>> often you want to name the company behind it, or the university, or
>> the sponsor, etc.
> I agree, if you are going to mention that there is a tool you should be
> clear exactly who and what organization are behind it

Thanks for the advice.
I will add my organization in the patch.


Best wishes,
Jia-Ju Bai

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox