Netdev List
 help / color / mirror / Atom feed
* [PATCH v4 net-next 1/4] net: macb: Reorganize macb_mii bringup
From: Brad Mouring @ 2018-03-12 21:34 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180312213435.115174-1-brad.mouring@ni.com>

The macb mii setup (mii_probe() and mii_init()) previously was
somewhat interspersed, likely a result of organic growth and hacking.

This change moves mii bus registration into mii_init and probing the
bus for devices into mii_probe.

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 79 +++++++++++++++++---------------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index e84afcf1ecb5..9b6195fbbf8e 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -472,8 +472,42 @@ static int macb_mii_probe(struct net_device *dev)
 	struct macb *bp = netdev_priv(dev);
 	struct macb_platform_data *pdata;
 	struct phy_device *phydev;
-	int phy_irq;
-	int ret;
+	struct device_node *np;
+	int phy_irq, ret, i;
+
+	pdata = dev_get_platdata(&bp->pdev->dev);
+	np = bp->pdev->dev.of_node;
+	ret = 0;
+
+	if (np) {
+		if (of_phy_is_fixed_link(np)) {
+			if (of_phy_register_fixed_link(np) < 0) {
+				dev_err(&bp->pdev->dev,
+					"broken fixed-link specification\n");
+				return -ENODEV;
+			}
+			bp->phy_node = of_node_get(np);
+		} else {
+			/* fallback to standard phy registration if no phy were
+			 * found during dt phy registration
+			 */
+			if (!phy_find_first(bp->mii_bus)) {
+				for (i = 0; i < PHY_MAX_ADDR; i++) {
+					struct phy_device *phydev;
+
+					phydev = mdiobus_scan(bp->mii_bus, i);
+					if (IS_ERR(phydev) &&
+					    PTR_ERR(phydev) != -ENODEV) {
+						ret = PTR_ERR(phydev);
+						break;
+					}
+				}
+
+				if (ret)
+					return -ENODEV;
+			}
+		}
+	}
 
 	if (bp->phy_node) {
 		phydev = of_phy_connect(dev, bp->phy_node,
@@ -488,7 +522,6 @@ static int macb_mii_probe(struct net_device *dev)
 			return -ENXIO;
 		}
 
-		pdata = dev_get_platdata(&bp->pdev->dev);
 		if (pdata) {
 			if (gpio_is_valid(pdata->phy_irq_pin)) {
 				ret = devm_gpio_request(&bp->pdev->dev,
@@ -533,7 +566,7 @@ static int macb_mii_init(struct macb *bp)
 {
 	struct macb_platform_data *pdata;
 	struct device_node *np;
-	int err = -ENXIO, i;
+	int err;
 
 	/* Enable management port */
 	macb_writel(bp, NCR, MACB_BIT(MPE));
@@ -556,39 +589,9 @@ static int macb_mii_init(struct macb *bp)
 	dev_set_drvdata(&bp->dev->dev, bp->mii_bus);
 
 	np = bp->pdev->dev.of_node;
-	if (np) {
-		if (of_phy_is_fixed_link(np)) {
-			if (of_phy_register_fixed_link(np) < 0) {
-				dev_err(&bp->pdev->dev,
-					"broken fixed-link specification\n");
-				goto err_out_unregister_bus;
-			}
-			bp->phy_node = of_node_get(np);
-
-			err = mdiobus_register(bp->mii_bus);
-		} else {
-			/* try dt phy registration */
-			err = of_mdiobus_register(bp->mii_bus, np);
-
-			/* fallback to standard phy registration if no phy were
-			 * found during dt phy registration
-			 */
-			if (!err && !phy_find_first(bp->mii_bus)) {
-				for (i = 0; i < PHY_MAX_ADDR; i++) {
-					struct phy_device *phydev;
-
-					phydev = mdiobus_scan(bp->mii_bus, i);
-					if (IS_ERR(phydev) &&
-					    PTR_ERR(phydev) != -ENODEV) {
-						err = PTR_ERR(phydev);
-						break;
-					}
-				}
 
-				if (err)
-					goto err_out_unregister_bus;
-			}
-		}
+	if (np) {
+		err = of_mdiobus_register(bp->mii_bus, np);
 	} else {
 		for (i = 0; i < PHY_MAX_ADDR; i++)
 			bp->mii_bus->irq[i] = PHY_POLL;
@@ -610,10 +613,10 @@ static int macb_mii_init(struct macb *bp)
 
 err_out_unregister_bus:
 	mdiobus_unregister(bp->mii_bus);
-err_out_free_mdiobus:
-	of_node_put(bp->phy_node);
 	if (np && of_phy_is_fixed_link(np))
 		of_phy_deregister_fixed_link(np);
+err_out_free_mdiobus:
+	of_node_put(bp->phy_node);
 	mdiobus_free(bp->mii_bus);
 err_out:
 	return err;
-- 
2.16.2

^ permalink raw reply related

* [PATCH v4 net-next 2/4] net: macb: Remove redundant poll irq assignment
From: Brad Mouring @ 2018-03-12 21:34 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180312213435.115174-1-brad.mouring@ni.com>

In phy_device's general probe, this device will already be set for
phy register polling, rendering this code redundant.

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 9b6195fbbf8e..db1dc301bed3 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -593,9 +593,6 @@ static int macb_mii_init(struct macb *bp)
 	if (np) {
 		err = of_mdiobus_register(bp->mii_bus, np);
 	} else {
-		for (i = 0; i < PHY_MAX_ADDR; i++)
-			bp->mii_bus->irq[i] = PHY_POLL;
-
 		if (pdata)
 			bp->mii_bus->phy_mask = pdata->phy_mask;
 
-- 
2.16.2

^ permalink raw reply related

* [PATCH v4 net-next 0/4]
From: Brad Mouring @ 2018-03-12 21:34 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree
In-Reply-To: <20180312175956.GS27783@lunn.ch>


Consider the situation where a macb netdev is connected through
a phydev that sits on a mii bus other than the one provided to
this particular netdev. This situation is what this patchset aims
to accomplish through the existing phy-handle optional binding.

This optional binding (as described in the ethernet DT bindings doc)
directs the netdev to the phydev to use. This is precisely the
situation this patchset aims to solve, so it makes sense to introduce
the functionality to this driver (where the physical layout discussed
was encountered).

The devicetree snippet would look something like this:

...
   ethernet@feedf00d {
           ...
           phy-handle = <&phy0> // the first netdev is physically wired to phy0
           ...
           phy0: phy@0 {
                   ...
                   reg = <0x0> // MDIO address 0
                   ...
           }
           phy1: phy@1 {
                   ...
                   reg = <0x1> // MDIO address 1
                   ...
           }
   ...
   }

   ethernet@deadbeef {
           ...
           phy-handle = <&phy1> // tells the driver to use phy1 on the
                                // first mac's mdio bus (it's wired thusly)
           ...
   }
...

The work done to add the phy_node in the first place (dacdbb4dfc1a1:
"net: macb: add fixed-link node support") will consume the
device_node (if found).

v2: Reorganization of mii probe/init functions, suggested by Andrew Lunn
v3: Moved some of the bus init code back into init (erroneously moved to probe)
    some style issues, and an unintialized variable warning addressed.
v4: Add Reviewed-by: tags
    Skip fallback code if phy-handle phandle is found

^ permalink raw reply

* [PATCH v4 net-next 3/4] net: macb: Add phy-handle DT support
From: Brad Mouring @ 2018-03-12 21:34 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180312213435.115174-1-brad.mouring@ni.com>

This optional binding (as described in the ethernet DT bindings doc)
directs the netdev to the phydev to use. This is useful for a phy
chip that has >1 phy in it, and two netdevs are using the same phy
chip (i.e. the second mac's phy lives on the first mac's MDIO bus)

The devicetree snippet would look something like this:

ethernet@feedf00d {
	...
	phy-handle = <&phy0> // the first netdev is physically wired to phy0
	...
	phy0: phy@0 {
		...
		reg = <0x0> // MDIO address 0
		...
	}
	phy1: phy@1 {
		...
		reg = <0x1> // MDIO address 1
		...
	}
...
}

ethernet@deadbeef {
	...
	phy-handle = <&phy1> // tells the driver to use phy1 on the
						 // first mac's mdio bus (it's wired thusly)
	...
}

The work done to add the phy_node in the first place (dacdbb4dfc1a1:
"net: macb: add fixed-link node support") will consume the
device_node (if found).

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 34 ++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index db1dc301bed3..4a2ad35d41aa 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -488,23 +488,27 @@ static int macb_mii_probe(struct net_device *dev)
 			}
 			bp->phy_node = of_node_get(np);
 		} else {
-			/* fallback to standard phy registration if no phy were
-			 * found during dt phy registration
-			 */
-			if (!phy_find_first(bp->mii_bus)) {
-				for (i = 0; i < PHY_MAX_ADDR; i++) {
-					struct phy_device *phydev;
-
-					phydev = mdiobus_scan(bp->mii_bus, i);
-					if (IS_ERR(phydev) &&
-					    PTR_ERR(phydev) != -ENODEV) {
-						ret = PTR_ERR(phydev);
-						break;
+			/* attempt to find a phy-handle */
+			if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
+
+				/* fallback to standard phy registration if no phy were
+				 * found during dt phy registration
+				 */
+				if (!phy_find_first(bp->mii_bus)) {
+					for (i = 0; i < PHY_MAX_ADDR; i++) {
+						struct phy_device *phydev;
+	
+						phydev = mdiobus_scan(bp->mii_bus, i);
+						if (IS_ERR(phydev) &&
+						    PTR_ERR(phydev) != -ENODEV) {
+							ret = PTR_ERR(phydev);
+							break;
+						}
 					}
+	
+					if (ret)
+						return -ENODEV;
 				}
-
-				if (ret)
-					return -ENODEV;
 			}
 		}
 	}
-- 
2.16.2

^ permalink raw reply related

* [PATCH v4 net-next 4/4] Documentation: macb: Document phy-handle binding
From: Brad Mouring @ 2018-03-12 21:34 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180312213435.115174-1-brad.mouring@ni.com>

Document the existance of the optional binding, directing to the
general ethernet document that describes this binding.

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/devicetree/bindings/net/macb.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 27966ae741e0..457d5ae16f23 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -29,6 +29,7 @@ Optional properties for PHY child node:
 - reset-gpios : Should specify the gpio for phy reset
 - magic-packet : If present, indicates that the hardware supports waking
   up via magic packet.
+- phy-handle : see ethernet.txt file in the same directory
 
 Examples:
 
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH iproute2] Revert "iproute: "list/flush/save default" selected all of the routes"
From: Luca Boccassi @ 2018-03-12 21:37 UTC (permalink / raw)
  To: Stephen Hemminger, green; +Cc: netdev
In-Reply-To: <20180312210301.12757-1-stephen@networkplumber.org>

[-- Attachment #1: Type: text/plain, Size: 966 bytes --]

On Mon, 2018-03-12 at 14:03 -0700, Stephen Hemminger wrote:
> This reverts commit 9135c4d6037ff9f1818507bac0049fc44db8c3d2.
> 
> Debian maintainer found that basic command:
> 	# ip route flush all
> No longer worked as expected which breaks user scripts and
> expectations. It no longer flushed all IPv4 routes.
> 
> Reported-by: Luca Boccassi <bluca@debian.org>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  ip/iproute.c | 65 ++++++++++++++++++------------------------------
> ------------
>  lib/utils.c  | 13 ++++++++++++
>  2 files changed, 32 insertions(+), 46 deletions(-)

Tested-by: Luca Boccassi <bluca@debian.org>

Thanks, solves the problem. I'll backport it to Debian.

Alexander, reproducing the issue is quite simple - before that commit,
ip route ls all showed all routes, but with the change it started
showing only the default table. Same for ip route flush.

-- 
Kind regards,
Luca Boccassi

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [pull request][for-next 00/11] Mellanox, mlx5 IPSec updates 2018-02-28-2 (Part 2)
From: Saeed Mahameed @ 2018-03-12 21:43 UTC (permalink / raw)
  To: davem@davemloft.net, dledford@redhat.com
  Cc: Jason Gunthorpe, netdev@vger.kernel.org, Matan Barak,
	linux-rdma@vger.kernel.org, Leon Romanovsky, Aviad Yehezkel,
	Boris Pismenny
In-Reply-To: <f6c61009-1c27-3ea5-5aae-ff27aa1a755d@redhat.com>

On Thu, 2018-03-08 at 14:14 -0500, Doug Ledford wrote:
> On 3/8/2018 1:04 PM, David Miller wrote:
> > From: Saeed Mahameed <saeedm@mellanox.com>
> > Date: Wed,  7 Mar 2018 17:26:03 -0800
> > 
> > > Hi Dave and Doug,
> > > 
> > > This series includes shared code updates (IPSec part2) for mlx5
> > > core 
> > > driver for both netdev and rdma subsystems.  This series should
> > > be pulled
> > > to both trees so we can continue netdev and rdma specific
> > > submissions
> > > separately.
> > 
> > Doug, please give this series a quick review.
> > 
> > Thank you.
> > 
> 
> I'm good with it.  Pull it as you see fit.
> 

Hi Doug, 

Just FYI, I see that Dave already pulled the series into net-next.
I think you can pull as well, so Leon will start the mlx5 RDMA
submissions as planned on top this last pull request.

thanks,
Saeed.

^ permalink raw reply

* Re: Problem with bridge (mcast-to-ucast + hairpin) and Broadcom's 802.11f in their FullMAC fw
From: Rafał Miłecki @ 2018-03-12 21:49 UTC (permalink / raw)
  To: Linus Lüssing
  Cc: Felix Fietkau, Arend van Spriel, Franky Lin, Hante Meuleman,
	Chi-Hsien Lin, Wright Feng, Pieter-Paul Giesberts,
	Network Development,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	open list:BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER,
	open list:BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER <brcm80211-dev-list.pdl-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>,
In-Reply-To: <20180312110856.GL2470@otheros>

On 12 March 2018 at 12:08, Linus Lüssing <linus.luessing-djzkFPsfvsizQB+pC5nmwQ@public.gmane.org> wrote:
> On Tue, Feb 27, 2018 at 11:08:20AM +0100, Rafał Miłecki wrote:
>> I've problem when using OpenWrt/LEDE on a home router with Broadcom's
>> FullMAC WiFi chipset.
>
> Hi Rafał,
>
> Thanks for reporting this issue!
>
>> Can you see any solution for this problem? Is that an option to stop
>> multicast-to-unicast from touching 802.11f packets? Some other ideas?
>> Obviously I can't modify Broadcom's firmware and drop that obsoleted
>> standard.
>
> Just to avoid some potential confusion: This is more an issue of
> hairpinning than it is an issue of multicast-to-unicast per se,
> right?
>
> That is, if you set this to 0 manually:
> /sys/class/net/<ap-iface>/brport/multicast_to_unicast
> Then the issue still occurs, right?

I can't really tell, I didn't go into details on bridge +
mcast-to-unicast + hairpin.

By default OpenWrt/LEDE sets
/sys/class/net/<ap-iface>/brport/multicast_to_unicast
to 1. Changing it to 0 (with a simple echo) doesn't /fix/ the problem.

-- 
Rafał

^ permalink raw reply

* Re: Problem with bridge (mcast-to-ucast + hairpin) and Broadcom's 802.11f in their FullMAC fw
From: Rafał Miłecki @ 2018-03-12 21:52 UTC (permalink / raw)
  To: Linus Lüssing
  Cc: Felix Fietkau, Arend van Spriel, Franky Lin, Hante Meuleman,
	Chi-Hsien Lin, Wright Feng, Pieter-Paul Giesberts,
	Network Development,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	open list:BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER,
	open list:BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER <brcm80211-dev-list.pdl-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>,
In-Reply-To: <20180312114815.GN2470@otheros>

On 12 March 2018 at 12:48, Linus Lüssing <linus.luessing-djzkFPsfvsizQB+pC5nmwQ@public.gmane.org> wrote:
> On Mon, Mar 12, 2018 at 12:08:56PM +0100, Linus Lüssing wrote:
>> On Tue, Feb 27, 2018 at 11:08:20AM +0100, Rafał Miłecki wrote:
>> > I've problem when using OpenWrt/LEDE on a home router with Broadcom's
>> > FullMAC WiFi chipset.
>>
>> Hi Rafał,
>>
>> Thanks for reporting this issue!
>>
>> > Can you see any solution for this problem? Is that an option to stop
>> > multicast-to-unicast from touching 802.11f packets? Some other ideas?
>> > Obviously I can't modify Broadcom's firmware and drop that obsoleted
>> > standard.
>>
>> Just to avoid some potential confusion: This is more an issue of
>> hairpinning than it is an issue of multicast-to-unicast per se,
>> right?
>>
>> That is, if you set this to 0 manually:
>> /sys/class/net/<ap-iface>/brport/multicast_to_unicast
>> Then the issue still occurs, right?
>
> Btw., if in OpenWRT/LEDE you set 'option multicast_to_unicast 0'
> in /etc/config/network for the bridge (and not bridge port)
> then netifd should refrain from setting the bridge hairpinning and
> AP isolation on wireless devices, too, if I remember correctly.
>
> Can you confirm that the issue disappears for you then?

Yes, absolutely. This reverts OpenWrt/LEDE to the old setup (no
mcast-to-ucast + hairpin) and it works. I was hoping we can make
mcast-to-ucast + hairpin work with Broadcom's 802.11f however instead
of moving OpenWrt/LEDE back to the old setup.

-- 
Rafał

^ permalink raw reply

* Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Siwei Liu @ 2018-03-12 21:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Stephen Hemminger, David Miller, Netdev,
	Jiri Pirko, virtio-dev, Brandeburg, Jesse, Alexander Duyck,
	Jakub Kicinski
In-Reply-To: <20180304060014-mutt-send-email-mst@kernel.org>

On Sat, Mar 3, 2018 at 8:04 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Fri, Mar 02, 2018 at 03:56:31PM -0800, Siwei Liu wrote:
>> On Fri, Mar 2, 2018 at 1:36 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Fri, Mar 02, 2018 at 01:11:56PM -0800, Siwei Liu wrote:
>> >> On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
>> >> <sridhar.samudrala@intel.com> wrote:
>> >> > This patch enables virtio_net to switch over to a VF datapath when a VF
>> >> > netdev is present with the same MAC address. It allows live migration
>> >> > of a VM with a direct attached VF without the need to setup a bond/team
>> >> > between a VF and virtio net device in the guest.
>> >> >
>> >> > The hypervisor needs to enable only one datapath at any time so that
>> >> > packets don't get looped back to the VM over the other datapath. When a VF
>> >> > is plugged, the virtio datapath link state can be marked as down. The
>> >> > hypervisor needs to unplug the VF device from the guest on the source host
>> >> > and reset the MAC filter of the VF to initiate failover of datapath to
>> >> > virtio before starting the migration. After the migration is completed,
>> >> > the destination hypervisor sets the MAC filter on the VF and plugs it back
>> >> > to the guest to switch over to VF datapath.
>> >> >
>> >> > When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>> >> > created that acts as a master device and tracks the state of the 2 lower
>> >> > netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
>> >> > passthru device with the same MAC is registered as 'active' netdev.
>> >> >
>> >> > This patch is based on the discussion initiated by Jesse on this thread.
>> >> > https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>> >> >
>> >> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> >> > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>> >> > Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
>> >> > ---
>> >> >  drivers/net/virtio_net.c | 683 ++++++++++++++++++++++++++++++++++++++++++++++-
>> >> >  1 file changed, 682 insertions(+), 1 deletion(-)
>> >> >
>> >> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> >> > index bcd13fe906ca..f2860d86c952 100644
>> >> > --- a/drivers/net/virtio_net.c
>> >> > +++ b/drivers/net/virtio_net.c
>> >> > @@ -30,6 +30,8 @@
>> >> >  #include <linux/cpu.h>
>> >> >  #include <linux/average.h>
>> >> >  #include <linux/filter.h>
>> >> > +#include <linux/netdevice.h>
>> >> > +#include <linux/pci.h>
>> >> >  #include <net/route.h>
>> >> >  #include <net/xdp.h>
>> >> >
>> >> > @@ -206,6 +208,9 @@ struct virtnet_info {
>> >> >         u32 speed;
>> >> >
>> >> >         unsigned long guest_offloads;
>> >> > +
>> >> > +       /* upper netdev created when BACKUP feature enabled */
>> >> > +       struct net_device *bypass_netdev;
>> >> >  };
>> >> >
>> >> >  struct padded_vnet_hdr {
>> >> > @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>> >> >         }
>> >> >  }
>> >> >
>> >> > +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
>> >> > +                                     size_t len)
>> >> > +{
>> >> > +       struct virtnet_info *vi = netdev_priv(dev);
>> >> > +       int ret;
>> >> > +
>> >> > +       if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
>> >> > +               return -EOPNOTSUPP;
>> >> > +
>> >> > +       ret = snprintf(buf, len, "_bkup");
>> >> > +       if (ret >= len)
>> >> > +               return -EOPNOTSUPP;
>> >> > +
>> >> > +       return 0;
>> >> > +}
>> >> > +
>> >>
>> >> What if the systemd/udevd is not new enough to enforce the
>> >> n<phys_port_name> naming? Would virtio_bypass get a different name
>> >> than the original virtio_net?
>> >
>> > You mean people using ethX names? Any hardware config change breaks
>> > these, I don't think that can be helped.
>>
>> I don't like the way to rely on .ndo_get_phys_port_name - it's fragile
>> and it does not completely solve the problem it tries to address.
>> Imagine what can end up with if getting an old udevd, or users already
>> have exsiting explicit udev rules around phys_port_name. It does not
>> give you the an ack in saying "yes, I know you're the bypass and
>> you're the backup, please continue and I will give you both correct
>> names", or an unacknowlegment saying "no, I don't know what these
>> extra interfaces are, please go back and leave the VF device alone".
>> We need new udev API for both feature negotiation and naming, or may
>> even completely hide the lower interfaces.
>
> Go ahead and try to make this happen, but I won't hold my
> breath.

Heads-up: I have some working code to hide the lower devices, upon
which the new udev/sysfs interace for feature negotiation and naming
will be built. I will get it posted for review in the next few days
once ready. Hopefully this could end the fight between the 3-netdev
and 2-netdev driver model, as I see most of the problems being argued
in this thread can be addressed by just hiding the lower devices from
the 3-netdev model while keeping its merits still.

Regards,
-Siwei


>
>> >
>> >> Should we detect this earlier and fall
>> >> back to legacy mode without creating the bypass netdev and ensalving
>> >> the VF?
>> >
>> > I don't think we can do this with existing kernel/userspace APIs.
>>
>> That's why I ever said to make udev aware of this new type of combined
>> device instead of doing hacks here and there around.
>>
>> Regards,
>> -Siwei
>
> We can add new interfaces on top but the main purpose here is to
> make old userspace do new tricks.
>
>> >
>> > --
>> > MST

^ permalink raw reply

* Re: [PATCH v4 net-next 3/4] net: macb: Add phy-handle DT support
From: Andrew Lunn @ 2018-03-12 21:57 UTC (permalink / raw)
  To: Brad Mouring
  Cc: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Mark Rutland, netdev, Julia Cartwright, devicetree
In-Reply-To: <20180312213435.115174-4-brad.mouring@ni.com>

> +			/* attempt to find a phy-handle */
> +			if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
> +
> +				/* fallback to standard phy registration if no phy were
> +				 * found during dt phy registration
> +				 */
> +				if (!phy_find_first(bp->mii_bus)) {
> +					for (i = 0; i < PHY_MAX_ADDR; i++) {
> +						struct phy_device *phydev;
> +	
> +						phydev = mdiobus_scan(bp->mii_bus, i);
> +						if (IS_ERR(phydev) &&
> +						    PTR_ERR(phydev) != -ENODEV) {
> +							ret = PTR_ERR(phydev);
> +							break;
> +						}

Hi Brad

./scipts/checkpatch.pl ~/brad.mouring 
WARNING: line over 80 characters
#122: FILE: drivers/net/ethernet/cadence/macb_main.c:492:
+		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {

ERROR: do not use assignment in if condition
#122: FILE: drivers/net/ethernet/cadence/macb_main.c:492:
+		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {

CHECK: Blank lines aren't necessary after an open brace '{'
#123: FILE: drivers/net/ethernet/cadence/macb_main.c:493:
+		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
+

WARNING: line over 80 characters
#124: FILE: drivers/net/ethernet/cadence/macb_main.c:494:
+			/* fallback to standard phy registration if no phy were

ERROR: trailing whitespace
#130: FILE: drivers/net/ethernet/cadence/macb_main.c:500:
+^I$

WARNING: line over 80 characters
#131: FILE: drivers/net/ethernet/cadence/macb_main.c:501:
+					phydev = mdiobus_scan(bp->mii_bus, i);

WARNING: Too many leading tabs - consider code refactoring
#132: FILE: drivers/net/ethernet/cadence/macb_main.c:502:
+					if (IS_ERR(phydev) &&

etc

	Andrew

^ permalink raw reply

* Re: [PATCH bpf-next v4 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
From: Peter Zijlstra @ 2018-03-12 22:01 UTC (permalink / raw)
  To: Song Liu; +Cc: netdev, ast, daniel, kernel-team, hannes, qinteng
In-Reply-To: <20180312203957.2025833-2-songliubraving@fb.com>

On Mon, Mar 12, 2018 at 01:39:56PM -0700, Song Liu wrote:
> +static void stack_map_get_build_id_offset(struct bpf_map *map,
> +					  struct stack_map_bucket *bucket,
> +					  u64 *ips, u32 trace_nr)
> +{
> +	int i;
> +	struct vm_area_struct *vma;
> +	struct bpf_stack_build_id *id_offs;
> +
> +	bucket->nr = trace_nr;
> +	id_offs = (struct bpf_stack_build_id *)bucket->data;
> +
> +	if (!current || !current->mm ||
> +	    down_read_trylock(&current->mm->mmap_sem) == 0) {

You probably want an in_nmi() before the down_read_trylock(). Doing
up_read() is an absolute no-no from NMI context.

And IIUC its 'trivial' to use this stuff with hardware counters.

> +		/* cannot access current->mm, fall back to ips */
> +		for (i = 0; i < trace_nr; i++) {
> +			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> +			id_offs[i].ip = ips[i];
> +		}
> +		return;
> +	}
> +
> +	for (i = 0; i < trace_nr; i++) {
> +		vma = find_vma(current->mm, ips[i]);
> +		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
> +			/* per entry fall back to ips */
> +			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> +			id_offs[i].ip = ips[i];
> +			continue;
> +		}
> +		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
> +			- vma->vm_start;
> +		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
> +	}
> +	up_read(&current->mm->mmap_sem);
> +}

^ permalink raw reply

* Re: [PATCH net-next v2 0/4] ibmvnic: Fix VLAN and other device errata
From: Thomas Falcon @ 2018-03-12 22:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jallen, nfont
In-Reply-To: <20180312.125612.912873724451108597.davem@davemloft.net>

On 03/12/2018 11:56 AM, David Miller wrote:
> From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
> Date: Mon, 12 Mar 2018 11:51:01 -0500
>
>> This patch series contains fixes for VLAN and other backing hardware
>> errata. The VLAN fixes are mostly to account for the additional four
>> bytes VLAN header in TX descriptors and buffers, when applicable.
>>
>> The other fixes for device errata are to pad small packets to avoid a
>> possible connection error that can occur when some devices attempt to
>> transmit small packets. The other fixes are GSO related. Some devices
>> cannot handle a smaller MSS or a packet with a single segment, so
>> disable GSO in those cases.
>>
>> v2: Fix style mistake (unneeded brackets) in patch 3/4
> Series applied, thanks Thomas.
>
Really sorry about this, but 3/4 is still wrong.  There is actually a compiler warning caused by it, which I'm surprised wasn't caught by the test robot.  Is there still time to send a v3?

^ permalink raw reply

* Re: [PATCH net-next] net: phy: set link state to down when creating the phy_device
From: Florian Fainelli @ 2018-03-12 22:10 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn; +Cc: netdev@vger.kernel.org
In-Reply-To: <a48309ab-97ee-153a-8153-98c38117af09@gmail.com>

On 03/11/2018 07:00 AM, Heiner Kallweit wrote:
> Currently the link state is initialized to "up" when the phy_device is
> being created. This is not consistent with the phy state being
> initialized to PHY_DOWN.
> 
> Usually this doen't do any harm because the link state is updated
> once the PHY reaches state PHY_AN. However e.g. if a LAN port isn't
> used and the PHY remains down this inconsistency remains and calls
> to functions like phy_print_status() give false results.
> Therefore change the initialization to link being down.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

FWIW, I have been working on some patches to try to reduce the amount of
link drops that we are seeing. This particular change may be changed in
the future to try to initialize the link state as early as possible, but
for now this is good.

Thanks!
-- 
Florian

^ permalink raw reply

* Re: [PATCH v4 net-next 3/4] net: macb: Add phy-handle DT support
From: Florian Fainelli @ 2018-03-12 22:30 UTC (permalink / raw)
  To: Andrew Lunn, Brad Mouring
  Cc: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Mark Rutland, netdev, Julia Cartwright, devicetree
In-Reply-To: <20180312215738.GA3674@lunn.ch>

On 03/12/2018 02:57 PM, Andrew Lunn wrote:
>> +			/* attempt to find a phy-handle */
>> +			if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
>> +
>> +				/* fallback to standard phy registration if no phy were
>> +				 * found during dt phy registration
>> +				 */
>> +				if (!phy_find_first(bp->mii_bus)) {
>> +					for (i = 0; i < PHY_MAX_ADDR; i++) {
>> +						struct phy_device *phydev;
>> +	
>> +						phydev = mdiobus_scan(bp->mii_bus, i);
>> +						if (IS_ERR(phydev) &&
>> +						    PTR_ERR(phydev) != -ENODEV) {
>> +							ret = PTR_ERR(phydev);
>> +							break;
>> +						}
> 
> Hi Brad

I would be a bit relaxed on these warnings because the existing function
indentation really makes it easy to be over 80 columns. Unless you have
a preliminary patch which, as I was suggesting earlier, re-arranges the
branches such that if (!np) is the first thing you test, I don't see how
this can look any better...

> 
> ./scipts/checkpatch.pl ~/brad.mouring 
> WARNING: line over 80 characters
> #122: FILE: drivers/net/ethernet/cadence/macb_main.c:492:
> +		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
> 
> ERROR: do not use assignment in if condition
> #122: FILE: drivers/net/ethernet/cadence/macb_main.c:492:
> +		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
> 
> CHECK: Blank lines aren't necessary after an open brace '{'
> #123: FILE: drivers/net/ethernet/cadence/macb_main.c:493:
> +		if (!(bp->phy_node = of_parse_phandle(np, "phy-handle", 0))) {
> +
> 
> WARNING: line over 80 characters
> #124: FILE: drivers/net/ethernet/cadence/macb_main.c:494:
> +			/* fallback to standard phy registration if no phy were
> 
> ERROR: trailing whitespace
> #130: FILE: drivers/net/ethernet/cadence/macb_main.c:500:
> +^I$
> 
> WARNING: line over 80 characters
> #131: FILE: drivers/net/ethernet/cadence/macb_main.c:501:
> +					phydev = mdiobus_scan(bp->mii_bus, i);
> 
> WARNING: Too many leading tabs - consider code refactoring
> #132: FILE: drivers/net/ethernet/cadence/macb_main.c:502:
> +					if (IS_ERR(phydev) &&
> 
> etc
> 
> 	Andrew
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Florian

^ permalink raw reply

* Re: Problem with bridge (mcast-to-ucast + hairpin) and Broadcom's 802.11f in their FullMAC fw
From: Rafał Miłecki @ 2018-03-12 22:42 UTC (permalink / raw)
  To: Linus Lüssing, Felix Fietkau, Arend van Spriel, Franky Lin,
	Hante Meuleman, Chi-Hsien Lin, Wright Feng, Pieter-Paul Giesberts
  Cc: Network Development,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	open list:BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER,
	brcm80211-dev-list-+wT8y+m8/X5BDgjK7y7TUQ
In-Reply-To: <CACna6rz9L09g9oeHhvt209Tg1E3gKgmhGnYF653AdkXfZf=4kw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On 02/27/2018 11:08 AM, Rafał Miłecki wrote:
> I've problem when using OpenWrt/LEDE on a home router with Broadcom's
> FullMAC WiFi chipset.
>
>
> First of all OpenWrt/LEDE uses bridge interface for LAN network with:
> 1) IFLA_BRPORT_MCAST_TO_UCAST
> 2) Clients isolation in hostapd
> 3) Hairpin mode enabled
>
> For more details please see Linus's patch description:
> https://patchwork.kernel.org/patch/9530669/
> and maybe hairpin mode patch:
> https://lwn.net/Articles/347344/
>
> Short version: in that setup packets received from a bridged wireless
> interface can be handled back to it for transmission.
>
>
> Now, Broadcom's firmware for their FullMAC chipsets in AP mode
> supports an obsoleted 802.11f AKA IAPP standard. It's a roaming
> standard that was replaced by 802.11r.
>
> Whenever a new station associates, firmware generates a packet like:
> ff ff ff ff  ff ff ec 10  7b 5f ?? ??  00 06 00 01  af 81 01 00
> (just masked 2 bytes of my MAC)
>
> For mode details you can see discussion in my brcmfmac patch thread:
> https://patchwork.kernel.org/patch/10191451/
>
>
> The problem is that bridge (in setup as above) handles such a packet
> back to the device.
>
> That makes Broadcom's FullMAC firmware believe that a given station
> just connected to another AP in a network (which doesn't even exist).
> As a result firmware immediately disassociates that station. It's
> simply impossible to connect to the router. Every association is
> followed by immediate disassociation.
>
>
> Can you see any solution for this problem? Is that an option to stop
> multicast-to-unicast from touching 802.11f packets? Some other ideas?
> Obviously I can't modify Broadcom's firmware and drop that obsoleted
> standard.

I kept thinking about this for a bit more. We probably have to start
with deciding what to blame for this problem. I see two options I'll
describe below.

Please share your opinions on this.


1) Blame brcmfmac fw for implementing 802.11f

In that case we should add workaround to brcmfmac driver to drop/ignore
802.11f packets. A slightly improved version of my
[PATCH] brcmfmac: detect & reject faked packet generated by a firmware
https://patchwork.kernel.org/patch/10191451/
should work.


2) Blame bridge + mcast-to-ucast + hairpin for 802.11f incompatibility

If we agree that 802.11f support in FullMAC firmware is acceptable, then
we have to make sure Linux's bridge doesn't break it by passing 802.11f
(broadcast) frames back to the source interface. That would require a
check like in below diff + proper code for handling such packets. I'm
afraid I'm not familiar with bridge code enough to complete that.

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index edae702..9e5d6ea 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -126,6 +126,27 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
  	}
  }

+static bool br_skb_is_iapp_add_packet(struct sk_buff *skb)
+{
+	const u8 iapp_add_packet[6] __aligned(2) = {
+		0x00, 0x01, 0xaf, 0x81, 0x01, 0x00,
+	};
+#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	const u16 *a = (const u16 *)skb->data;
+	const u16 *b = (const u16 *)iapp_add_packet;
+#endif
+
+	if (skb->len != 6)
+		return false;
+
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	return !(((*(const u32 *)skb->data) ^ (*(const u32 *)iapp_add_packet)) |
+	         ((*(const u16 *)(skb->data + 4)) ^ (*(const u16 *)(iapp_add_packet + 4))));
+#else
+	return !((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]));
+#endif
+}
+
  /* note: already called with rcu_read_lock */
  int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
  {
@@ -155,6 +176,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
  	if (is_multicast_ether_addr(dest)) {
  		/* by definition the broadcast is also a multicast address */
  		if (is_broadcast_ether_addr(dest)) {
+			if (br_skb_is_iapp_add_packet(skb))
+				pr_warn("This packet should not be passed back to the source interface!\n");
  			pkt_type = BR_PKT_BROADCAST;
  			local_rcv = true;
  		} else {

^ permalink raw reply related

* Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Siwei Liu @ 2018-03-12 22:44 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Stephen Hemminger, David Miller, Netdev,
	Jiri Pirko, virtio-dev, Brandeburg, Jesse, Alexander Duyck,
	Jakub Kicinski
In-Reply-To: <1519934923-39372-3-git-send-email-sridhar.samudrala@intel.com>

Apologies, still some comments going. Please see inline.

On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
>
> The hypervisor needs to enable only one datapath at any time so that
> packets don't get looped back to the VM over the other datapath. When a VF
> is plugged, the virtio datapath link state can be marked as down. The
> hypervisor needs to unplug the VF device from the guest on the source host
> and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed,
> the destination hypervisor sets the MAC filter on the VF and plugs it back
> to the guest to switch over to VF datapath.
>
> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
> created that acts as a master device and tracks the state of the 2 lower
> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> passthru device with the same MAC is registered as 'active' netdev.
>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> ---
>  drivers/net/virtio_net.c | 683 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 682 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index bcd13fe906ca..f2860d86c952 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -30,6 +30,8 @@
>  #include <linux/cpu.h>
>  #include <linux/average.h>
>  #include <linux/filter.h>
> +#include <linux/netdevice.h>
> +#include <linux/pci.h>
>  #include <net/route.h>
>  #include <net/xdp.h>
>
> @@ -206,6 +208,9 @@ struct virtnet_info {
>         u32 speed;
>
>         unsigned long guest_offloads;
> +
> +       /* upper netdev created when BACKUP feature enabled */
> +       struct net_device *bypass_netdev;
>  };
>
>  struct padded_vnet_hdr {
> @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>         }
>  }
>
> +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
> +                                     size_t len)
> +{
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       int ret;
> +
> +       if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> +               return -EOPNOTSUPP;
> +
> +       ret = snprintf(buf, len, "_bkup");
> +       if (ret >= len)
> +               return -EOPNOTSUPP;
> +
> +       return 0;
> +}
> +
>  static const struct net_device_ops virtnet_netdev = {
>         .ndo_open            = virtnet_open,
>         .ndo_stop            = virtnet_close,
> @@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev = {
>         .ndo_xdp_xmit           = virtnet_xdp_xmit,
>         .ndo_xdp_flush          = virtnet_xdp_flush,
>         .ndo_features_check     = passthru_features_check,
> +       .ndo_get_phys_port_name = virtnet_get_phys_port_name,
>  };
>
>  static void virtnet_config_changed_work(struct work_struct *work)
> @@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device *vdev)
>         return 0;
>  }
>
> +/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
> + * When BACKUP feature is enabled, an additional netdev(bypass netdev)
> + * is created that acts as a master device and tracks the state of the
> + * 2 lower netdevs. The original virtio_net netdev is registered as
> + * 'backup' netdev and a passthru device with the same MAC is registered
> + * as 'active' netdev.
> + */
> +
> +/* bypass state maintained when BACKUP feature is enabled */
> +struct virtnet_bypass_info {
> +       /* passthru netdev with same MAC */
> +       struct net_device __rcu *active_netdev;
> +
> +       /* virtio_net netdev */
> +       struct net_device __rcu *backup_netdev;
> +
> +       /* active netdev stats */
> +       struct rtnl_link_stats64 active_stats;
> +
> +       /* backup netdev stats */
> +       struct rtnl_link_stats64 backup_stats;
> +
> +       /* aggregated stats */
> +       struct rtnl_link_stats64 bypass_stats;
> +
> +       /* spinlock while updating stats */
> +       spinlock_t stats_lock;
> +};
> +
> +static void virtnet_bypass_child_open(struct net_device *dev,
> +                                     struct net_device *child_netdev)
> +{
> +       int err = dev_open(child_netdev);
> +
> +       if (err)
> +               netdev_warn(dev, "unable to open slave: %s: %d\n",
> +                           child_netdev->name, err);
> +}

Why ignoring the error?? i.e. virtnet_bypass_child_open should have
return value. I don't believe the caller is in a safe context to
guarantee the dev_open always succeeds.

> +
> +static int virtnet_bypass_open(struct net_device *dev)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       struct net_device *child_netdev;
> +
> +       netif_carrier_off(dev);
> +       netif_tx_wake_all_queues(dev);
> +
> +       child_netdev = rtnl_dereference(vbi->active_netdev);
> +       if (child_netdev)
> +               virtnet_bypass_child_open(dev, child_netdev);

Handle the error?

> +
> +       child_netdev = rtnl_dereference(vbi->backup_netdev);
> +       if (child_netdev)
> +               virtnet_bypass_child_open(dev, child_netdev);
Handle the error and unwind?

> +
> +       return 0;
> +}
> +
> +static int virtnet_bypass_close(struct net_device *dev)
> +{
> +       struct virtnet_bypass_info *vi = netdev_priv(dev);
> +       struct net_device *child_netdev;
> +
> +       netif_tx_disable(dev);
> +
> +       child_netdev = rtnl_dereference(vi->active_netdev);
> +       if (child_netdev)
> +               dev_close(child_netdev);
> +
> +       child_netdev = rtnl_dereference(vi->backup_netdev);
> +       if (child_netdev)
> +               dev_close(child_netdev);
> +
> +       return 0;
> +}
> +
> +static netdev_tx_t virtnet_bypass_drop_xmit(struct sk_buff *skb,
> +                                           struct net_device *dev)
> +{
> +       atomic_long_inc(&dev->tx_dropped);
> +       dev_kfree_skb_any(skb);
> +       return NETDEV_TX_OK;
> +}
> +
> +static bool virtnet_bypass_xmit_ready(struct net_device *dev)
> +{
> +       return netif_running(dev) && netif_carrier_ok(dev);
> +}
> +
> +static netdev_tx_t virtnet_bypass_start_xmit(struct sk_buff *skb,
> +                                            struct net_device *dev)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       struct net_device *xmit_dev;
> +
> +       /* Try xmit via active netdev followed by backup netdev */
> +       xmit_dev = rcu_dereference_bh(vbi->active_netdev);
> +       if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev)) {
> +               xmit_dev = rcu_dereference_bh(vbi->backup_netdev);
> +               if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
> +                       return virtnet_bypass_drop_xmit(skb, dev);
> +       }
> +
> +       skb->dev = xmit_dev;
> +       skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> +       return dev_queue_xmit(skb);
> +}
> +
> +static u16 virtnet_bypass_select_queue(struct net_device *dev,
> +                                      struct sk_buff *skb, void *accel_priv,
> +                                      select_queue_fallback_t fallback)
> +{
> +       /* This helper function exists to help dev_pick_tx get the correct
> +        * destination queue.  Using a helper function skips a call to
> +        * skb_tx_hash and will put the skbs in the queue we expect on their
> +        * way down to the bonding driver.
> +        */
> +       u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
> +
> +       /* Save the original txq to restore before passing to the driver */
> +       qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
> +
> +       if (unlikely(txq >= dev->real_num_tx_queues)) {
> +               do {
> +                       txq -= dev->real_num_tx_queues;
> +               } while (txq >= dev->real_num_tx_queues);
> +       }
> +
> +       return txq;
> +}
> +
> +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
> + * that some drivers can provide 32bit values only.
> + */
> +static void virtnet_bypass_fold_stats(struct rtnl_link_stats64 *_res,
> +                                     const struct rtnl_link_stats64 *_new,
> +                                     const struct rtnl_link_stats64 *_old)
> +{
> +       const u64 *new = (const u64 *)_new;
> +       const u64 *old = (const u64 *)_old;
> +       u64 *res = (u64 *)_res;
> +       int i;
> +
> +       for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
> +               u64 nv = new[i];
> +               u64 ov = old[i];
> +               s64 delta = nv - ov;
> +
> +               /* detects if this particular field is 32bit only */
> +               if (((nv | ov) >> 32) == 0)
> +                       delta = (s64)(s32)((u32)nv - (u32)ov);
> +
> +               /* filter anomalies, some drivers reset their stats
> +                * at down/up events.
> +                */
> +               if (delta > 0)
> +                       res[i] += delta;
> +       }
> +}
> +
> +static void virtnet_bypass_get_stats(struct net_device *dev,
> +                                    struct rtnl_link_stats64 *stats)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       const struct rtnl_link_stats64 *new;
> +       struct rtnl_link_stats64 temp;
> +       struct net_device *child_netdev;
> +
> +       spin_lock(&vbi->stats_lock);
> +       memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
> +
> +       rcu_read_lock();
> +
> +       child_netdev = rcu_dereference(vbi->active_netdev);
> +       if (child_netdev) {
> +               new = dev_get_stats(child_netdev, &temp);
> +               virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
> +               memcpy(&vbi->active_stats, new, sizeof(*new));
> +       }
> +
> +       child_netdev = rcu_dereference(vbi->backup_netdev);
> +       if (child_netdev) {
> +               new = dev_get_stats(child_netdev, &temp);
> +               virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
> +               memcpy(&vbi->backup_stats, new, sizeof(*new));
> +       }
> +
> +       rcu_read_unlock();
> +
> +       memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
> +       spin_unlock(&vbi->stats_lock);
> +}
> +
> +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       struct net_device *child_netdev;
> +       int ret = 0;
> +
> +       child_netdev = rcu_dereference(vbi->active_netdev);
> +       if (child_netdev) {
> +               ret = dev_set_mtu(child_netdev, new_mtu);
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       child_netdev = rcu_dereference(vbi->backup_netdev);
> +       if (child_netdev) {
> +               ret = dev_set_mtu(child_netdev, new_mtu);
> +               if (ret)
> +                       netdev_err(child_netdev,
> +                                  "Unexpected failure to set mtu to %d\n",
> +                                  new_mtu);
> +       }
> +
> +       dev->mtu = new_mtu;
> +       return 0;
> +}
> +
> +static void virtnet_bypass_set_rx_mode(struct net_device *dev)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       struct net_device *child_netdev;
> +
> +       rcu_read_lock();
> +
> +       child_netdev = rcu_dereference(vbi->active_netdev);
> +       if (child_netdev) {
> +               dev_uc_sync_multiple(child_netdev, dev);
> +               dev_mc_sync_multiple(child_netdev, dev);
> +       }
> +
> +       child_netdev = rcu_dereference(vbi->backup_netdev);
> +       if (child_netdev) {
> +               dev_uc_sync_multiple(child_netdev, dev);
> +               dev_mc_sync_multiple(child_netdev, dev);
> +       }
> +
> +       rcu_read_unlock();
> +}
> +
> +static const struct net_device_ops virtnet_bypass_netdev_ops = {
> +       .ndo_open               = virtnet_bypass_open,
> +       .ndo_stop               = virtnet_bypass_close,
> +       .ndo_start_xmit         = virtnet_bypass_start_xmit,
> +       .ndo_select_queue       = virtnet_bypass_select_queue,
> +       .ndo_get_stats64        = virtnet_bypass_get_stats,
> +       .ndo_change_mtu         = virtnet_bypass_change_mtu,
> +       .ndo_set_rx_mode        = virtnet_bypass_set_rx_mode,
> +       .ndo_validate_addr      = eth_validate_addr,
> +       .ndo_features_check     = passthru_features_check,
> +};
> +
> +static int
> +virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
> +                                         struct ethtool_link_ksettings *cmd)
> +{
> +       struct virtnet_bypass_info *vbi = netdev_priv(dev);
> +       struct net_device *child_netdev;
> +
> +       child_netdev = rtnl_dereference(vbi->active_netdev);
> +       if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
> +               child_netdev = rtnl_dereference(vbi->backup_netdev);
> +               if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
> +                       cmd->base.duplex = DUPLEX_UNKNOWN;
> +                       cmd->base.port = PORT_OTHER;
> +                       cmd->base.speed = SPEED_UNKNOWN;
> +
> +                       return 0;
> +               }
> +       }
> +
> +       return __ethtool_get_link_ksettings(child_netdev, cmd);
> +}
> +
> +#define BYPASS_DRV_NAME "virtnet_bypass"
> +#define BYPASS_DRV_VERSION "0.1"
> +
> +static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
> +                                              struct ethtool_drvinfo *drvinfo)
> +{
> +       strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
> +       strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
> +}
> +
> +static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
> +       .get_drvinfo            = virtnet_bypass_ethtool_get_drvinfo,
> +       .get_link               = ethtool_op_get_link,
> +       .get_link_ksettings     = virtnet_bypass_ethtool_get_link_ksettings,
> +};
> +
> +static struct net_device *get_virtnet_bypass_bymac(struct net *net,
> +                                                  const u8 *mac)
> +{
> +       struct net_device *dev;
> +
> +       ASSERT_RTNL();
> +
> +       for_each_netdev(net, dev) {
> +               if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> +                       continue;       /* not a virtnet_bypass device */
> +
> +               if (ether_addr_equal(mac, dev->perm_addr))
> +                       return dev;
> +       }
> +
> +       return NULL;
> +}
> +
> +static struct net_device *
> +get_virtnet_bypass_byref(struct net_device *child_netdev)
> +{
> +       struct net *net = dev_net(child_netdev);
> +       struct net_device *dev;
> +
> +       ASSERT_RTNL();
> +
> +       for_each_netdev(net, dev) {
> +               struct virtnet_bypass_info *vbi;
> +
> +               if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> +                       continue;       /* not a virtnet_bypass device */
> +
> +               vbi = netdev_priv(dev);
> +
> +               if ((rtnl_dereference(vbi->active_netdev) == child_netdev) ||
> +                   (rtnl_dereference(vbi->backup_netdev) == child_netdev))
> +                       return dev;     /* a match */
> +       }
> +
> +       return NULL;
> +}
> +
> +/* Called when child dev is injecting data into network stack.
> + * Change the associated network device from lower dev to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
> +{
> +       struct sk_buff *skb = *pskb;
> +       struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +
> +       skb->dev = ndev;
> +
> +       return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_bypass_register_child(struct net_device *child_netdev)
> +{
> +       struct virtnet_bypass_info *vbi;
> +       struct net_device *dev;
> +       bool backup;
> +       int ret;
> +
> +       if (child_netdev->addr_len != ETH_ALEN)
> +               return NOTIFY_DONE;
> +
> +       /* We will use the MAC address to locate the virtnet_bypass netdev
> +        * to associate with the child netdev. If we don't find a matching
> +        * bypass netdev, move on.
> +        */
> +       dev = get_virtnet_bypass_bymac(dev_net(child_netdev),
> +                                      child_netdev->perm_addr);
> +       if (!dev)
> +               return NOTIFY_DONE;
> +
> +       vbi = netdev_priv(dev);
> +       backup = (child_netdev->dev.parent == dev->dev.parent);
> +       if (backup ? rtnl_dereference(vbi->backup_netdev) :
> +                       rtnl_dereference(vbi->active_netdev)) {
> +               netdev_info(dev,
> +                           "%s attempting to join bypass dev when %s already present\n",
> +                           child_netdev->name, backup ? "backup" : "active");
> +               return NOTIFY_DONE;
> +       }
> +
> +       /* Avoid non pci devices as active netdev */
> +       if (!backup && (!child_netdev->dev.parent ||
> +                       !dev_is_pci(child_netdev->dev.parent)))
> +               return NOTIFY_DONE;
> +

There's a problem here in terms of error (particularly on the active
slave, e.g. VF), see below:

> +       ret = netdev_rx_handler_register(child_netdev,
> +                                        virtnet_bypass_handle_frame, dev);
> +       if (ret != 0) {
> +               netdev_err(child_netdev,
> +                          "can not register bypass receive handler (err = %d)\n",
> +                          ret);
> +               goto rx_handler_failed;
> +       }
> +
> +       ret = netdev_upper_dev_link(child_netdev, dev, NULL);
> +       if (ret != 0) {
> +               netdev_err(child_netdev,
> +                          "can not set master device %s (err = %d)\n",
> +                          dev->name, ret);
> +               goto upper_link_failed;
> +       }
> +
> +       child_netdev->flags |= IFF_SLAVE;
> +
> +       if (netif_running(dev)) {
> +               ret = dev_open(child_netdev);
> +               if (ret && (ret != -EBUSY)) {
> +                       netdev_err(dev, "Opening child %s failed ret:%d\n",
> +                                  child_netdev->name, ret);
> +                       goto err_interface_up;
> +               }
> +       }
> +
> +       /* Align MTU of child with master */
> +       ret = dev_set_mtu(child_netdev, dev->mtu);
> +       if (ret) {
> +               netdev_err(dev,
> +                          "unable to change mtu of %s to %u register failed\n",
> +                          child_netdev->name, dev->mtu);
> +               goto err_set_mtu;
> +       }

If any of the above calls returns non-zero, the current code steps
back to undo what's being done on that spefic slave previously. For
instance, if the netdev_rx_handler_register returns non-zero because
of busy rx handler, this register_child function would give up
enslaving the VF and leave the upper virtio_bypass interface behind
once it returns. I am not sure if it's a good idea to leave the
virtio_bypass around if running into failure: the guest is not
migratable as the VF doesn't have a backup path, And perhaps the worse
part is that, it now has two interfaces with identical MAC address but
one of them is invalid (user cannot use the virtio interface as it has
a dampened datapath). IMHO the virtio_bypass and its lower netdev
should be destroyed at all when it fails to bind the VF, and
technically, there should be some way to propogate the failure status
to the hypervisor/backend, indicating that the VM is not migratable
because of guest software errors (maybe by clearing out the backup
feature from the guest virtio driver so host can see/learn it).

Thanks,
-Siwei

> +
> +       call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
> +
> +       netdev_info(dev, "registering %s\n", child_netdev->name);
> +
> +       dev_hold(child_netdev);
> +       if (backup) {
> +               rcu_assign_pointer(vbi->backup_netdev, child_netdev);
> +               dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
> +       } else {
> +               rcu_assign_pointer(vbi->active_netdev, child_netdev);
> +               dev_get_stats(vbi->active_netdev, &vbi->active_stats);
> +               dev->min_mtu = child_netdev->min_mtu;
> +               dev->max_mtu = child_netdev->max_mtu;
> +       }
> +
> +       return NOTIFY_OK;
> +
> +err_set_mtu:
> +       dev_close(child_netdev);
> +err_interface_up:
> +       netdev_upper_dev_unlink(child_netdev, dev);
> +       child_netdev->flags &= ~IFF_SLAVE;
> +upper_link_failed:
> +       netdev_rx_handler_unregister(child_netdev);
> +rx_handler_failed:
> +       return NOTIFY_DONE;
> +}
> +
> +static int virtnet_bypass_unregister_child(struct net_device *child_netdev)
> +{
> +       struct virtnet_bypass_info *vbi;
> +       struct net_device *dev, *backup;
> +
> +       dev = get_virtnet_bypass_byref(child_netdev);
> +       if (!dev)
> +               return NOTIFY_DONE;
> +
> +       vbi = netdev_priv(dev);
> +
> +       netdev_info(dev, "unregistering %s\n", child_netdev->name);
> +
> +       netdev_rx_handler_unregister(child_netdev);
> +       netdev_upper_dev_unlink(child_netdev, dev);
> +       child_netdev->flags &= ~IFF_SLAVE;
> +
> +       if (child_netdev->dev.parent == dev->dev.parent) {
> +               RCU_INIT_POINTER(vbi->backup_netdev, NULL);
> +       } else {
> +               RCU_INIT_POINTER(vbi->active_netdev, NULL);
> +               backup = rtnl_dereference(vbi->backup_netdev);
> +               if (backup) {
> +                       dev->min_mtu = backup->min_mtu;
> +                       dev->max_mtu = backup->max_mtu;
> +               }
> +       }
> +
> +       dev_put(child_netdev);
> +
> +       return NOTIFY_OK;
> +}
> +
> +static int virtnet_bypass_update_link(struct net_device *child_netdev)
> +{
> +       struct net_device *dev, *active, *backup;
> +       struct virtnet_bypass_info *vbi;
> +
> +       dev = get_virtnet_bypass_byref(child_netdev);
> +       if (!dev || !netif_running(dev))
> +               return NOTIFY_DONE;
> +
> +       vbi = netdev_priv(dev);
> +
> +       active = rtnl_dereference(vbi->active_netdev);
> +       backup = rtnl_dereference(vbi->backup_netdev);
> +
> +       if ((active && virtnet_bypass_xmit_ready(active)) ||
> +           (backup && virtnet_bypass_xmit_ready(backup))) {
> +               netif_carrier_on(dev);
> +               netif_tx_wake_all_queues(dev);
> +       } else {
> +               netif_carrier_off(dev);
> +               netif_tx_stop_all_queues(dev);
> +       }
> +
> +       return NOTIFY_OK;
> +}
> +
> +static int virtnet_bypass_event(struct notifier_block *this,
> +                               unsigned long event, void *ptr)
> +{
> +       struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +       /* Skip our own events */
> +       if (event_dev->netdev_ops == &virtnet_bypass_netdev_ops)
> +               return NOTIFY_DONE;
> +
> +       /* Avoid non-Ethernet type devices */
> +       if (event_dev->type != ARPHRD_ETHER)
> +               return NOTIFY_DONE;
> +
> +       /* Avoid Vlan dev with same MAC registering as child dev */
> +       if (is_vlan_dev(event_dev))
> +               return NOTIFY_DONE;
> +
> +       /* Avoid Bonding master dev with same MAC registering as child dev */
> +       if ((event_dev->priv_flags & IFF_BONDING) &&
> +           (event_dev->flags & IFF_MASTER))
> +               return NOTIFY_DONE;
> +
> +       switch (event) {
> +       case NETDEV_REGISTER:
> +               return virtnet_bypass_register_child(event_dev);
> +       case NETDEV_UNREGISTER:
> +               return virtnet_bypass_unregister_child(event_dev);
> +       case NETDEV_UP:
> +       case NETDEV_DOWN:
> +       case NETDEV_CHANGE:
> +               return virtnet_bypass_update_link(event_dev);
> +       default:
> +               return NOTIFY_DONE;
> +       }
> +}
> +
> +static struct notifier_block virtnet_bypass_notifier = {
> +       .notifier_call = virtnet_bypass_event,
> +};
> +
> +static int virtnet_bypass_create(struct virtnet_info *vi)
> +{
> +       struct net_device *backup_netdev = vi->dev;
> +       struct device *dev = &vi->vdev->dev;
> +       struct net_device *bypass_netdev;
> +       int res;
> +
> +       /* Alloc at least 2 queues, for now we are going with 16 assuming
> +        * that most devices being bonded won't have too many queues.
> +        */
> +       bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
> +                                         16);
> +       if (!bypass_netdev) {
> +               dev_err(dev, "Unable to allocate bypass_netdev!\n");
> +               return -ENOMEM;
> +       }
> +
> +       dev_net_set(bypass_netdev, dev_net(backup_netdev));
> +       SET_NETDEV_DEV(bypass_netdev, dev);
> +
> +       bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
> +       bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
> +
> +       /* Initialize the device options */
> +       bypass_netdev->flags |= IFF_MASTER;
> +       bypass_netdev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT |
> +                                    IFF_NO_QUEUE;
> +       bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
> +                                      IFF_TX_SKB_SHARING);
> +
> +       /* don't acquire bypass netdev's netif_tx_lock when transmitting */
> +       bypass_netdev->features |= NETIF_F_LLTX;
> +
> +       /* Don't allow bypass devices to change network namespaces. */
> +       bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
> +
> +       bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
> +                                    NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
> +                                    NETIF_F_HIGHDMA | NETIF_F_LRO;
> +
> +       bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
> +       bypass_netdev->features |= bypass_netdev->hw_features;
> +
> +       /* For now treat bypass netdev as VLAN challenged since we
> +        * cannot assume VLAN functionality with a VF
> +        */
> +       bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
> +
> +       memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
> +              bypass_netdev->addr_len);
> +
> +       bypass_netdev->min_mtu = backup_netdev->min_mtu;
> +       bypass_netdev->max_mtu = backup_netdev->max_mtu;
> +
> +       res = register_netdev(bypass_netdev);
> +       if (res < 0) {
> +               dev_err(dev, "Unable to register bypass_netdev!\n");
> +               free_netdev(bypass_netdev);
> +               return res;
> +       }
> +
> +       netif_carrier_off(bypass_netdev);
> +
> +       vi->bypass_netdev = bypass_netdev;
> +
> +       return 0;
> +}
> +
> +static void virtnet_bypass_destroy(struct virtnet_info *vi)
> +{
> +       struct net_device *bypass_netdev = vi->bypass_netdev;
> +       struct virtnet_bypass_info *vbi;
> +       struct net_device *child_netdev;
> +
> +       /* no device found, nothing to free */
> +       if (!bypass_netdev)
> +               return;
> +
> +       vbi = netdev_priv(bypass_netdev);
> +
> +       netif_device_detach(bypass_netdev);
> +
> +       rtnl_lock();
> +
> +       child_netdev = rtnl_dereference(vbi->active_netdev);
> +       if (child_netdev)
> +               virtnet_bypass_unregister_child(child_netdev);
> +
> +       child_netdev = rtnl_dereference(vbi->backup_netdev);
> +       if (child_netdev)
> +               virtnet_bypass_unregister_child(child_netdev);
> +
> +       unregister_netdevice(bypass_netdev);
> +
> +       rtnl_unlock();
> +
> +       free_netdev(bypass_netdev);
> +}
> +
> +/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>         int i, err = -ENOMEM;
> @@ -2797,10 +3466,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>
>         virtnet_init_settings(dev);
>
> +       if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
> +               if (virtnet_bypass_create(vi) != 0)
> +                       goto free_vqs;
> +       }
> +
>         err = register_netdev(dev);
>         if (err) {
>                 pr_debug("virtio_net: registering device failed\n");
> -               goto free_vqs;
> +               goto free_bypass;
>         }
>
>         virtio_device_ready(vdev);
> @@ -2837,6 +3511,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>         vi->vdev->config->reset(vdev);
>
>         unregister_netdev(dev);
> +free_bypass:
> +       virtnet_bypass_destroy(vi);
>  free_vqs:
>         cancel_delayed_work_sync(&vi->refill);
>         free_receive_page_frags(vi);
> @@ -2871,6 +3547,8 @@ static void virtnet_remove(struct virtio_device *vdev)
>
>         unregister_netdev(vi->dev);
>
> +       virtnet_bypass_destroy(vi);
> +
>         remove_vq_common(vi);
>
>         free_netdev(vi->dev);
> @@ -2968,6 +3646,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>         if (ret)
>                 goto err_virtio;
> +
> +       register_netdevice_notifier(&virtnet_bypass_notifier);
>         return 0;
>  err_virtio:
>         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2980,6 +3660,7 @@ module_init(virtio_net_driver_init);
>
>  static __exit void virtio_net_driver_exit(void)
>  {
> +       unregister_netdevice_notifier(&virtnet_bypass_notifier);
>         unregister_virtio_driver(&virtio_net_driver);
>         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>         cpuhp_remove_multi_state(virtionet_online);
> --
> 2.14.3
>

^ permalink raw reply

* Re: [PATCH bpf-next v4 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
From: Song Liu @ 2018-03-12 22:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev@vger.kernel.org, ast@kernel.org, Peter Zijlstra,
	Daniel Borkmann, Kernel Team, hannes@cmpxchg.org, Teng Qin
In-Reply-To: <a7ac3a74-959f-af35-81ce-a8c05be29b68@fb.com>



> On Mar 12, 2018, at 2:31 PM, Alexei Starovoitov <ast@fb.com> wrote:
> 
> On 3/12/18 2:12 PM, Song Liu wrote:
>> 
>>> On Mar 12, 2018, at 2:00 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>> 
>>> On 3/12/18 1:39 PM, Song Liu wrote:
>>>> +	page = find_get_page(vma->vm_file->f_mapping, 0);
>>> 
>>> did you test it with config_debug_atomic_sleep ?
>>> it should have complained...
>> 
>> Yeah, I have CONFIG_DEBUG_ATOMIC_SLEEP=y.
>> 
>> I think find_get_page() will not sleep. The variation find_get_page_flags()
>> may sleep with flag FGP_CREAT.
> 
> I see. gfp_mask == 0 and no locks. should work indeed.
> curious how perf report looks like for heavy bpf_get_stackid() usage?

I modified samples/bpf/sampleip to only call bpf_get_stackid(). The following 
is captured with bpf_get_stackid() called at 10k Hz. stressapptest is running
with 16 threads on a system with 56 cores. 


Samples: 1M of event 'cycles:pp', Event count (approx.): 628092326243
  Overhead  Command          Shared Object                 Symbol
+   51.61%  stressapptest    stressapptest                 [.] AdlerMemcpyC             
-   20.82%  stressapptest    [kernel.vmlinux]              [k] queued_spin_lock_slowpath
   - queued_spin_lock_slowpath                                                          
      - 20.80% pcpu_freelist_pop                                                        
           bpf_get_stackid                                                              
           bpf_get_stackid_tp                                                           
         - 0x590c                                                                       
              16.12% AdlerMemcpyC                                                       
              4.50% OsLayer::CpuStressWorkload                                          
+   14.36%  stressapptest    stressapptest                 [.] OsLayer::CpuStressWorkload
-    8.74%  stressapptest    [kernel.vmlinux]              [k] _raw_spin_lock            
   - _raw_spin_lock                                                                      
      - 8.73% bpf_get_stackid                                                            
           bpf_get_stackid_tp                                                            
         + 0x590c                                                                        
-    0.67%  stressapptest    [kernel.vmlinux]              [k] pcpu_freelist_pop         
   - pcpu_freelist_pop                                                                   
      - 0.67% bpf_get_stackid                                                            
           bpf_get_stackid_tp                                                            
         + 0x590c

Seems lock contention is the dominating overhead here. This should be the same
for original stackmap. 

Song

^ permalink raw reply

* Re: [PATCH net-next 1/1] net sched actions: return explicit error when tunnel_key mode is not specified
From: Cong Wang @ 2018-03-12 22:55 UTC (permalink / raw)
  To: Roman Mashak
  Cc: David Miller, Linux Kernel Network Developers, Jamal Hadi Salim,
	Amir Vadai, Jiri Pirko
In-Reply-To: <1520886058-18337-1-git-send-email-mrv@mojatatu.com>

On Mon, Mar 12, 2018 at 1:20 PM, Roman Mashak <mrv@mojatatu.com> wrote:
> If set/unset mode of the tunnel_key action is not provided, ->init() still
> returns 0, and the caller proceeds with bogus 'struct tc_action *' object,
> this results in crash:

...

> Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key")
> Signed-off-by: Roman Mashak <mrv@mojatatu.com>

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

This should go to -net rather than -net-next.

Thanks.

^ permalink raw reply

* Re: [PATCH v3] kernel.h: Skip single-eval logic on literals in min()/max()
From: Andrew Morton @ 2018-03-12 22:55 UTC (permalink / raw)
  To: Kees Cook
  Cc: Linus Torvalds, Linux Kernel Mailing List, Josh Poimboeuf,
	Rasmus Villemoes, Gustavo A. R. Silva, Tobin C. Harding,
	Steven Rostedt, Jonathan Corbet, Chris Mason, Josef Bacik,
	David Sterba, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Ingo Molnar, Peter Zijlstra, Thomas Gleixner,
	Masahiro Yamada, Borislav Petkov, Randy Dunlap <
In-Reply-To: <CAGXu5j+D=zs8AMb969dhH05GmKPuBxKPcAs89KfKzVf47faL1g@mail.gmail.com>

On Fri, 9 Mar 2018 17:30:15 -0800 Kees Cook <keescook@chromium.org> wrote:

> > It's one reason why I wondered if simplifying the expression to have
> > just that single __builtin_constant_p() might not end up working..
> 
> Yeah, it seems like it doesn't bail out as "false" for complex
> expressions given to __builtin_constant_p().
> 
> If no magic solution, then which of these?
> 
> - go back to my original "multi-eval max only for constants" macro (meh)
> - add gcc version checks around this and similarly for -Wvla in the future (eww)
> - raise gcc version (yikes)

Replacing the __builtin_choose_expr() with ?: works of course.  What
will be the runtime effects?

I tried replacing

	__builtin_choose_expr(__builtin_constant_p(x) &&
                              __builtin_constant_p(y),

with

	__builtin_choose_expr(__builtin_constant_p(x + y),

but no success.

I'm not sure what else to try to trick gcc into working.

--- a/include/linux/kernel.h~kernelh-skip-single-eval-logic-on-literals-in-min-max-v3-fix
+++ a/include/linux/kernel.h
@@ -804,13 +804,10 @@ static inline void ftrace_dump(enum ftra
  * accidental VLA.
  */
 #define __min(t1, t2, x, y)						\
-	__builtin_choose_expr(__builtin_constant_p(x) &&		\
-			      __builtin_constant_p(y),			\
-			      (t1)(x) < (t2)(y) ? (t1)(x) : (t2)(y),	\
-			      __single_eval_min(t1, t2,			\
-						__UNIQUE_ID(min1_),	\
-						__UNIQUE_ID(min2_),	\
-						x, y))
+	((__builtin_constant_p(x) && __builtin_constant_p(y)) ?		\
+		((t1)(x) < (t2)(y) ? (t1)(x) : (t2)(y)) :		\
+		(__single_eval_min(t1, t2, __UNIQUE_ID(min1_),		\
+				  __UNIQUE_ID(min2_), x, y)))
 
 /**
  * min - return minimum of two values of the same or compatible types
@@ -826,13 +823,11 @@ static inline void ftrace_dump(enum ftra
 	max1 > max2 ? max1 : max2; })
 
 #define __max(t1, t2, x, y)						\
-	__builtin_choose_expr(__builtin_constant_p(x) &&		\
-			      __builtin_constant_p(y),			\
-			      (t1)(x) > (t2)(y) ? (t1)(x) : (t2)(y),	\
-			      __single_eval_max(t1, t2,			\
-						__UNIQUE_ID(max1_),	\
-						__UNIQUE_ID(max2_),	\
-						x, y))
+	((__builtin_constant_p(x) && __builtin_constant_p(y)) ?		\
+		((t1)(x) > (t2)(y) ? (t1)(x) : (t2)(y)) :		\
+		(__single_eval_max(t1, t2, __UNIQUE_ID(max1_),		\
+				  __UNIQUE_ID(max2_), x, y)))
+
 /**
  * max - return maximum of two values of the same or compatible types
  * @x: first value
_

^ permalink raw reply

* Re: [RFC PATCH net-next 3/5] bridge: allow switchdev port to handle flooding by itself
From: Igor Mitsyanko @ 2018-03-12 23:00 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: ivecera, jiri, netdev, bridge, sergey.matyukevich.os, ashevchenko,
	smaksimenko, dlebed
In-Reply-To: <20180310165540.GH29174@lunn.ch>

On 03/10/2018 08:55 AM, Andrew Lunn wrote:
> Is this sufficiently granular? There are a few different use cases for
> flooding:
> 
> There is no fdb entry in the software switch for the destination MAC
> address, so flood the packet out all ports of the bridge. The hardware
> switch might have an entry in its fdb to the destination switch, so it
> could unicast out the correct hardware port. If not, it should flood
> the packet.
> 
> A point to remember here, the software switch and the hardware switch
> can have different forwarding data bases.
> 
> A broadcast packet. Send it out all ports.
> 
> A multicast packet. If the hardware switch is capable of IGMP
> snooping, it could have FDB entries indicating which ports it should
> send the frame out of, and which is should not. Otherwise it needs to
> flood.
> 
> Is one flag sufficient for all of these, and any other use cases i
> might of missed?
> 
> As far as DSA switches go, i don't know of any of them which could
> implement anything like this, so BR_FLOOD_OFFLOAD will never be
> set. But maybe some of the TOR switches supported by switchdev can do
> some of these, and not others....
> 
>       Andrew
> 


The flag was introduced to enable hardware switch capabilities of 
drivers/net/wireless/quantenna/qtnfmac wifi driver. It does not have any 
switchdev functionality in upstream tree at this moment, and this 
patchset was intended as a preparatory change.

qtnfmac driver provides several physical radios (5 GHz and 2.4 GHz), 
each can have up to 8 virtual network interfaces. These interfaces can 
be bridged together in various configurations, and I'm trying to figure 
out what is the most efficient way to handle it from bridging perspective.
My assumption was that software FDB and hardware FDB should always be in 
sync with each other. I guess it is a safe assumption if handled 
correctly? Hardware should send a notification for each new FDB it has 
learned, and switchdev driver should process FDB notifications from 
software bridge.

qtnfmac hardware has its own memory and maintains FWT table, so for the 
best efficiency forwarding between virtual interfaces should be handled 
locally. Qtnfmac can handle all the mentioned flooding by itself:
- unknown unicasts
- broadcast and unknown multicast
- known multicasts (does have IGMP snooping)
- can do multicast-to-unicast translation if required.

The most important usecase IMO is a muticast transmission, specific 
example being:
- 2.4GHz x 8 and 5GHz x 8 virtual wifi interfaces, bridged with backbone 
ethernet interface in Linux
- multicast video streaming from a server behind ethernet
- multicast clients connected to some wifi interfaces

Best way to process this should be to handle multicasting locally in 
wifi firmware:
- SW bridge in Linux will send a multicast frame to a single virtual 
WiFi interface.
- WiFi firmware will forward/flood frames to all intended recipients 
locally.

BR_FLOOD_OFFLOAD flag is intended to address this case in particular, 
perhaps there are better ways to do that?
In a broader sense it is a way for hardware to tell that it will handle 
all flooding by itself, so there is no granularity in this. I'm not sure 
granularity is needed though, as there may not be much sense to do only 
some types of flooding and not to do others?

^ permalink raw reply

* [PATCH net] net: dsa: Fix dsa_is_user_port() test inversion
From: Florian Fainelli @ 2018-03-12 23:00 UTC (permalink / raw)
  To: netdev
  Cc: yxj790222, Florian Fainelli, Andrew Lunn, Vivien Didelot,
	David S. Miller, open list

During the conversion to dsa_is_user_port(), a condition ended up being
reversed, which would prevent the creation of any user port when using
the legacy binding and/or platform data, fix that.

Fixes: 4a5b85ffe2a0 ("net: dsa: use dsa_is_user_port everywhere")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 net/dsa/legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index cb54b81d0bd9..42a7b85b84e1 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -194,7 +194,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds,
 		ds->ports[i].dn = cd->port_dn[i];
 		ds->ports[i].cpu_dp = dst->cpu_dp;
 
-		if (dsa_is_user_port(ds, i))
+		if (!dsa_is_user_port(ds, i))
 			continue;
 
 		ret = dsa_slave_create(&ds->ports[i]);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 0/5] Switchdev: flooding offload option
From: Igor Mitsyanko @ 2018-03-12 23:08 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: ivecera, jiri, netdev, bridge, sergey.matyukevich.os, ashevchenko,
	smaksimenko, dlebed
In-Reply-To: <20180310220850.GI29174@lunn.ch>

On 03/10/2018 02:08 PM, Andrew Lunn wrote:
> 
> Hi Igor
> 
> You don't appear to be adding any user of this. Please also make one
> of the switchdev drivers actually use this new functionality.
> 
>     Andrew
> 

Hi Andrew, a first user is supposed to be 
drivers/net/wireless/quantenna/qtnfmac wifi driver. I will merge the 
patchset with another one that modifies qtnfmac driver (so that it has 
at least one user) and resend.

However RFC portion is still under discussion, would be good to 
understand if this approach is at all acceptable.

^ permalink raw reply

* [RFC PATCH v0 0/2] skbuff: Fix applications not being woken for errors
From: Vinicius Costa Gomes @ 2018-03-12 23:10 UTC (permalink / raw)
  To: netdev; +Cc: Vinicius Costa Gomes, randy.e.witt, davem

Hi,

This is actually a "bug report"-RFC instead of the more usual "new
feature"-RFC.

We are developing an application that uses TX hardware timestamping to
make some measurements, and during development Randy Witt initially
reported that the application poll() never unblocked when TX hardware
timestamping was enabled.

After some investigation, it turned out the problem wasn't only
exclusive to hardware timestamping, and could be reproduced with
software timestamping.

Applying patch (1), and running txtimestamp like this, for example:

$ ./txtimestamp -u -4 192.168.1.71 -c 1000 -D -l 1000 -F

('-u' to use UDP only, '-4' for ipv4 only, '-c 1000' to send 1000
packets for each test, '-D' to remove the delay between packets, '-l
1000' to set the payload to 1000 bytes, '-F' for configuring poll() to
wait forever)

will cause the application to become stuck in the poll() call in most
of the times. (Note: I couldn't reproduce the issue running against an
address that is routed through loopback.)

Another interesting fact is that if the POLLIN event is added to the
poll() .events, poll() no longer becomes stuck, and more interestingly
the returned event in .revents is only POLLERR.

After a few debugging sessions, we got to 'sock_queue_err_skb()' and
how it notifies applications of the error just enqueued. Changing it
to use 'sk->sk_error_report()', fixes the issue for hardware and
software timestamping. That is patch (2).

The "solution" proposed in patch (2) looks like too big a hammer, if
it's not, then it seems that this problem existed since a long time
ago (pre git) and was uncommon for folks to reach the necessary
conditions to trigger it (my hypothesis is that only triggers when the
error is reported from a different task context than the application).

Am I missing something here?


Cheers,
--


Vinicius Costa Gomes (2):
  selftests/txtimestamp: Add more configurable parameters
  skbuff: Notify errors with sk_error_report()

 net/core/skbuff.c                                   |  2 +-
 .../selftests/networking/timestamping/txtimestamp.c | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

--
2.16.2

^ permalink raw reply

* [RFC PATCH v0 1/2] selftests/txtimestamp: Add more configurable parameters
From: Vinicius Costa Gomes @ 2018-03-12 23:10 UTC (permalink / raw)
  To: netdev; +Cc: Vinicius Costa Gomes, randy.e.witt, davem
In-Reply-To: <20180312231052.13961-1-vinicius.gomes@intel.com>

Add a way to configure if poll() should wait forever for an event, the
number of packets that should be sent for each and if there should be
any delay between packets.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 .../selftests/networking/timestamping/txtimestamp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.c b/tools/testing/selftests/networking/timestamping/txtimestamp.c
index 5df07047ca86..5190b1dd78b1 100644
--- a/tools/testing/selftests/networking/timestamping/txtimestamp.c
+++ b/tools/testing/selftests/networking/timestamping/txtimestamp.c
@@ -68,9 +68,11 @@ static int cfg_num_pkts = 4;
 static int do_ipv4 = 1;
 static int do_ipv6 = 1;
 static int cfg_payload_len = 10;
+static int cfg_poll_timeout = 100;
 static bool cfg_show_payload;
 static bool cfg_do_pktinfo;
 static bool cfg_loop_nodata;
+static bool cfg_no_delay;
 static uint16_t dest_port = 9000;
 
 static struct sockaddr_in daddr;
@@ -171,7 +173,7 @@ static void __poll(int fd)
 
 	memset(&pollfd, 0, sizeof(pollfd));
 	pollfd.fd = fd;
-	ret = poll(&pollfd, 1, 100);
+	ret = poll(&pollfd, 1, cfg_poll_timeout);
 	if (ret != 1)
 		error(1, errno, "poll");
 }
@@ -371,7 +373,8 @@ static void do_test(int family, unsigned int opt)
 			error(1, errno, "send");
 
 		/* wait for all errors to be queued, else ACKs arrive OOO */
-		usleep(50 * 1000);
+		if (!cfg_no_delay)
+			usleep(50 * 1000);
 
 		__poll(fd);
 
@@ -397,6 +400,9 @@ static void __attribute__((noreturn)) usage(const char *filepath)
 			"  -n:   set no-payload option\n"
 			"  -r:   use raw\n"
 			"  -R:   use raw (IP_HDRINCL)\n"
+			"  -D:   no delay between packets\n"
+			"  -F:   poll() waits forever for an event\n"
+			"  -c N: number of packets for each test\n"
 			"  -p N: connect to port N\n"
 			"  -u:   use udp\n"
 			"  -x:   show payload (up to 70 bytes)\n",
@@ -409,7 +415,7 @@ static void parse_opt(int argc, char **argv)
 	int proto_count = 0;
 	char c;
 
-	while ((c = getopt(argc, argv, "46hIl:np:rRux")) != -1) {
+	while ((c = getopt(argc, argv, "46hIl:np:rRuxc:DF")) != -1) {
 		switch (c) {
 		case '4':
 			do_ipv6 = 0;
@@ -447,6 +453,15 @@ static void parse_opt(int argc, char **argv)
 		case 'x':
 			cfg_show_payload = true;
 			break;
+		case 'c':
+			cfg_num_pkts = strtoul(optarg, NULL, 10);
+			break;
+		case 'D':
+			cfg_no_delay = true;
+			break;
+		case 'F':
+			cfg_poll_timeout = -1;
+			break;
 		case 'h':
 		default:
 			usage(argv[0]);
-- 
2.16.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox