Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 2/3] lan78xx: Read LED states from Device Tree
From: Phil Elwell @ 2018-04-18 15:45 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
  Cc: Phil Elwell
In-Reply-To: <1524066323-109628-1-git-send-email-phil@raspberrypi.org>

Add support for DT property "microchip,led-modes", a vector of zero
to four cells (u32s) in the range 0-15, each of which sets the mode
for one of the LEDs. Some possible values are:

    0=link/activity          1=link1000/activity
    2=link100/activity       3=link10/activity
    4=link100/1000/activity  5=link10/1000/activity
    6=link10/100/activity    14=off    15=on

These values are given symbolic constants in a dt-bindings header.

Also use the presence of the DT property to indicate that the
LEDs should be enabled - necessary in the event that no valid OTP
or EEPROM is available.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 MAINTAINERS                              |  1 +
 drivers/net/usb/lan78xx.c                | 35 ++++++++++++++++++++++++++++++++
 include/dt-bindings/net/microchip-78xx.h | 21 +++++++++++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 include/dt-bindings/net/microchip-78xx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b60179d..9c9bc63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14573,6 +14573,7 @@ M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/usb/lan78xx.*
+F:	include/dt-bindings/net/microchip-78xx.h
 
 USB MASS STORAGE DRIVER
 M:	Alan Stern <stern@rowland.harvard.edu>
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index a823f01..f47ffea 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -38,6 +38,7 @@
 #include <linux/microchipphy.h>
 #include <linux/phy.h>
 #include <linux/of_net.h>
+#include <dt-bindings/net/microchip-78xx.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
@@ -74,6 +75,9 @@
 #define LAN78XX_EEPROM_MAGIC		(0x78A5)
 #define LAN78XX_OTP_MAGIC		(0x78F3)
 
+/* This register is specific to the LAN7800 and LAN7850 embedded PHYs */
+#define LAN78XX_PHY_LED_MODE_SELECT	29
+
 #define	MII_READ			1
 #define	MII_WRITE			0
 
@@ -2005,6 +2009,8 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 {
 	int ret;
 	u32 mii_adv;
+	u32 led_modes[4];
+	int len;
 	struct phy_device *phydev;
 
 	phydev = phy_find_first(dev->mdiobus);
@@ -2077,6 +2083,35 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control);
 	phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv);
 
+	len = of_property_read_variable_u32_array(dev->udev->dev.of_node,
+						  "microchip,led-modes",
+						  led_modes,
+						  0,
+						  ARRAY_SIZE(led_modes));
+	if (len >= 0) {
+		u32 reg = 0;
+		int i;
+
+		for (i = 0; i < len; i++) {
+			if (led_modes[i] > 15) {
+				ret = -EINVAL;
+				goto error;
+			}
+			reg |= led_modes[i] << (i * 4);
+		}
+		for (; i < ARRAY_SIZE(led_modes); i++)
+			reg |= LAN78XX_FORCE_LED_OFF << (i * 4);
+		(void)phy_write(phydev, LAN78XX_PHY_LED_MODE_SELECT, reg);
+
+		/* Ensure the LEDs are enabled */
+		lan78xx_read_reg(dev, HW_CFG, &reg);
+		reg |= HW_CFG_LED0_EN_ | HW_CFG_LED1_EN_;
+		lan78xx_write_reg(dev, HW_CFG, reg);
+	} else if (len == -EOVERFLOW) {
+		ret = -EINVAL;
+		goto error;
+	}
+
 	genphy_config_aneg(phydev);
 
 	dev->fc_autoneg = phydev->autoneg;
diff --git a/include/dt-bindings/net/microchip-78xx.h b/include/dt-bindings/net/microchip-78xx.h
new file mode 100644
index 0000000..dcf4a43
--- /dev/null
+++ b/include/dt-bindings/net/microchip-78xx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H
+#define _DT_BINDINGS_MICROCHIP_LAN78XX_H
+
+/* LED modes */
+
+#define LAN78XX_LINK_ACTIVITY           0
+#define LAN78XX_LINK_1000_ACTIVITY      1
+#define LAN78XX_LINK_100_ACTIVITY       2
+#define LAN78XX_LINK_10_ACTIVITY        3
+#define LAN78XX_LINK_100_1000_ACTIVITY  4
+#define LAN78XX_LINK_10_1000_ACTIVITY   5
+#define LAN78XX_LINK_10_100_ACTIVITY    6
+#define LAN78XX_DUPLEX_COLLISION        8
+#define LAN78XX_COLLISION               9
+#define LAN78XX_ACTIVITY                10
+#define LAN78XX_AUTONEG_FAULT           12
+#define LAN78XX_FORCE_LED_OFF           14
+#define LAN78XX_FORCE_LED_ON            15
+
+#endif
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 1/3] lan78xx: Read MAC address from DT if present
From: Phil Elwell @ 2018-04-18 15:45 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
  Cc: Phil Elwell
In-Reply-To: <1524066323-109628-1-git-send-email-phil@raspberrypi.org>

There is a standard mechanism for locating and using a MAC address from
the Device Tree. Use this facility in the lan78xx driver to support
applications without programmed EEPROM or OTP. At the same time,
regularise the handling of the different address sources.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 drivers/net/usb/lan78xx.c | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 0867f72..a823f01 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -37,6 +37,7 @@
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
 #include <linux/phy.h>
+#include <linux/of_net.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
@@ -1652,34 +1653,31 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev)
 	addr[5] = (addr_hi >> 8) & 0xFF;
 
 	if (!is_valid_ether_addr(addr)) {
-		/* reading mac address from EEPROM or OTP */
-		if ((lan78xx_read_eeprom(dev, EEPROM_MAC_OFFSET, ETH_ALEN,
-					 addr) == 0) ||
-		    (lan78xx_read_otp(dev, EEPROM_MAC_OFFSET, ETH_ALEN,
-				      addr) == 0)) {
-			if (is_valid_ether_addr(addr)) {
-				/* eeprom values are valid so use them */
-				netif_dbg(dev, ifup, dev->net,
-					  "MAC address read from EEPROM");
-			} else {
-				/* generate random MAC */
-				random_ether_addr(addr);
-				netif_dbg(dev, ifup, dev->net,
-					  "MAC address set to random addr");
-			}
-
-			addr_lo = addr[0] | (addr[1] << 8) |
-				  (addr[2] << 16) | (addr[3] << 24);
-			addr_hi = addr[4] | (addr[5] << 8);
-
-			ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
-			ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);
+		if (!eth_platform_get_mac_address(&dev->udev->dev, addr)) {
+			/* valid address present in Device Tree */
+			netif_dbg(dev, ifup, dev->net,
+				  "MAC address read from Device Tree");
+		} else if (((lan78xx_read_eeprom(dev, EEPROM_MAC_OFFSET,
+						 ETH_ALEN, addr) == 0) ||
+			    (lan78xx_read_otp(dev, EEPROM_MAC_OFFSET,
+					      ETH_ALEN, addr) == 0)) &&
+			   is_valid_ether_addr(addr)) {
+			/* eeprom values are valid so use them */
+			netif_dbg(dev, ifup, dev->net,
+				  "MAC address read from EEPROM");
 		} else {
 			/* generate random MAC */
 			random_ether_addr(addr);
 			netif_dbg(dev, ifup, dev->net,
 				  "MAC address set to random addr");
 		}
+
+		addr_lo = addr[0] | (addr[1] << 8) |
+			  (addr[2] << 16) | (addr[3] << 24);
+		addr_hi = addr[4] | (addr[5] << 8);
+
+		ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
+		ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);
 	}
 
 	ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 0/3] lan78xx: Read configuration from Device Tree
From: Phil Elwell @ 2018-04-18 15:45 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
  Cc: Phil Elwell

The Microchip LAN78XX family of devices are Ethernet controllers with
a USB interface. Despite being discoverable devices it can be useful to
be able to configure them from Device Tree, particularly in low-cost
applications without an EEPROM or programmed OTP.

This patch set adds support for reading the MAC address and LED modes from
Device Tree.

v2:
- Use eth_platform_get_mac_address.
- Support up to 4 LEDs, and move LED mode constants into dt-bindings header.
- Improve bindings document.
- Remove EEE support.

Phil Elwell (3):
  lan78xx: Read MAC address from DT if present
  lan78xx: Read LED states from Device Tree
  dt-bindings: Document the DT bindings for lan78xx

 .../devicetree/bindings/net/microchip,lan78xx.txt  | 43 +++++++++++++++
 MAINTAINERS                                        |  2 +
 drivers/net/usb/lan78xx.c                          | 62 ++++++++++++++--------
 include/dt-bindings/net/microchip-78xx.h           | 40 ++++++++++++++
 4 files changed, 125 insertions(+), 22 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/microchip,lan78xx.txt
 create mode 100644 include/dt-bindings/net/microchip-78xx.h

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH bpf-next v3 8/8] bpf: add documentation for eBPF helpers (58-64)
From: Jesper Dangaard Brouer @ 2018-04-18 15:43 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: daniel, ast, netdev, oss-drivers, linux-doc, linux-man,
	John Fastabend, brouer
In-Reply-To: <67e84a95-5e7b-1c2c-e90f-7bcc0026bd10@netronome.com>

On Wed, 18 Apr 2018 15:09:41 +0100
Quentin Monnet <quentin.monnet@netronome.com> wrote:

> 2018-04-18 15:34 UTC+0200 ~ Jesper Dangaard Brouer <brouer@redhat.com>
> > On Tue, 17 Apr 2018 15:34:38 +0100
> > Quentin Monnet <quentin.monnet@netronome.com> wrote:
> >   
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 350459c583de..3d329538498f 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -1276,6 +1276,50 @@ union bpf_attr {
> >>   * 	Return
> >>   * 		0 on success, or a negative error in case of failure.
> >>   *
> >> + * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
> >> + * 	Description
> >> + * 		Redirect the packet to the endpoint referenced by *map* at
> >> + * 		index *key*. Depending on its type, his *map* can contain  
> >                                                     ^^^
> > 
> > "his" -> "this"  
> 
> Thanks!
> 
> >> + * 		references to net devices (for forwarding packets through other
> >> + * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
> >> + * 		but this is only implemented for native XDP (with driver
> >> + * 		support) as of this writing).
> >> + *
> >> + * 		All values for *flags* are reserved for future usage, and must
> >> + * 		be left at zero.
> >> + * 	Return
> >> + * 		**XDP_REDIRECT** on success, or **XDP_ABORT** on error.
> >> + *  
> > 
> > "XDP_ABORT" -> "XDP_ABORTED"  
> 
> Ouch. And I did the same for bpf_redirect(). Thanks for the catch.
> 
> > 
> > I don't know if it's worth mentioning in the doc/man-page; that for XDP
> > using bpf_redirect_map() is a HUGE performance advantage, compared to
> > the bpf_redirect() call ?  
> 
> It seems worth it to me. How would you simply explain the reason for this
> difference?

The basic reason is "bulking effect", as devmap avoids the NIC
tailptr/doorbell update on every packet... how to write that in a doc
format?

I wrote about why XDP_REDIRECT with maps is smart here:
 http://vger.kernel.org/netconf2017_files/XDP_devel_update_NetConf2017_Seoul.pdf

Using maps for redirect hopefully makes XDP_REDIRECT the last driver
XDP action code we need, as new types of redirect can be introduced
without driver changes. Note that AF_XDP also uses a map.

It is more subtle, but maps also function as a sorting step. Imagine
your XDP program needs to redirect out of different interfaces (or CPUs in
cpumap case), and packets arrive intermixed.  Packets get sorted into
the different map indexes, and the xdp_do_flush_map() will trigger the
flush operation.


Happened to have an i40e NIC benchmark setup, and ran a single flow pktgen test.

Results with 'xdp_redirect_map'
 13589297 pps (13,589,297) 

Results with 'xdp_redirect' NOT using devmap:
  7567575 pps (7,567,575)

Just to point out the performance benefit of devmap...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net-next] team: account for oper state
From: George Wilkie @ 2018-04-18 15:39 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev
In-Reply-To: <20180418153312.24h7mvle6sy2dv25@debian9.gwilkie>

On Wed, Apr 18, 2018 at 04:33:12PM +0100, George Wilkie wrote:
> On Wed, Apr 18, 2018 at 04:58:22PM +0200, Jiri Pirko wrote:
> > Wed, Apr 18, 2018 at 03:35:49PM CEST, gwilkie@vyatta.att-mail.com wrote:
> > >On Wed, Apr 18, 2018 at 02:56:44PM +0200, Jiri Pirko wrote:
> > >> Wed, Apr 18, 2018 at 12:29:50PM CEST, gwilkie@vyatta.att-mail.com wrote:
> > >> >Account for operational state when determining port linkup state,
> > >> >as per Documentation/networking/operstates.txt.
> > >> 
> > >> Could you please point me to the exact place in the document where this
> > >> is suggested?
> > >> 
> > >
> > >Various places cover it I think.
> > >
> > >In 1. Introduction:
> > >"interface is not usable just because the admin enabled it"
> > >"userspace must be granted the possibility to
> > >influence operational state"
> > >
> > >In 4. Setting from userspace:
> > >"the userspace application can set IFLA_OPERSTATE
> > >to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set
> > >netif_carrier_off() or netif_dormant_on()"
> > >
> > >We have a use case where we want to set the oper state of the team ports based
> > >on whether they are actually usable or not (as opposed to just admin up).
> > 
> > Are you running a supplicant there or what is the use-case?
> > 
> 
> We are using tun/tap interfaces for the team ports with the physical interfaces
> under the control of a user process.
> 
> > How is this handled in other drivers like bond, openvswitch, bridge, etc?
> 
> It looks like bridge is using it, looking at br_port_carrier_check() and
> br_add_if().
> 

commit 576eb62598f10c8c7fd75703fe89010cdcfff596
Author:     stephen hemminger <shemminger@vyatta.com>
AuthorDate: Fri Dec 28 18:15:22 2012 +0000
Commit:     David S. Miller <davem@davemloft.net>
CommitDate: Sun Dec 30 02:31:43 2012 -0800

    bridge: respect RFC2863 operational state
    
    The bridge link detection should follow the operational state
    of the lower device, rather than the carrier bit. This allows devices
    like tunnels that are controlled by userspace control plane to work
    with bridge STP link management.

> Cheers.
> 
> > 
> > >
> > >Cheers.
> > >
> > >> 
> > >> >
> > >> >Signed-off-by: George Wilkie <gwilkie@vyatta.att-mail.com>
> > >> >---
> > >> > drivers/net/team/team.c | 3 ++-
> > >> > 1 file changed, 2 insertions(+), 1 deletion(-)
> > >> >
> > >> >diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> > >> >index a6c6ce19eeee..231264a05e55 100644
> > >> >--- a/drivers/net/team/team.c
> > >> >+++ b/drivers/net/team/team.c
> > >> >@@ -2918,7 +2918,8 @@ static int team_device_event(struct notifier_block *unused,
> > >> > 	case NETDEV_CHANGE:
> > >> > 		if (netif_running(port->dev))
> > >> > 			team_port_change_check(port,
> > >> >-					       !!netif_carrier_ok(port->dev));
> > >> >+					       !!(netif_carrier_ok(port->dev) &&
> > >> >+						  netif_oper_up(port->dev)));
> > >> > 		break;
> > >> > 	case NETDEV_UNREGISTER:
> > >> > 		team_del_slave(port->team->dev, dev);
> > >> >-- 
> > >> >2.11.0
> > >> >

^ permalink raw reply

* Re: [PATCH net-next] team: account for oper state
From: George Wilkie @ 2018-04-18 15:33 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev
In-Reply-To: <20180418145822.GE1989@nanopsycho>

On Wed, Apr 18, 2018 at 04:58:22PM +0200, Jiri Pirko wrote:
> Wed, Apr 18, 2018 at 03:35:49PM CEST, gwilkie@vyatta.att-mail.com wrote:
> >On Wed, Apr 18, 2018 at 02:56:44PM +0200, Jiri Pirko wrote:
> >> Wed, Apr 18, 2018 at 12:29:50PM CEST, gwilkie@vyatta.att-mail.com wrote:
> >> >Account for operational state when determining port linkup state,
> >> >as per Documentation/networking/operstates.txt.
> >> 
> >> Could you please point me to the exact place in the document where this
> >> is suggested?
> >> 
> >
> >Various places cover it I think.
> >
> >In 1. Introduction:
> >"interface is not usable just because the admin enabled it"
> >"userspace must be granted the possibility to
> >influence operational state"
> >
> >In 4. Setting from userspace:
> >"the userspace application can set IFLA_OPERSTATE
> >to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set
> >netif_carrier_off() or netif_dormant_on()"
> >
> >We have a use case where we want to set the oper state of the team ports based
> >on whether they are actually usable or not (as opposed to just admin up).
> 
> Are you running a supplicant there or what is the use-case?
> 

We are using tun/tap interfaces for the team ports with the physical interfaces
under the control of a user process.

> How is this handled in other drivers like bond, openvswitch, bridge, etc?

It looks like bridge is using it, looking at br_port_carrier_check() and
br_add_if().

Cheers.

> 
> >
> >Cheers.
> >
> >> 
> >> >
> >> >Signed-off-by: George Wilkie <gwilkie@vyatta.att-mail.com>
> >> >---
> >> > drivers/net/team/team.c | 3 ++-
> >> > 1 file changed, 2 insertions(+), 1 deletion(-)
> >> >
> >> >diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> >> >index a6c6ce19eeee..231264a05e55 100644
> >> >--- a/drivers/net/team/team.c
> >> >+++ b/drivers/net/team/team.c
> >> >@@ -2918,7 +2918,8 @@ static int team_device_event(struct notifier_block *unused,
> >> > 	case NETDEV_CHANGE:
> >> > 		if (netif_running(port->dev))
> >> > 			team_port_change_check(port,
> >> >-					       !!netif_carrier_ok(port->dev));
> >> >+					       !!(netif_carrier_ok(port->dev) &&
> >> >+						  netif_oper_up(port->dev)));
> >> > 		break;
> >> > 	case NETDEV_UNREGISTER:
> >> > 		team_del_slave(port->team->dev, dev);
> >> >-- 
> >> >2.11.0
> >> >

^ permalink raw reply

* [PATCH net-next 2/2] netns: isolate seqnums to use per-netns locks
From: Christian Brauner @ 2018-04-18 15:21 UTC (permalink / raw)
  To: ebiederm, davem, netdev, linux-kernel
  Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
In-Reply-To: <20180418152106.18519-1-christian.brauner@ubuntu.com>

Now that it's possible to have a different set of uevents in different
network namespaces, per-network namespace uevent sequence numbers are
introduced. This increases performance as locking is now restricted to the
network namespace affected by the uevent rather than locking everything.

Since commit 692ec06 ("netns: send uevent messages") network namespaces not
owned by the initial user namespace can be sent uevents from a sufficiently
privileged userspace process.
In order to send a uevent into a network namespace not owned by the initial
user namespace we currently still need to take the *global mutex* that
locks the uevent socket list even though the list *only contains network
namespaces owned by the initial user namespace*. This needs to be done
because the uevent counter is a global variable. Taking the global lock is
performance sensitive since a user on the host can spawn a pool of n
processes that each create their own new user and network namespaces and then
go on to inject uevents in parallel into the network namespace of all of
these processes. This can have a significant performance impact for the
host's udevd since it means that there can be a lot of delay between a
device being added and the corresponding uevent being sent out and
available for processing by udevd. It also means that each network
namespace not owned by the initial user namespace which userspace has sent
a uevent to will need to wait until the lock becomes available.

Implementation:
This patch gives each network namespace its own uevent sequence number.
Each network namespace not owned by the initial user namespace receives its
own mutex. The struct uevent_sock is opaque to callers outside of kobject.c
so the mutex *can* and *is* only ever accessed in lib/kobject.c. In this
file it is clearly documented which lock has to be taken. All network
namespaces owned by the initial user namespace will still share the same
lock since they are all served sequentially via the uevent socket list.
This decouples the locking and ensures that the host retrieves uevents as
fast as possible even if there are a lot of uevents injected into network
namespaces not owned by the initial user namespace.  In addition, each
network namespace not owned by the initial user namespace does not have to
wait on any other network namespace not sharing the same user namespace.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/kobject.h     |   3 --
 include/net/net_namespace.h |   3 ++
 kernel/ksysfs.c             |   3 +-
 lib/kobject_uevent.c        | 100 ++++++++++++++++++++++++++++--------
 net/core/net_namespace.c    |  13 +++++
 5 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 7f6f93c3df9c..776391aea247 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -36,9 +36,6 @@
 extern char uevent_helper[];
 #endif
 
-/* counter to tag the uevent, read only except for the kobject core */
-extern u64 uevent_seqnum;
-
 /*
  * The actions here must match the index to the string array
  * in lib/kobject_uevent.c
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 47e35cce3b64..e4e171b1ba69 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -85,6 +85,8 @@ struct net {
 	struct sock		*genl_sock;
 
 	struct uevent_sock	*uevent_sock;		/* uevent socket */
+	/* counter to tag the uevent, read only except for the kobject core */
+	u64                     uevent_seqnum;
 
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
@@ -189,6 +191,7 @@ extern struct list_head net_namespace_list;
 
 struct net *get_net_ns_by_pid(pid_t pid);
 struct net *get_net_ns_by_fd(int fd);
+u64 get_ns_uevent_seqnum_by_vpid(void);
 
 #ifdef CONFIG_SYSCTL
 void ipx_register_sysctl(void);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..83264edcecda 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/capability.h>
 #include <linux/compiler.h>
+#include <net/net_namespace.h>
 
 #include <linux/rcupdate.h>	/* rcu_expedited and rcu_normal */
 
@@ -33,7 +34,7 @@ static struct kobj_attribute _name##_attr = \
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+	return sprintf(buf, "%llu\n", (unsigned long long)get_ns_uevent_seqnum_by_vpid());
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index f5f5038787ac..796fd502c227 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -29,21 +29,38 @@
 #include <net/net_namespace.h>
 
 
-u64 uevent_seqnum;
 #ifdef CONFIG_UEVENT_HELPER
 char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
 #endif
 
+/*
+ * Size a buffer needs to be in order to hold the largest possible sequence
+ * number stored in a u64 including \0 byte: 2^64 - 1 = 21 chars.
+ */
+#define SEQNUM_BUFSIZE (sizeof("SEQNUM=") + 21)
 struct uevent_sock {
 	struct list_head list;
 	struct sock *sk;
+	/*
+	 * This mutex protects uevent sockets and the uevent counter of
+	 * network namespaces *not* owned by init_user_ns.
+	 * For network namespaces owned by init_user_ns this lock is *not*
+	 * valid instead the global uevent_sock_mutex must be used!
+	 */
+	struct mutex sk_mutex;
 };
 
 #ifdef CONFIG_NET
 static LIST_HEAD(uevent_sock_list);
 #endif
 
-/* This lock protects uevent_seqnum and uevent_sock_list */
+/*
+ * This mutex protects uevent sockets and the uevent counter of network
+ * namespaces owned by init_user_ns.
+ * For network namespaces not owned by init_user_ns this lock is *not*
+ * valid instead the network namespace specific sk_mutex in struct
+ * uevent_sock must be used!
+ */
 static DEFINE_MUTEX(uevent_sock_mutex);
 
 /* the strings here must match the enum in include/linux/kobject.h */
@@ -253,6 +270,22 @@ static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
 
 	return 0;
 }
+
+static bool can_hold_seqnum(const struct kobj_uevent_env *env, size_t len)
+{
+	if (env->envp_idx >= ARRAY_SIZE(env->envp)) {
+		WARN(1, KERN_ERR "Failed to append sequence number. "
+		     "Too many uevent variables\n");
+		return false;
+	}
+
+	if ((env->buflen + len) > UEVENT_BUFFER_SIZE) {
+		WARN(1, KERN_ERR "Insufficient space to append sequence number\n");
+		return false;
+	}
+
+	return true;
+}
 #endif
 
 #ifdef CONFIG_UEVENT_HELPER
@@ -308,18 +341,22 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 
 	/* send netlink message */
 	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
+		/* bump sequence number */
+		u64 seqnum = ++sock_net(ue_sk->sk)->uevent_seqnum;
 		struct sock *uevent_sock = ue_sk->sk;
+		char buf[SEQNUM_BUFSIZE];
 
 		if (!netlink_has_listeners(uevent_sock, 1))
 			continue;
 
 		if (!skb) {
-			/* allocate message with the maximum possible size */
+			/* calculate header length */
 			size_t len = strlen(action_string) + strlen(devpath) + 2;
 			char *scratch;
 
+			/* allocate message with the maximum possible size */
 			retval = -ENOMEM;
-			skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+			skb = alloc_skb(len + env->buflen + SEQNUM_BUFSIZE, GFP_KERNEL);
 			if (!skb)
 				continue;
 
@@ -327,11 +364,24 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 			scratch = skb_put(skb, len);
 			sprintf(scratch, "%s@%s", action_string, devpath);
 
+			/* add env */
 			skb_put_data(skb, env->buf, env->buflen);
 
 			NETLINK_CB(skb).dst_group = 1;
 		}
 
+		/* prepare netns seqnum */
+		retval = snprintf(buf, SEQNUM_BUFSIZE, "SEQNUM=%llu", seqnum);
+		if (retval < 0 || retval >= SEQNUM_BUFSIZE)
+			continue;
+		retval++;
+
+		if (!can_hold_seqnum(env, retval))
+			continue;
+
+		/* append netns seqnum */
+		skb_put_data(skb, buf, retval);
+
 		retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
 						    0, 1, GFP_KERNEL,
 						    kobj_bcast_filter,
@@ -339,6 +389,9 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 		/* ENOBUFS should be handled in userspace */
 		if (retval == -ENOBUFS || retval == -ESRCH)
 			retval = 0;
+
+		/* remove netns seqnum */
+		skb_trim(skb, env->buflen);
 	}
 	consume_skb(skb);
 #endif
@@ -510,14 +563,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 	}
 
 	mutex_lock(&uevent_sock_mutex);
-	/* we will send an event, so request a new sequence number */
-	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
-	if (retval) {
-		mutex_unlock(&uevent_sock_mutex);
-		goto exit;
-	}
-	retval = kobject_uevent_net_broadcast(kobj, env, action_string,
-					      devpath);
+	retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath);
 	mutex_unlock(&uevent_sock_mutex);
 
 #ifdef CONFIG_UEVENT_HELPER
@@ -605,17 +651,18 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
 EXPORT_SYMBOL_GPL(add_uevent_var);
 
 #if defined(CONFIG_NET)
-static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb,
+static int uevent_net_broadcast(struct uevent_sock *ue_sk, struct sk_buff *skb,
 				struct netlink_ext_ack *extack)
 {
-	/* u64 to chars: 2^64 - 1 = 21 chars */
-	char buf[sizeof("SEQNUM=") + 21];
+	struct sock *usk = ue_sk->sk;
+	char buf[SEQNUM_BUFSIZE];
 	struct sk_buff *skbc;
 	int ret;
 
 	/* bump and prepare sequence number */
-	ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", ++uevent_seqnum);
-	if (ret < 0 || (size_t)ret >= sizeof(buf))
+	ret = snprintf(buf, SEQNUM_BUFSIZE, "SEQNUM=%llu",
+		       ++sock_net(ue_sk->sk)->uevent_seqnum);
+	if (ret < 0 || ret >= SEQNUM_BUFSIZE)
 		return -ENOMEM;
 	ret++;
 
@@ -668,9 +715,15 @@ static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EPERM;
 	}
 
-	mutex_lock(&uevent_sock_mutex);
-	ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack);
-	mutex_unlock(&uevent_sock_mutex);
+	if (net->user_ns == &init_user_ns)
+		mutex_lock(&uevent_sock_mutex);
+	else
+		mutex_lock(&net->uevent_sock->sk_mutex);
+	ret = uevent_net_broadcast(net->uevent_sock, skb, extack);
+	if (net->user_ns == &init_user_ns)
+		mutex_unlock(&uevent_sock_mutex);
+	else
+		mutex_unlock(&net->uevent_sock->sk_mutex);
 
 	return ret;
 }
@@ -708,6 +761,13 @@ static int uevent_net_init(struct net *net)
 		mutex_lock(&uevent_sock_mutex);
 		list_add_tail(&ue_sk->list, &uevent_sock_list);
 		mutex_unlock(&uevent_sock_mutex);
+	} else {
+		/*
+		 * Uevent sockets and counters for network namespaces
+		 * not owned by the initial user namespace have their
+		 * own mutex.
+		 */
+		mutex_init(&ue_sk->sk_mutex);
 	}
 
 	return 0;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a11e03f920d3..2f914804ef73 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -618,6 +618,19 @@ struct net *get_net_ns_by_pid(pid_t pid)
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
 
+u64 get_ns_uevent_seqnum_by_vpid(void)
+{
+	pid_t cur_pid;
+	struct net *net;
+
+	cur_pid = task_pid_vnr(current);
+	net = get_net_ns_by_pid(cur_pid);
+	if (IS_ERR(net))
+		return 0;
+
+	return net->uevent_seqnum;
+}
+
 static __net_init int net_ns_net_init(struct net *net)
 {
 #ifdef CONFIG_NET_NS
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 1/2] netns: restrict uevents
From: Christian Brauner @ 2018-04-18 15:21 UTC (permalink / raw)
  To: ebiederm, davem, netdev, linux-kernel
  Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
In-Reply-To: <20180418152106.18519-1-christian.brauner@ubuntu.com>

commit 07e98962fa77 ("kobject: Send hotplug events in all network namespaces")

enabled sending hotplug events into all network namespaces back in 2010.
Over time the set of uevents that get sent into all network namespaces has
shrunk a little. We have now reached the point where hotplug events for all
devices that carry a namespace tag are filtered according to that
namespace. Specifically, they are filtered whenever the namespace tag of
the kobject does not match the namespace tag of the netlink socket. One
example are network devices. Uevents for network devices only show up in
the network namespaces these devices are moved to or created in.

However, any uevent for a kobject that does not have a namespace tag
associated with it will not be filtered and we will broadcast it into all
network namespaces. This behavior stopped making sense when user namespaces
were introduced.

This patch restricts uevents to the initial user namespace for a couple of
reasons that have been extensively discussed on the mailing list [1].
- Thundering herd:
  Broadcasting uevents into all network namespaces introduces significant
  overhead.
  All processes that listen to uevents running in non-initial user
  namespaces will end up responding to uevents that will be meaningless to
  them. Mainly, because non-initial user namespaces cannot easily manage
  devices unless they have a privileged host-process helping them out. This
  means that there will be a thundering herd of activity when there
  shouldn't be any.
- Uevents from non-root users are already filtered in userspace:
  Uevents are filtered by userspace in a user namespace because the
  received uid != 0. Instead the uid associated with the event will be
  65534 == "nobody" because the global root uid is not mapped.
  This means we can safely and without introducing regressions modify the
  kernel to not send uevents into all network namespaces whose owning user
  namespace is not the initial user namespace because we know that
  userspace will ignore the message because of the uid anyway. I have
  a) verified that this is true for every udev implementation out there b)
  that this behavior has been present in all udev implementations from the
  very beginning.
- Removing needless overhead/Increasing performance:
  Currently, the uevent socket for each network namespace is added to the
  global variable uevent_sock_list. The list itself needs to be protected
  by a mutex. So every time a uevent is generated the mutex is taken on the
  list. The mutex is held *from the creation of the uevent (memory
  allocation, string creation etc.) until all uevent sockets have been
  handled*. This is aggravated by the fact that for each uevent socket that
  has listeners the mc_list must be walked as well which means we're
  talking O(n^2) here. Given that a standard Linux workload usually has
  quite a lot of network namespaces and - in the face of containers - a lot
  of user namespaces this quickly becomes a performance problem (see
  "Thundering herd" above). By just recording uevent sockets of network
  namespaces that are owned by the initial user namespace we significantly
  increase performance in this codepath.
- Injecting uevents:
  There's a valid argument that containers might be interested in receiving
  device events especially if they are delegated to them by a privileged
  userspace process. One prime example are SR-IOV enabled devices that are
  explicitly designed to be handed off to other users such as VMs or
  containers.
  This use-case can now be correctly handled since
  commit 692ec06d7c92 ("netns: send uevent messages"). This commit
  introduced the ability to send uevents from userspace. As such we can let
  a sufficiently privileged (CAP_SYS_ADMIN in the owning user namespace of
  the network namespace of the netlink socket) userspace process make a
  decision what uevents should be sent. This removes the need to blindly
  broadcast uevents into all user namespaces and provides a performant and
  safe solution to this problem.
- Filtering logic:
  This patch filters by *owning user namespace of the network namespace a
  given task resides in* and not by user namespace of the task per se. This
  means if the user namespace of a given task is unshared but the network
  namespace is kept and is owned by the initial user namespace a listener
  that is opening the uevent socket in that network namespace can still
  listen to uevents.

[1]: https://lkml.org/lkml/2018/4/4/739
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 lib/kobject_uevent.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 15ea216a67ce..f5f5038787ac 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -703,9 +703,13 @@ static int uevent_net_init(struct net *net)
 
 	net->uevent_sock = ue_sk;
 
-	mutex_lock(&uevent_sock_mutex);
-	list_add_tail(&ue_sk->list, &uevent_sock_list);
-	mutex_unlock(&uevent_sock_mutex);
+	/* Restrict uevents to initial user namespace. */
+	if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+		mutex_lock(&uevent_sock_mutex);
+		list_add_tail(&ue_sk->list, &uevent_sock_list);
+		mutex_unlock(&uevent_sock_mutex);
+	}
+
 	return 0;
 }
 
@@ -713,9 +717,11 @@ static void uevent_net_exit(struct net *net)
 {
 	struct uevent_sock *ue_sk = net->uevent_sock;
 
-	mutex_lock(&uevent_sock_mutex);
-	list_del(&ue_sk->list);
-	mutex_unlock(&uevent_sock_mutex);
+	if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+		mutex_lock(&uevent_sock_mutex);
+		list_del(&ue_sk->list);
+		mutex_unlock(&uevent_sock_mutex);
+	}
 
 	netlink_kernel_release(ue_sk->sk);
 	kfree(ue_sk);
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 0/2] netns: uevent performance tweaks
From: Christian Brauner @ 2018-04-18 15:21 UTC (permalink / raw)
  To: ebiederm, davem, netdev, linux-kernel
  Cc: avagin, ktkhai, serge, gregkh, Christian Brauner

Hey,

This series deals with a bunch of performance improvements when sending out
uevents that have been extensively discussed here:
https://lkml.org/lkml/2018/4/10/592

- Only record uevent sockets from network namespaces owned by the
  initial user namespace in the global uevent socket list.
  Eric, this is the exact patch we agreed upon in
  https://lkml.org/lkml/2018/4/10/592.
  **A very detailed rationale is present in the commit message for
    [PATCH 1/2] netns: restrict uevents**
- Decouple the locking for network namespaces in the global uevent socket
  list from the locking for network namespaces not in the global uevent
  socket list.
  **A very detailed rationale is present in the commit message
    [PATCH 2/2] netns: isolate seqnums to use per-netns locks**

Thanks!
Christian

Christian Brauner (2):
  netns: restrict uevents
  netns: isolate seqnums to use per-netns locks

 include/linux/kobject.h     |   3 -
 include/net/net_namespace.h |   3 +
 kernel/ksysfs.c             |   3 +-
 lib/kobject_uevent.c        | 118 ++++++++++++++++++++++++++++--------
 net/core/net_namespace.c    |  13 ++++
 5 files changed, 110 insertions(+), 30 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: [PATCH bpf-next v4 07/10] bpf: btf: Add pretty print support to the basic arraymap
From: Daniel Borkmann @ 2018-04-18 15:20 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: Alexei Starovoitov, kernel-team, jakub.kicinski
In-Reply-To: <20180417204243.4028831-8-kafai@fb.com>

Hi Martin,

first of all great work on the set! One issue that puzzled me
while digesting it further below.

On 04/17/2018 10:42 PM, Martin KaFai Lau wrote:
> This patch adds pretty print support to the basic arraymap.
> Support for other bpf maps can be added later.
> 
> This patch adds new attrs to the BPF_MAP_CREATE command to allow
> specifying the btf_fd, btf_key_id and btf_value_id.  The
> BPF_MAP_CREATE can then associate the btf to the map if
> the creating map supports BTF.

Feels like this patch is doing two things at once, meaning i)
attaching btf object to map object through bpf syscall at map
creation time, and ...

> A BTF supported map needs to implement two new map ops,
> map_seq_show_elem() and map_check_btf().  This patch has
> implemented these new map ops for the basic arraymap.
> 
> It also adds file_operations to the pinned map
> such that the pinned map can be opened and read.

... ii) pretty print map dump via bpf fs for array map.

> Here is a sample output when reading a pinned arraymap
> with the following map's value:
> 
> struct map_value {
> 	int count_a;
> 	int count_b;
> };
> 
> cat /sys/fs/bpf/pinned_array_map:
> 
> 0: {1,2}
> 1: {3,4}
> 2: {5,6}
> ...

Rather than adding this to the bpf fs itself, why not add full BTF
support for the main debugging and introspection tool we have and
ship with the kernel for BPF, namely bpftool? You can already dump
the whole map's key/value pairs via the following command from a
pinned file:

  bpftool map dump pinned /sys/fs/bpf/pinned_array_map

And given we already export the BTF info in your earlier patch through
the BPF_OBJ_GET_INFO_BY_FD, this would fit perfectly for bpftool
integration instead where the pretty-print which is done through the
extra cb map_seq_show_elem (which only does a map lookup and print
anyway) in this patch can simply all be done in user space w/o any
additional kernel complexity.

Aside that this would be very valuable there it would also nicely
demonstrate usage of it, but more importantly we could avoid implementing
such pretty-print callback in the kernel for every other map type and
then having two locations where a user now needs to go for debugging
(bpftool being one, and cat of pinned file the other; this split seems
confusing from a user perspective, imho, but also single key lookup +
pretty-print cannot be realized with the latter whereas it's trivial
with bpftool).

The same could be done for bpftool map lookup, updates, deletions, etc
where the key resp. key/value pair can be specified through a struct
like initializer from cmdline. (But dump/lookup should be good enough
starting point initially.) Thoughts?

Thanks again,
Daniel

> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> Acked-by: Alexei Starovoitov <ast@fb.com>
> ---
>  include/linux/bpf.h      |  20 ++++++-
>  include/uapi/linux/bpf.h |   3 +
>  kernel/bpf/arraymap.c    |  50 ++++++++++++++++
>  kernel/bpf/inode.c       | 146 ++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/bpf/syscall.c     |  32 ++++++++++-
>  5 files changed, 244 insertions(+), 7 deletions(-)

^ permalink raw reply

* Re: SRIOV switchdev mode BoF minutes
From: Andy Gospodarek @ 2018-04-18 15:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andy Gospodarek, Or Gerlitz, Samudrala, Sridhar, David Miller,
	Anjali Singhai Jain, Michael Chan, Simon Horman, John Fastabend,
	Saeed Mahameed, Jiri Pirko, Rony Efraim, Linux Netdev List
In-Reply-To: <20180417161915.193c3651@cakuba.netronome.com>

On Tue, Apr 17, 2018 at 04:19:15PM -0700, Jakub Kicinski wrote:
> On Tue, 17 Apr 2018 10:47:00 -0400, Andy Gospodarek wrote:
> > There is also a school of thought that the VF reps could be
> > pre-allocated on the SmartNIC so that any application processing that
> > traffic would sit idle when no traffic arrives on the rep, but could
> > process frames that do arrive when the VFs were created on the host.
> > This implementation will depend on how resources are allocated on a
> > given bit of hardware, but can really work well.
> 
> +1 if there is no FW resource allocation issues IMHO it's okay to
> just show all reprs for "remote PCIes (PFs and VFs)" on the SmartNIC/
> controller.  The reprs should just show link down as if PCIe cable
> was unplugged until host actually enables them.

Yes we are on the same page on this.

> A similar issue exists on multi-host for PFs, right?  If one of the
> hosts is down do we still show their PF repr?  IMHO yes.

I would agree with that as well.  With today's model the VF reps are
created once a PF is put into switchdev mode, but I'm still working out
how we want to consider whether or not a PF rep for the other domains is
created locally or not and also how one can determine which domain is in
control.

Permanent config options (like NVRAM settings) could easily handle which
domain is in control, but that still does not mean that PF reps must be
created automatically, does it?

> That makes the thing looks more like a switch with cables being plugged
> in and out.

Yes, that's exactly how I view it as well.

^ permalink raw reply

* Re: [PATCH RFC net-next 00/11] udp gso
From: Samudrala, Sridhar @ 2018-04-18 15:08 UTC (permalink / raw)
  To: Willem de Bruijn, Sowmini Varadhan
  Cc: Eric Dumazet, Network Development, Willem de Bruijn
In-Reply-To: <CAF=yD-LkTpVQ_8F2oTLeEDKcNWYuvn3QoSrSa0y5j7zgB6Em9A@mail.gmail.com>

On 4/18/2018 6:51 AM, Willem de Bruijn wrote:
> On Wed, Apr 18, 2018 at 9:47 AM, Sowmini Varadhan
> <sowmini.varadhan@oracle.com> wrote:
>> On (04/18/18 06:35), Eric Dumazet wrote:
>>> There is no change at all.
>>>
>>> This will only be used as a mechanism to send X packets of same size.
>>>
>>> So instead of X system calls , one system call.
>>>
>>> One traversal of some expensive part of the host stack.
>>>
>>> The content on the wire should be the same.
>> I'm sorry that's not how I interpret Willem's email below
>> (and maybe I misunderstood)
>>
>> the following taken from https://www.spinics.net/lists/netdev/msg496150.html
>>
>> Sowmini> If yes, how will the recvmsg differentiate between the case
>> Sowmini> (2000 byte message followed by 512 byte message) and
>> Sowmini> (1472 byte message, 526 byte message, then 512 byte message),
>> Sowmini> in other words, how are UDP message boundary semantics preserved?
>>
>> Willem> They aren't. This is purely an optimization to amortize the cost of
>> Willem> repeated tx stack traversal. Unlike UFO, which would preserve the
>> Willem> boundaries of the original larger than MTU datagram.
>>
>> As I understand Willem's explanation, if I do a sendmsg of 2000 bytes,
>> - classic UDP will send 2 IP fragments, the first one with a full UDP
>>    header, and the IP header indicating that this is the first frag for
>>    that ipid, with more frags to follow. The second frag will have the
>>    rest with the same ipid, it will not have a udp header,
>>    and it will indicate that it is the last frag (no more frags).
>>
>>    The receiver can thus use the ipid, "more-frags" bit, frag offset etc
>>    to stitch the 2000 byte udp message together and pass it up on the udp
>>    socket.
>>
>> - in the "GSO" proposal my 2000  bytes of data are sent as *two*
>>    udp packets, each of them with a unique udp header, and uh_len set
>>    to 1476 (for first) and 526 (for second). The receiver has no clue
>>    that they are both part of the same UDP datagram, So wire format
>>    is not the same, am I mistaken?
> Eric is correct. If the application sets a segment size with UDP_SEGMENT
> this is an instruction to the kernel to split the payload along that border into
> separate discrete datagrams.

OK. So the sender app is passing the message boundary info to the kernel via the socket
option and letting the kernel split the large payload into multiple UDP segments.


>
> It does not matter what the behavior is without setting this option. If a
> process wants to send a larger than MTU datagram and rely on the
> kernel to fragment, then it should not set the option.

^ permalink raw reply

* Re: [PATCH net-next v4 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: Rahul Lakkireddy @ 2018-04-18 15:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Dave Young, netdev@vger.kernel.org, kexec@lists.infradead.org,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Indranil Choudhury, Nirranjan Kirubaharan,
	stephen@networkplumber.org, Ganesh GR, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, davem@davemloft.net,
	viro@zeniv.linux.org.uk
In-Reply-To: <871sfcy4ge.fsf@xmission.com>

On Wednesday, April 04/18/18, 2018 at 19:58:01 +0530, Eric W. Biederman wrote:
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:
> 
> > On Wednesday, April 04/18/18, 2018 at 11:45:46 +0530, Dave Young wrote:
> >> Hi Rahul,
> >> On 04/17/18 at 01:14pm, Rahul Lakkireddy wrote:
> >> > On production servers running variety of workloads over time, kernel
> >> > panic can happen sporadically after days or even months. It is
> >> > important to collect as much debug logs as possible to root cause
> >> > and fix the problem, that may not be easy to reproduce. Snapshot of
> >> > underlying hardware/firmware state (like register dump, firmware
> >> > logs, adapter memory, etc.), at the time of kernel panic will be very
> >> > helpful while debugging the culprit device driver.
> >> > 
> >> > This series of patches add new generic framework that enable device
> >> > drivers to collect device specific snapshot of the hardware/firmware
> >> > state of the underlying device in the crash recovery kernel. In crash
> >> > recovery kernel, the collected logs are added as elf notes to
> >> > /proc/vmcore, which is copied by user space scripts for post-analysis.
> >> > 
> >> > The sequence of actions done by device drivers to append their device
> >> > specific hardware/firmware logs to /proc/vmcore are as follows:
> >> > 
> >> > 1. During probe (before hardware is initialized), device drivers
> >> > register to the vmcore module (via vmcore_add_device_dump()), with
> >> > callback function, along with buffer size and log name needed for
> >> > firmware/hardware log collection.
> >> 
> >> I assumed the elf notes info should be prepared while kexec_[file_]load
> >> phase. But I did not read the old comment, not sure if it has been discussed
> >> or not.
> >> 
> >
> > We must not collect dumps in crashing kernel. Adding more things in
> > crash dump path risks not collecting vmcore at all. Eric had
> > discussed this in more detail at:
> >
> > https://lkml.org/lkml/2018/3/24/319
> >
> > We are safe to collect dumps in the second kernel. Each device dump
> > will be exported as an elf note in /proc/vmcore.
> 
> It just occurred to me there is one variation that is worth
> considering.
> 
> Is the area you are looking at dumping part of a huge mmio area?
> I think someone said 2GB?
> 
> If that is the case it could be worth it to simply add the needed
> addresses to the range of memory we need to dump, and simply having a
> elf note saying that is what happened.
> 

We are _not_ dumping mmio area. However, one part of the dump
collection involves reading 2 GB on-chip memory via PIO access,
which is compressed and stored.

Thanks,
Rahul

^ permalink raw reply

* Re: [PATCH net-next 2/2] openvswitch: Support conntrack zone limit
From: Gregory Rose @ 2018-04-18 15:05 UTC (permalink / raw)
  To: Yi-Hung Wei; +Cc: netdev
In-Reply-To: <CAG1aQhJxgwmEGPpO61rpGo1ve9Rdr+fV7r-EF95x0=1SqZgX+A@mail.gmail.com>

On 4/17/2018 5:30 PM, Yi-Hung Wei wrote:
>> s/to commit/from committing/
>> s/entry/entries/
> Thanks, will fix that in both patches in v2.
>
>
>> I think this is a great idea but I suggest porting to the iproute2 package
>> so everyone can use it.  Then get rid of the OVS specific prefixes.
>> Presuming of course that the conntrack connection
>> limit backend works there as well I guess.  If it doesn't, then I'd suggest
>> extending
>> it.  This is a nice feature for all users in my opinion and then OVS
>> can take advantage of it as well.
> Thanks for the comment.  And yes, I think currently, iptables’s
> connlimit extension does support limiting the # of connections.  Users
> need to configure the zone properly, and the iptable’s connlimit
> extension is using netfilter's nf_conncount backend already.
>
> The main goal for this patch is to utilize netfilter backend
> (nf_conncount) to count and limit the number of connections. OVS needs
> the proposed OVS_CT_LIMIT netlink API and the corresponding booking
> data structure because the current nf_conncount backend only counts
> the # of connections, but it does not keep track of the connection
> limit in nf_conncount.
>
> Thanks,
>
> -Yi-Hung

Thanks Yi-hung, I figured I was just missing something there.  I 
appreciate the explanation.

- Greg

^ permalink raw reply

* Re: [PATCH net-next] team: account for oper state
From: Jiri Pirko @ 2018-04-18 14:58 UTC (permalink / raw)
  To: George Wilkie; +Cc: netdev
In-Reply-To: <20180418133549.qd5uqp3km45vw3ar@debian9.gwilkie>

Wed, Apr 18, 2018 at 03:35:49PM CEST, gwilkie@vyatta.att-mail.com wrote:
>On Wed, Apr 18, 2018 at 02:56:44PM +0200, Jiri Pirko wrote:
>> Wed, Apr 18, 2018 at 12:29:50PM CEST, gwilkie@vyatta.att-mail.com wrote:
>> >Account for operational state when determining port linkup state,
>> >as per Documentation/networking/operstates.txt.
>> 
>> Could you please point me to the exact place in the document where this
>> is suggested?
>> 
>
>Various places cover it I think.
>
>In 1. Introduction:
>"interface is not usable just because the admin enabled it"
>"userspace must be granted the possibility to
>influence operational state"
>
>In 4. Setting from userspace:
>"the userspace application can set IFLA_OPERSTATE
>to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set
>netif_carrier_off() or netif_dormant_on()"
>
>We have a use case where we want to set the oper state of the team ports based
>on whether they are actually usable or not (as opposed to just admin up).

Are you running a supplicant there or what is the use-case?

How is this handled in other drivers like bond, openvswitch, bridge, etc?

>
>Cheers.
>
>> 
>> >
>> >Signed-off-by: George Wilkie <gwilkie@vyatta.att-mail.com>
>> >---
>> > drivers/net/team/team.c | 3 ++-
>> > 1 file changed, 2 insertions(+), 1 deletion(-)
>> >
>> >diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
>> >index a6c6ce19eeee..231264a05e55 100644
>> >--- a/drivers/net/team/team.c
>> >+++ b/drivers/net/team/team.c
>> >@@ -2918,7 +2918,8 @@ static int team_device_event(struct notifier_block *unused,
>> > 	case NETDEV_CHANGE:
>> > 		if (netif_running(port->dev))
>> > 			team_port_change_check(port,
>> >-					       !!netif_carrier_ok(port->dev));
>> >+					       !!(netif_carrier_ok(port->dev) &&
>> >+						  netif_oper_up(port->dev)));
>> > 		break;
>> > 	case NETDEV_UNREGISTER:
>> > 		team_del_slave(port->team->dev, dev);
>> >-- 
>> >2.11.0
>> >

^ permalink raw reply

* Re: [PATCH] net: qmi_wwan: add Wistron Neweb D19Q1
From: Bjørn Mork @ 2018-04-18 14:39 UTC (permalink / raw)
  To: Pawel Dembicki; +Cc: netdev, linux-usb, linux-kernel
In-Reply-To: <1524060204-7814-1-git-send-email-paweldembicki@gmail.com>

Pawel Dembicki <paweldembicki@gmail.com> writes:

> This modem is embedded on dlink dwr-960 router.
> The oem configuration states:
>
> T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0
> D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
> P: Vendor=1435 ProdID=d191 Rev=ff.ff
> S: Manufacturer=Android
> S: Product=Android
> S: SerialNumber=0123456789ABCDEF
> C:* #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA
> I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none)
> E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none)
> E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
> E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
> E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
> E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
> E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
> E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms
> E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> I:* If#= 5 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=(none)
> E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
> E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=125us
>
> Tested on openwrt distribution
>
> Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>

Acked-by: Bjørn Mork <bjorn@mork.no>

^ permalink raw reply

* Re: [PATCH 6/6] rhashtable: add rhashtable_walk_prev()
From: Herbert Xu @ 2018-04-18 14:35 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <152403402206.16895.14563720960374849428.stgit2@noble>

On Wed, Apr 18, 2018 at 04:47:02PM +1000, NeilBrown wrote:
> rhashtable_walk_prev() returns the object returned by
> the previous rhashtable_walk_next(), providing it is still in the
> table (or was during this grace period).
> This works even if rhashtable_walk_stop() and rhashtable_walk_start()
> have been called since the last rhashtable_walk_next().
> 
> If there have been no calls to rhashtable_walk_next(), or if the
> object is gone from the table, then NULL is returned.
> 
> This can usefully be used in a seq_file ->start() function.
> If the pos is the same as was returned by the last ->next() call,
> then rhashtable_walk_prev() can be used to re-establish the
> current location in the table.  If it returns NULL, then
> rhashtable_walk_next() should be used.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>

Can you explain the need for this function and its difference
from the existing rhashtable_walk_peek?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 14:34 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet
  Cc: Joby Poriyath, Ben Hutchings, netdev, linux-kernel

The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
fails (later patches change it to kvzalloc).

The problem with this is that if the vzalloc function is actually used, 
virtio_net doesn't work (because it expects that the extra memory should 
be accessible with DMA-API and memory allocated with vzalloc isn't).

This patch changes it back to kzalloc and adds a warning if the allocated
size is too large (the allocation is unreliable in this case).

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")

---
 net/core/dev.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-2.6/net/core/dev.c
===================================================================
--- linux-2.6.orig/net/core/dev.c	2018-04-16 21:08:36.000000000 +0200
+++ linux-2.6/net/core/dev.c	2018-04-18 16:24:43.000000000 +0200
@@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
 	/* ensure 32-byte alignment of whole construct */
 	alloc_size += NETDEV_ALIGN - 1;
 
-	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+	WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
+	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!p)
 		return NULL;
 

^ permalink raw reply

* Re: [PATCH 1/6] rhashtable: remove outdated comments about grow_decision etc
From: Herbert Xu @ 2018-04-18 14:29 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <152403402187.16895.84802790561768231.stgit2@noble>

On Wed, Apr 18, 2018 at 04:47:01PM +1000, NeilBrown wrote:
> grow_decision and shrink_decision no longer exist, so remove
> the remaining references to them.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH RFC net-next 00/11] udp gso
From: Willem de Bruijn @ 2018-04-18 14:28 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: Samudrala, Sridhar, Network Development, Willem de Bruijn
In-Reply-To: <CAF=yD-+iT55h_QbQNR6RWa0R41N=3GCr+71+qr32GW=1oEc0Hg@mail.gmail.com>

On Wed, Apr 18, 2018 at 9:59 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>> One thing that was not clear to me about the API: shouldn't UDP_SEGMENT
>> just be automatically determined in the stack from the pmtu? What's
>> the motivation for the socket option for this? Also, AIUI this can be
>> either a per-socket or a per-packet option?

I forgot to respond to the last point: yes, it is set either as a setsockopt or
passed as a cmsg for a given send call.

Especially when using unconnected sockets to communicate with many
clients, it is likely that this value will vary per call.

^ permalink raw reply

* Re: [PATCH 2/6] rhashtable: remove incorrect comment on r{hl, hash}table_walk_enter()
From: Herbert Xu @ 2018-04-18 14:28 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <152403402192.16895.9740762152906281009.stgit2@noble>

On Wed, Apr 18, 2018 at 04:47:01PM +1000, NeilBrown wrote:
> Neither rhashtable_walk_enter() nor rhltable_walk_enter() sleeps, so
> remove the comments which suggest that they do.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  include/linux/rhashtable.h |    3 ---
>  lib/rhashtable.c           |    3 ---
>  2 files changed, 6 deletions(-)
> 
> diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
> index 87d443a5b11d..b01d88e196c2 100644
> --- a/include/linux/rhashtable.h
> +++ b/include/linux/rhashtable.h
> @@ -1268,9 +1268,6 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
>   * For a completely stable walk you should construct your own data
>   * structure outside the hash table.
>   *
> - * This function may sleep so you must not call it from interrupt
> - * context or with spin locks held.

It does a naked spin lock so even though we removed the memory
allocation you still mustn't call it from interrupt context.

Why do you need to do that anyway?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH net-next v4 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: Eric W. Biederman @ 2018-04-18 14:28 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: Dave Young, netdev@vger.kernel.org, kexec@lists.infradead.org,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Indranil Choudhury, Nirranjan Kirubaharan,
	stephen@networkplumber.org, Ganesh GR, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, davem@davemloft.net,
	viro@zeniv.linux.org.uk
In-Reply-To: <20180418123114.GA19159@chelsio.com>

Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:

> On Wednesday, April 04/18/18, 2018 at 11:45:46 +0530, Dave Young wrote:
>> Hi Rahul,
>> On 04/17/18 at 01:14pm, Rahul Lakkireddy wrote:
>> > On production servers running variety of workloads over time, kernel
>> > panic can happen sporadically after days or even months. It is
>> > important to collect as much debug logs as possible to root cause
>> > and fix the problem, that may not be easy to reproduce. Snapshot of
>> > underlying hardware/firmware state (like register dump, firmware
>> > logs, adapter memory, etc.), at the time of kernel panic will be very
>> > helpful while debugging the culprit device driver.
>> > 
>> > This series of patches adds a new generic framework that enables device
>> > drivers to collect device specific snapshot of the hardware/firmware
>> > state of the underlying device in the crash recovery kernel. In crash
>> > recovery kernel, the collected logs are added as elf notes to
>> > /proc/vmcore, which is copied by user space scripts for post-analysis.
>> > 
>> > The sequence of actions done by device drivers to append their device
>> > specific hardware/firmware logs to /proc/vmcore are as follows:
>> > 
>> > 1. During probe (before hardware is initialized), device drivers
>> > register to the vmcore module (via vmcore_add_device_dump()), with
>> > callback function, along with buffer size and log name needed for
>> > firmware/hardware log collection.
>> 
>> I assumed the elf notes info should be prepared during the kexec_[file_]load
>> phase. But I did not read the old comments, so I am not sure whether it has
>> been discussed or not.
>> 
>
> We must not collect dumps in crashing kernel. Adding more things in
> crash dump path risks not collecting vmcore at all. Eric had
> discussed this in more detail at:
>
> https://lkml.org/lkml/2018/3/24/319
>
> We are safe to collect dumps in the second kernel. Each device dump
> will be exported as an elf note in /proc/vmcore.

It just occurred to me there is one variation that is worth
considering.

Is the area you are looking at dumping part of a huge mmio area?
I think someone said 2GB?

If that is the case it could be worth it to simply add the needed
addresses to the range of memory we need to dump, and simply have an
elf note saying that is what happened.

>> If we do this in the 2nd kernel, one question is that the driver can be loaded later than vmcore init.
>
> Yes, drivers will add their device dumps after vmcore init.
>
>> How to guarantee the function works if vmcore reading happens before
>> the driver is loaded?
>> 
>> Also it is possible that the kdump initramfs does not contain the driver
>> module.
>> 
>> Am I missing something?
>> 
>
> Yes, driver must be in initramfs if it wants to collect and add device
> dump to /proc/vmcore in second kernel.

Eric

^ permalink raw reply

* Re: [PATCH bpf-next v3 8/8] bpf: add documentation for eBPF helpers (58-64)
From: Quentin Monnet @ 2018-04-18 14:09 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: daniel, ast, netdev, oss-drivers, linux-doc, linux-man,
	John Fastabend
In-Reply-To: <20180418153448.574c6814@redhat.com>

2018-04-18 15:34 UTC+0200 ~ Jesper Dangaard Brouer <brouer@redhat.com>
> On Tue, 17 Apr 2018 15:34:38 +0100
> Quentin Monnet <quentin.monnet@netronome.com> wrote:
> 
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 350459c583de..3d329538498f 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1276,6 +1276,50 @@ union bpf_attr {
>>   * 	Return
>>   * 		0 on success, or a negative error in case of failure.
>>   *
>> + * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
>> + * 	Description
>> + * 		Redirect the packet to the endpoint referenced by *map* at
>> + * 		index *key*. Depending on its type, his *map* can contain
>                                                     ^^^
> 
> "his" -> "this"

Thanks!

>> + * 		references to net devices (for forwarding packets through other
>> + * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
>> + * 		but this is only implemented for native XDP (with driver
>> + * 		support) as of this writing).
>> + *
>> + * 		All values for *flags* are reserved for future usage, and must
>> + * 		be left at zero.
>> + * 	Return
>> + * 		**XDP_REDIRECT** on success, or **XDP_ABORT** on error.
>> + *
> 
> "XDP_ABORT" -> "XDP_ABORTED"

Ouch. And I did the same for bpf_redirect(). Thanks for the catch.

> 
> I don't know if it's worth mentioning in the doc/man-page; that for XDP
> using bpf_redirect_map() is a HUGE performance advantage, compared to
> the bpf_redirect() call ?

It seems worth it to me. How would you simply explain the reason for this
difference?

Quentin

^ permalink raw reply

* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Michael S. Tsirkin @ 2018-04-18 14:06 UTC (permalink / raw)
  To: Vlad Yasevich
  Cc: Marcelo Ricardo Leitner, Vladislav Yasevich, netdev, linux-sctp,
	virtualization, jasowang, nhorman
In-Reply-To: <6bc762f6-d6fb-5471-2893-a888cce199f9@redhat.com>

On Tue, Apr 17, 2018 at 04:35:18PM -0400, Vlad Yasevich wrote:
> On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
> > On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
> >> Now that we have SCTP offload capabilities in the kernel, we can add
> >> them to virtio as well.  First step is SCTP checksum.
> > 
> > Thanks.
> > 
> >> As for GSO, the way sctp GSO is currently implemented buys us nothing
> >> in added support to virtio.  To add true GSO, would require a lot of
> >> re-work inside of SCTP and would require extensions to the virtio
> >> net header to carry extra sctp data.
> > 
> > Can you please elaborate more on this? Is this because SCTP GSO relies
> > on the gso skb format for knowing how to segment it instead of having
> > a list of sizes?
> > 
> 
> it's mainly because all the true segmentation, placing data into chunks,
> has already happened.  All that GSO does is allow for higher bundling
> rate between VMs. If that is all SCTP GSO is ever going to do, that's fine,
> but the goal is to do real GSO eventually and potentially reduce the
> amount of memory copying we are doing.
> If we do that, any current attempt at GSO in virtio would have to be
> deprecated and we'd need GSO2 or something like that.

Batching helps virtualization *a lot* though.
Are there actual plans for GSO2? Is it just for SCTP?

> 
> This is why, after doing the GSO support, I decided not to include it.
> 
> -vlad
> >   Marcelo
> > 

^ permalink raw reply

* [PATCH] net: qmi_wwan: add Wistron Neweb D19Q1
From: Pawel Dembicki @ 2018-04-18 14:03 UTC (permalink / raw)
  Cc: Pawel Dembicki, Bjørn Mork, netdev, linux-usb, linux-kernel

This modem is embedded in the D-Link DWR-960 router.
The OEM configuration states:

T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0
D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
P: Vendor=1435 ProdID=d191 Rev=ff.ff
S: Manufacturer=Android
S: Product=Android
S: SerialNumber=0123456789ABCDEF
C:* #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none)
E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none)
E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms
E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 5 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=(none)
E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=125us

Tested on openwrt distribution

Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index ca066b7..c853e74 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1107,6 +1107,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x1435, 0xd181, 3)},	/* Wistron NeWeb D18Q1 */
 	{QMI_FIXED_INTF(0x1435, 0xd181, 4)},	/* Wistron NeWeb D18Q1 */
 	{QMI_FIXED_INTF(0x1435, 0xd181, 5)},	/* Wistron NeWeb D18Q1 */
+	{QMI_FIXED_INTF(0x1435, 0xd191, 4)},	/* Wistron NeWeb D19Q1 */
 	{QMI_FIXED_INTF(0x16d8, 0x6003, 0)},	/* CMOTech 6003 */
 	{QMI_FIXED_INTF(0x16d8, 0x6007, 0)},	/* CMOTech CHE-628S */
 	{QMI_FIXED_INTF(0x16d8, 0x6008, 0)},	/* CMOTech CMU-301 */
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox