Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next PATCH v4] net: dummy: Introduce dummy virtual functions
From: Phil Sutter @ 2016-11-23 16:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Sabrina Dubroca

The idea for this was born when testing VF support in iproute2 which was
impeded by hardware requirements. In fact, not every VF-capable hardware
driver implements all netdev ops, so testing the interface is still hard
to do even with a well-sorted hardware shelf.

To overcome this and allow for testing the user-kernel interface, this
patch allows to turn dummy into a PF with a configurable amount of VFs.

Due to the assumption that all PFs are PCI devices, this implementation
is not completely straightforward: In order to allow for
rtnl_fill_ifinfo() to see the dummy VFs, a fake PCI parent device is
attached to the dummy netdev. This has to happen at the right spot so
register_netdevice() does not get confused. This patch abuses
ndo_fix_features callback for that. In ndo_uninit callback, the fake
parent is removed again for the same purpose.

Joint work with Sabrina Dubroca.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v3:
- Changed type of vf_mac field from unsigned char to u8.
- Column-aligned structs' field names.

Changes since v2:
- Fixed oops on reboot (need to initialize parent device mutex).
- Got rid of potential mem leak noticed by Eric Dumazet.
- Dropped stray newline insertion.

Changes since v1:
- Fixed issues reported by kbuild test robot:
  - pci_dev->sriov is only present if CONFIG_PCI_ATS is active.
  - pci_bus_type does not exist if CONFIG_PCI is not defined.
---
 drivers/net/dummy.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 203 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 69fc8409a9733..91d4858ec563a 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -34,6 +34,8 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include "../pci/pci.h"		/* for struct pci_sriov */
 #include <linux/rtnetlink.h>
 #include <net/rtnetlink.h>
 #include <linux/u64_stats_sync.h>
@@ -42,6 +44,37 @@
 #define DRV_VERSION	"1.0"
 
 static int numdummies = 1;
+static int num_vfs;
+
+static struct pci_sriov pdev_sriov;
+
+static struct pci_dev pci_pdev = {
+	.is_physfn = 0,
+#ifdef CONFIG_PCI_ATS
+	.sriov = &pdev_sriov,
+#endif
+#ifdef CONFIG_PCI
+	.dev.bus = &pci_bus_type,
+#endif
+};
+
+struct vf_data_storage {
+	u8	vf_mac[ETH_ALEN];
+	u16	pf_vlan; /* When set, guest VLAN config not allowed. */
+	u16	pf_qos;
+	__be16	vlan_proto;
+	u16	min_tx_rate;
+	u16	max_tx_rate;
+	u8	spoofchk_enabled;
+	bool	rss_query_enabled;
+	u8	trusted;
+	int	link_state;
+};
+
+struct dummy_priv {
+	int			num_vfs;
+	struct vf_data_storage	*vfinfo;
+};
 
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
@@ -91,15 +124,31 @@ static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static int dummy_dev_init(struct net_device *dev)
 {
+	struct dummy_priv *priv = netdev_priv(dev);
+
 	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
 	if (!dev->dstats)
 		return -ENOMEM;
 
+	priv->num_vfs = num_vfs;
+	priv->vfinfo = NULL;
+
+	if (!num_vfs)
+		return 0;
+
+	priv->vfinfo = kcalloc(num_vfs, sizeof(struct vf_data_storage),
+			       GFP_KERNEL);
+	if (!priv->vfinfo) {
+		free_percpu(dev->dstats);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 static void dummy_dev_uninit(struct net_device *dev)
 {
+	dev->dev.parent = NULL;
 	free_percpu(dev->dstats);
 }
 
@@ -112,6 +161,134 @@ static int dummy_change_carrier(struct net_device *dev, bool new_carrier)
 	return 0;
 }
 
+/* fake, just to set fake PCI parent after netdev_register_kobject() */
+static netdev_features_t dummy_fix_features(struct net_device *dev,
+					    netdev_features_t features)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (priv->num_vfs) {
+		dev->dev.parent = &pci_pdev.dev;
+		if (!pci_pdev.is_physfn) {
+			mutex_init(&pci_pdev.dev.mutex);
+			pci_pdev.is_physfn = 1;
+		}
+	}
+
+	return features;
+}
+
+static int dummy_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (!is_valid_ether_addr(mac) || (vf >= priv->num_vfs))
+		return -EINVAL;
+
+	memcpy(priv->vfinfo[vf].vf_mac, mac, ETH_ALEN);
+
+	return 0;
+}
+
+static int dummy_set_vf_vlan(struct net_device *dev, int vf,
+			     u16 vlan, u8 qos, __be16 vlan_proto)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if ((vf >= priv->num_vfs) || (vlan > 4095) || (qos > 7))
+		return -EINVAL;
+
+	priv->vfinfo[vf].pf_vlan = vlan;
+	priv->vfinfo[vf].pf_qos = qos;
+	priv->vfinfo[vf].vlan_proto = vlan_proto;
+
+	return 0;
+}
+
+static int dummy_set_vf_rate(struct net_device *dev, int vf, int min, int max)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	priv->vfinfo[vf].min_tx_rate = min;
+	priv->vfinfo[vf].max_tx_rate = max;
+
+	return 0;
+}
+
+static int dummy_set_vf_spoofchk(struct net_device *dev, int vf, bool val)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	priv->vfinfo[vf].spoofchk_enabled = val;
+
+	return 0;
+}
+
+static int dummy_set_vf_rss_query_en(struct net_device *dev, int vf, bool val)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	priv->vfinfo[vf].rss_query_enabled = val;
+
+	return 0;
+}
+
+static int dummy_set_vf_trust(struct net_device *dev, int vf, bool val)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	priv->vfinfo[vf].trusted = val;
+
+	return 0;
+}
+
+static int dummy_get_vf_config(struct net_device *dev,
+			       int vf, struct ifla_vf_info *ivi)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	ivi->vf = vf;
+	memcpy(&ivi->mac, priv->vfinfo[vf].vf_mac, ETH_ALEN);
+	ivi->vlan = priv->vfinfo[vf].pf_vlan;
+	ivi->qos = priv->vfinfo[vf].pf_qos;
+	ivi->spoofchk = priv->vfinfo[vf].spoofchk_enabled;
+	ivi->linkstate = priv->vfinfo[vf].link_state;
+	ivi->min_tx_rate = priv->vfinfo[vf].min_tx_rate;
+	ivi->max_tx_rate = priv->vfinfo[vf].max_tx_rate;
+	ivi->rss_query_en = priv->vfinfo[vf].rss_query_enabled;
+	ivi->trusted = priv->vfinfo[vf].trusted;
+	ivi->vlan_proto = priv->vfinfo[vf].vlan_proto;
+
+	return 0;
+}
+
+static int dummy_set_vf_link_state(struct net_device *dev, int vf, int state)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	if (vf >= priv->num_vfs)
+		return -EINVAL;
+
+	priv->vfinfo[vf].link_state = state;
+
+	return 0;
+}
+
 static const struct net_device_ops dummy_netdev_ops = {
 	.ndo_init		= dummy_dev_init,
 	.ndo_uninit		= dummy_dev_uninit,
@@ -121,6 +298,15 @@ static const struct net_device_ops dummy_netdev_ops = {
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_get_stats64	= dummy_get_stats64,
 	.ndo_change_carrier	= dummy_change_carrier,
+	.ndo_fix_features	= dummy_fix_features,
+	.ndo_set_vf_mac		= dummy_set_vf_mac,
+	.ndo_set_vf_vlan	= dummy_set_vf_vlan,
+	.ndo_set_vf_rate	= dummy_set_vf_rate,
+	.ndo_set_vf_spoofchk	= dummy_set_vf_spoofchk,
+	.ndo_set_vf_trust	= dummy_set_vf_trust,
+	.ndo_get_vf_config	= dummy_get_vf_config,
+	.ndo_set_vf_link_state	= dummy_set_vf_link_state,
+	.ndo_set_vf_rss_query_en = dummy_set_vf_rss_query_en,
 };
 
 static void dummy_get_drvinfo(struct net_device *dev,
@@ -134,6 +320,14 @@ static const struct ethtool_ops dummy_ethtool_ops = {
 	.get_drvinfo            = dummy_get_drvinfo,
 };
 
+static void dummy_free_netdev(struct net_device *dev)
+{
+	struct dummy_priv *priv = netdev_priv(dev);
+
+	kfree(priv->vfinfo);
+	free_netdev(dev);
+}
+
 static void dummy_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -141,7 +335,7 @@ static void dummy_setup(struct net_device *dev)
 	/* Initialize the device structure. */
 	dev->netdev_ops = &dummy_netdev_ops;
 	dev->ethtool_ops = &dummy_ethtool_ops;
-	dev->destructor = free_netdev;
+	dev->destructor = dummy_free_netdev;
 
 	/* Fill in device structure with ethernet-generic values. */
 	dev->flags |= IFF_NOARP;
@@ -169,6 +363,7 @@ static int dummy_validate(struct nlattr *tb[], struct nlattr *data[])
 
 static struct rtnl_link_ops dummy_link_ops __read_mostly = {
 	.kind		= DRV_NAME,
+	.priv_size	= sizeof(struct dummy_priv),
 	.setup		= dummy_setup,
 	.validate	= dummy_validate,
 };
@@ -177,12 +372,16 @@ static struct rtnl_link_ops dummy_link_ops __read_mostly = {
 module_param(numdummies, int, 0);
 MODULE_PARM_DESC(numdummies, "Number of dummy pseudo devices");
 
+module_param(num_vfs, int, 0);
+MODULE_PARM_DESC(num_vfs, "Number of dummy VFs per dummy device");
+
 static int __init dummy_init_one(void)
 {
 	struct net_device *dev_dummy;
 	int err;
 
-	dev_dummy = alloc_netdev(0, "dummy%d", NET_NAME_UNKNOWN, dummy_setup);
+	dev_dummy = alloc_netdev(sizeof(struct dummy_priv),
+				 "dummy%d", NET_NAME_UNKNOWN, dummy_setup);
 	if (!dev_dummy)
 		return -ENOMEM;
 
@@ -201,6 +400,8 @@ static int __init dummy_init_module(void)
 {
 	int i, err = 0;
 
+	pdev_sriov.num_VFs = num_vfs;
+
 	rtnl_lock();
 	err = __rtnl_link_register(&dummy_link_ops);
 	if (err < 0)
-- 
2.10.0

^ permalink raw reply related

* Re: [net-next PATCH v3] net: dummy: Introduce dummy virtual functions
From: Phil Sutter @ 2016-11-23 16:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Sabrina Dubroca
In-Reply-To: <1479907588.8455.484.camel@edumazet-glaptop3.roam.corp.google.com>

On Wed, Nov 23, 2016 at 05:26:28AM -0800, Eric Dumazet wrote:
> On Wed, 2016-11-23 at 13:41 +0100, Phil Sutter wrote:
> 
> > +struct vf_data_storage {
> > +	unsigned char vf_mac[ETH_ALEN];
> > +	u16 pf_vlan; /* When set, guest VLAN config not allowed. */
> > +	u16 pf_qos;
> > +	__be16 vlan_proto;
> > +	u16 min_tx_rate;
> > +	u16 max_tx_rate;
> > +	u8 spoofchk_enabled;
> > +	bool rss_query_enabled;
> > +	u8 trusted;
> > +	int link_state;
> > +};
> 
> Could you use proper indentation of field names ?

Sure!

> struct vf_data_storage {
> 	u8	vf_mac[ETH_ALEN];

Also thanks for suggesting u8 here. I really shouldn't copy'n'paste
code, not even from a well-maintained driver.

Thanks, Phil

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: egress all frames
From: Vivien Didelot @ 2016-11-23 15:59 UTC (permalink / raw)
  To: Stefan Eichenberger, Andrew Lunn; +Cc: Stefan Eichenberger, f.fainelli, netdev
In-Reply-To: <20161123120047.GC12698@gruene.netmodule.intranet>

Hi Stefan,

Stefan Eichenberger <stefan.eichenberger@netmodule.com> writes:

>> Now, the different families are not 100% compatible with each
>> other. We never had access to a 6097, so it has not been tested
>> recently, and we have probably broken it... My guess would be,
>> anywhere mv88e6xxx_6095_family(chip) is used, there also needs to be
>> an mv88e6xxx_6097_family(chip). But i could be wrong.
>
> I think I probably found the problem. For EDSA type switches the bit
> PORT_CONTROL_FORWARD_UNKNOWN_MC is set on the cpu port but not for DSA 
> type switches. Broadcast addresses are threaded as multicast addresses, 
> so unknown frames will never leave the switch.

The Port Control Register (0x04) is one of these registers which changes
almost completely among chip models.

Are you able to give us the layout of the port register 0x04 on your
88E6097? I don't have access to its datasheet.

For instance on 88E6185 bit 3 is reserved, on 88E6352 and 88E6390 bit
3:2 are "Egress Floods" and 0x2 means "Do not egress any frame with an
unknown unicast DA".

> Do you know if there is a reason why this bit isn't set for DSA type
> switches too? The patch would be extremely simple and it seems to work
> perfectly with this bit set on the CPU port.

All these family checks for bit masking are quite messy and ideally need
proper abstraction...

Can you give us the chunk of patch you are refering to?

Thanks,

        Vivien

^ permalink raw reply

* Re: [patch net-next v2 10/11] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Jiri Pirko @ 2016-11-23 16:04 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz,
	roopa, dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, kaber
In-Reply-To: <1479916800.4019894.797128001.743EBB6A@webmail.messagingengine.com>

Wed, Nov 23, 2016 at 05:00:00PM CET, hannes@stressinduktion.org wrote:
>On Wed, Nov 23, 2016, at 15:48, Jiri Pirko wrote:
>> From: Ido Schimmel <idosch@mellanox.com>
>> 
>> Make sure the device has a complete view of the FIB tables by invoking
>> their dump during module init.
>> 
>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 16
>>  ++++++++++++++++
>>  1 file changed, 16 insertions(+)
>> 
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
>> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
>> index 14bed1d..36a71d2 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
>> @@ -2027,6 +2027,21 @@ static int mlxsw_sp_router_fib_event(struct
>> notifier_block *nb,
>>  	return NOTIFY_DONE;
>>  }
>>  
>> +static void mlxsw_sp_router_fib_dump(struct mlxsw_sp *mlxsw_sp)
>> +{
>> +       while (!fib_notifier_dump(&mlxsw_sp->fib_nb)) {
>> +               /* Flush pending FIB notifications and then flush the
>> +                * device's table before requesting another dump. Do
>> +                * that with RTNL held, as FIB notification block is
>> +                * already registered.
>> +                */
>> +               mlxsw_core_flush_owq();
>> +               rtnl_lock();
>> +               mlxsw_sp_router_fib_flush(mlxsw_sp);
>> +               rtnl_unlock();
>> +       }
>> +}
>
>I think it is fine to use this kind of synchronization.
>
>But I think that this part of the logic still belongs into the core

Core does not know how driver handles the offloaded fibs. So only driver
knows how/if he needs to do flush in case of retry.


>kernel. I still think it could happen that we will loop here
>indefinitely because of a lot of routing updates and as such would need
>to abort this loop after a number of tries.

In theory, it is possible, howevery quite unlikely.

>
>I would like that the kernel has one function to do this decision
>instead of later patching all users of this API. Do you think it is
>worth it?

For the reason I stated above, I'm not sure that could be done...

^ permalink raw reply

* Re: [PATCH net-next 1/4] net: mvneta: Convert to be 64 bits compatible
From: Marcin Wojtas @ 2016-11-23 16:02 UTC (permalink / raw)
  To: Gregory CLEMENT
  Cc: Jisheng Zhang, Thomas Petazzoni, Jason Cooper, Arnd Bergmann,
	Andrew Lunn, netdev, linux-kernel, David S. Miller,
	linux-arm-kernel@lists.infradead.org, Sebastian Hesselbarth
In-Reply-To: <87mvgqmjiy.fsf@free-electrons.com>

Hi Gregory,

2016-11-23 14:07 GMT+01:00 Gregory CLEMENT <gregory.clement@free-electrons.com>:
> Hi Jisheng, Arnd,
>
>
> Thanks for your feedback.
>
>
>  On mer., nov. 23 2016, Arnd Bergmann <arnd@arndb.de> wrote:
>
>> On Wednesday, November 23, 2016 5:53:41 PM CET Jisheng Zhang wrote:
>>> On Tue, 22 Nov 2016 22:04:12 +0100 Arnd Bergmann wrote:
>>>
>>> > On Tuesday, November 22, 2016 5:48:41 PM CET Gregory CLEMENT wrote:
>>> > > +#ifdef CONFIG_64BIT
>>> > > +       void *data_tmp;
>>> > > +
>>> > > +       /* In Neta HW only 32 bits data is supported, so in order to
>>> > > +        * obtain whole 64 bits address from RX descriptor, we store
>>> > > +        * the upper 32 bits when allocating buffer, and put it back
>>> > > +        * when using buffer cookie for accessing packet in memory.
>>> > > +        * Frags should be allocated from single 'memory' region,
>>> > > +        * hence common upper address half should be sufficient.
>>> > > +        */
>>> > > +       data_tmp = mvneta_frag_alloc(pp->frag_size);
>>> > > +       if (data_tmp) {
>>> > > +               pp->data_high = (u64)upper_32_bits((u64)data_tmp) << 32;
>>> > > +               mvneta_frag_free(pp->frag_size, data_tmp);
>>> > > +       }
>>> > >
>>> >
>>> > How does this work when the region spans a n*4GB address boundary?
>>>
>>> indeed. We also make use of this driver on 64bit platforms. We use
>>> different solution to make the driver 64bit safe.
>>>
>>> solA: make use of the reserved field in the mvneta_rx_desc, such
>>> as reserved2 etc. Yes, the field is marked as "for future use, PnC", but
>>> now it's not used at all. This is one possible solution however.
>>
>> Right, this sounds like the most straightforward choice.
>
> The PnC (which stands for Parsing and Classification) is not used yet
> indeed but this field will be needed when we will enable it. It is
> something we want to do but it is not planned in a near future. However
> from the datasheets I have it seems only present on the Armada XP. It is
> not mentioned on datasheets for the Armada 38x or the Armada 3700.
>

It is not mentioned in A38x spec, but this SoC has exactly the same
PnC as Armada XP (they differ only with used SRAM details). I wouldn't
be surprised if it was supported on A3700 as well.

> That would mean it was safe to use on of this field in 64-bits mode on
> the Armada 3700.
>
> So I am going to take this approach.
>

I think for now it's safe and is much easier than handling extra
software ring for virtual addresses.

Best regards,
Marcin

^ permalink raw reply

* Re: [PATCH ethtool v4 1/2] ethtool-copy.h:sync with net
From: John W. Linville @ 2016-11-23 15:55 UTC (permalink / raw)
  To: Allan W. Nielsen; +Cc: netdev, andrew, f.fainelli, raju.lakkaraju
In-Reply-To: <20161123152447.GC13179@tuxdriver.com>

On Wed, Nov 23, 2016 at 10:24:47AM -0500, John W. Linville wrote:
> It looks like this patch somehow misses the following commit:
> 
> commit 85a624403c77c3f074931aefdfced59f61b668cb
> Author: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Date:   Thu Oct 13 16:13:55 2016 -0700
> 
>     ethtool: silence warning on bit loss
>     
>     Sparse was complaining when we went to prototype some code
>     using ethtool_cmd_speed_set and SPEED_100000, which uses
>     the upper 16 bits of __u32 speed for the first time.
>     
>     CHECK
>     ...
>     .../uapi/linux/ethtool.h:123:28: warning:
>       cast truncates bits from constant value (186a0 becomes 86a0)
>     
>     The warning is actually bogus, as no bits are really lost, but
>     we can get rid of the sparse warning with this one small change.
>     
>     Reported-by: Preethi Banala <preethi.banala@intel.com>
>     Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Plus, the referenced commit does not seem to exist in the net-next tree.
> 
> Please resubmit with the complete history and correct commit IDs -- thanks!

I mean, of course, all the preceding changes accounted for -- not
every commit explicitly listed. The top commit does, of course,
need to reflect the correct commit ID from Dave's/Linus's tree.
 
John
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* Re: [patch net-next v2 10/11] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Hannes Frederic Sowa @ 2016-11-23 16:00 UTC (permalink / raw)
  To: Jiri Pirko, netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, kaber
In-Reply-To: <1479912528-4636-1-git-send-email-jiri@resnulli.us>

On Wed, Nov 23, 2016, at 15:48, Jiri Pirko wrote:
> From: Ido Schimmel <idosch@mellanox.com>
> 
> Make sure the device has a complete view of the FIB tables by invoking
> their dump during module init.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 16
>  ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> index 14bed1d..36a71d2 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> @@ -2027,6 +2027,21 @@ static int mlxsw_sp_router_fib_event(struct
> notifier_block *nb,
>  	return NOTIFY_DONE;
>  }
>  
> +static void mlxsw_sp_router_fib_dump(struct mlxsw_sp *mlxsw_sp)
> +{
> +       while (!fib_notifier_dump(&mlxsw_sp->fib_nb)) {
> +               /* Flush pending FIB notifications and then flush the
> +                * device's table before requesting another dump. Do
> +                * that with RTNL held, as FIB notification block is
> +                * already registered.
> +                */
> +               mlxsw_core_flush_owq();
> +               rtnl_lock();
> +               mlxsw_sp_router_fib_flush(mlxsw_sp);
> +               rtnl_unlock();
> +       }
> +}

I think it is fine to use this kind of synchronization.

But I think that this part of the logic still belongs into the core
kernel. I still think it could happen that we will loop here
indefinitely because of a lot of routing updates and as such would need
to abort this loop after a number of tries.

I would like that the kernel has one function to do this decision
instead of later patching all users of this API. Do you think it is
worth it?

Bye,
Hannes

^ permalink raw reply

* Re: sendfile from 9p fs into af_alg
From: Al Viro @ 2016-11-23 15:53 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, netdev, Daniel Borkmann, Martin KaFai Lau
In-Reply-To: <20161123085809.GA46992@ast-mbp.thefacebook.com>

On Wed, Nov 23, 2016 at 12:58:11AM -0800, Alexei Starovoitov wrote:

> if I read it correctly 9p actually responded with 8192 bytes of requests...
> whereas the file size was 9624.
> For large file sizes (in megabytes) the difference between what
> sendfile is reporting and actual file size can be 3x.
> In the small file case (like above dump) it looks rounded to page size for some reason.

OK, I think I see one bug in there; could you check if this gets it back to
normal?

diff --git a/fs/splice.c b/fs/splice.c
index dcaf185..5a7750b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -408,7 +408,8 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res <= 0)
 		return -ENOMEM;
 
-	nr_pages = res / PAGE_SIZE;
+	BUG_ON(dummy);
+	nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
 
 	vec = __vec;
 	if (nr_pages > PIPE_DEF_BUFFERS) {

^ permalink raw reply related

* [PATCH v9 6/6] samples: bpf: add userspace example for attaching eBPF programs to cgroups
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun, daniel, ast
  Cc: davem, kafai, fw, pablo, harald, netdev, sargun, cgroups,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel@zonque.org>

Add a simple userpace program to demonstrate the new API to attach eBPF
programs to cgroups. This is what it does:

 * Create arraymap in kernel with 4 byte keys and 8 byte values

 * Load eBPF program

   The eBPF program accesses the map passed in to store two pieces of
   information. The number of invocations of the program, which maps
   to the number of packets received, is stored to key 0. Key 1 is
   incremented on each iteration by the number of bytes stored in
   the skb.

 * Detach any eBPF program previously attached to the cgroup

 * Attach the new program to the cgroup using BPF_PROG_ATTACH

 * Once a second, read map[0] and map[1] to see how many bytes and
   packets were seen on any socket of tasks in the given cgroup.

The program takes a cgroup path as 1st argument, and either "ingress"
or "egress" as 2nd. Optionally, "drop" can be passed as 3rd argument,
which will make the generated eBPF program return 0 instead of 1, so
the kernel will drop the packet.

libbpf gained two new wrappers for the new syscall commands.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile            |   2 +
 samples/bpf/libbpf.c            |  21 ++++++
 samples/bpf/libbpf.h            |   3 +
 samples/bpf/test_cgrp2_attach.c | 147 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 173 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_attach.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 12b7304..e4cdc74 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -22,6 +22,7 @@ hostprogs-y += spintest
 hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
+hostprogs-y += test_cgrp2_attach
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -49,6 +50,7 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 9969e35..9ce707b 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -104,6 +104,27 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 }
 
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_bpf_fd = prog_fd,
+		.attach_type = type,
+	};
+
+	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
+}
+
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_type = type,
+	};
+
+	return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
 int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr = {
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb6..d0a799a 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -15,6 +15,9 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
 		  const char *license, int kern_version);
 
+int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type);
+int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
+
 int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
 
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
new file mode 100644
index 0000000..63ef208
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -0,0 +1,147 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program accesses the map passed in to store two pieces of
+ *   information. The number of invocations of the program, which maps
+ *   to the number of packets received, is stored to key 0. Key 1 is
+ *   incremented on each iteration by the number of bytes stored in
+ *   the skb.
+ *
+ * - Detaches any eBPF program previously attached to the cgroup
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ *   packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+enum {
+	MAP_KEY_PACKETS,
+	MAP_KEY_BYTES,
+};
+
+static int prog_load(int map_fd, int verdict)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */
+
+		/* Count packets */
+		BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+		/* Count bytes */
+		BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB,
+			     prog, sizeof(prog), "GPL", 0);
+}
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s <cg-path> <egress|ingress> [drop]\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, map_fd, prog_fd, key, ret;
+	long long pkt_cnt, byte_cnt;
+	enum bpf_attach_type type;
+	int verdict = 1;
+
+	if (argc < 3)
+		return usage(argv[0]);
+
+	if (strcmp(argv[2], "ingress") == 0)
+		type = BPF_CGROUP_INET_INGRESS;
+	else if (strcmp(argv[2], "egress") == 0)
+		type = BPF_CGROUP_INET_EGRESS;
+	else
+		return usage(argv[0]);
+
+	if (argc > 3 && strcmp(argv[3], "drop") == 0)
+		verdict = 0;
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
+				sizeof(key), sizeof(byte_cnt),
+				256, 0);
+	if (map_fd < 0) {
+		printf("Failed to create map: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	prog_fd = prog_load(map_fd, verdict);
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+	if (prog_fd < 0) {
+		printf("Failed to load prog: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_detach(cg_fd, type);
+	printf("bpf_prog_detach() returned '%s' (%d)\n", strerror(errno), errno);
+
+	ret = bpf_prog_attach(prog_fd, cg_fd, type);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	while (1) {
+		key = MAP_KEY_PACKETS;
+		assert(bpf_lookup_elem(map_fd, &key, &pkt_cnt) == 0);
+
+		key = MAP_KEY_BYTES;
+		assert(bpf_lookup_elem(map_fd, &key, &byte_cnt) == 0);
+
+		printf("cgroup received %lld packets, %lld bytes\n",
+		       pkt_cnt, byte_cnt);
+		sleep(1);
+	}
+
+	return EXIT_SUCCESS;
+}
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 5/6] net: ipv4, ipv6: run cgroup eBPF egress programs
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun, daniel, ast
  Cc: davem, kafai, fw, pablo, harald, netdev, sargun, cgroups,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel@zonque.org>

If the cgroup associated with the receiving socket has an eBPF
programs installed, run them from ip_output(), ip6_output() and
ip_mc_output(). From mentioned functions we have two socket contexts
as per 7026b1ddb6b8 ("netfilter: Pass socket pointer down through
okfn()."). We explicitly need to use sk instead of skb->sk here,
since otherwise the same program would run multiple times on egress
when encap devices are involved, which is not desired in our case.

eBPF programs used in this context are expected to either return 1 to
let the packet pass, or != 1 to drop them. The programs have access to
the skb through bpf_skb_load_bytes(), and the payload starts at the
network headers (L3).

Note that cgroup_bpf_run_filter() is stubbed out as static inline nop
for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if
the feature is unused.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 net/ipv4/ip_output.c  | 26 ++++++++++++++++++++++++--
 net/ipv6/ip6_output.c |  9 +++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 03e7f73..ed0c276 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@
 #include <net/checksum.h>
 #include <net/inetpeer.h>
 #include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_bridge.h>
@@ -281,6 +282,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	unsigned int mtu;
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
 
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
@@ -299,6 +307,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
 	return ip_finish_output2(net, sk, skb);
 }
 
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+			       struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
+	return dev_loopback_xmit(net, sk, skb);
+}
+
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
@@ -336,7 +358,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			if (newskb)
 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 					net, sk, newskb, NULL, newskb->dev,
-					dev_loopback_xmit);
+					ip_mc_finish_output);
 		}
 
 		/* Multicasts with ttl 0 must not go beyond the host */
@@ -352,7 +374,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		if (newskb)
 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 				net, sk, newskb, NULL, newskb->dev,
-				dev_loopback_xmit);
+				ip_mc_finish_output);
 	}
 
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6001e78..ddeb41e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 
+#include <linux/bpf-cgroup.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 
@@ -131,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 
 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 	    dst_allfrag(skb_dst(skb)) ||
 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 2/6] cgroup: add support for eBPF programs
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun, daniel, ast
  Cc: davem, kafai, fw, pablo, harald, netdev, sargun, cgroups,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel@zonque.org>

This patch adds two sets of eBPF program pointers to struct cgroup.
One for such that are directly pinned to a cgroup, and one for such
that are effective for it.

To illustrate the logic behind that, assume the following example
cgroup hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h  |  79 +++++++++++++++++++++
 include/linux/cgroup-defs.h |   4 ++
 init/Kconfig                |  12 ++++
 kernel/bpf/Makefile         |   1 +
 kernel/bpf/cgroup.c         | 167 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c             |  18 +++++
 6 files changed, 281 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 0000000..ec80d0c
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/bpf.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+	/*
+	 * Store two sets of bpf_prog pointers, one for programs that are
+	 * pinned directly to this cgroup, and one for those that are effective
+	 * when this cgroup is accessed.
+	 */
+	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled)						\
+		__ret = __cgroup_bpf_run_filter(sk, skb,		\
+						BPF_CGROUP_INET_INGRESS); \
+									\
+	__ret;								\
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
+		typeof(sk) __sk = sk_to_full_sk(sk);			\
+		if (sk_fullsock(__sk))					\
+			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
+						BPF_CGROUP_INET_EGRESS); \
+	}								\
+	__ret;								\
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+				      struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5b17de6..861b467 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,9 @@ struct cgroup {
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
+	/* used to store eBPF programs */
+	struct cgroup_bpf bpf;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/init/Kconfig b/init/Kconfig
index 34407f1..405120b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1154,6 +1154,18 @@ config CGROUP_PERF
 
 	  Say N if unsure.
 
+config CGROUP_BPF
+	bool "Support for eBPF programs attached to cgroups"
+	depends on BPF_SYSCALL && SOCK_CGROUP_DATA
+	help
+	  Allow attaching eBPF programs to a cgroup using the bpf(2)
+	  syscall command BPF_PROG_ATTACH.
+
+	  In which context these programs are accessed depends on the type
+	  of attachment. For instance, programs that are attached using
+	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+	  inet sockets.
+
 config CGROUP_DEBUG
 	bool "Example controller"
 	default n
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d..b22256b 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 0000000..a0ab43f
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,167 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+		struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+		if (prog) {
+			bpf_prog_put(prog);
+			static_branch_dec(&cgroup_bpf_enabled_key);
+		}
+	}
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+		struct bpf_prog *e;
+
+		e = rcu_dereference_protected(parent->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+		rcu_assign_pointer(cgrp->bpf.effective[type], e);
+	}
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is %NULL, this function attaches a new program to the cgroup and
+ * releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type)
+{
+	struct bpf_prog *old_prog, *effective;
+	struct cgroup_subsys_state *pos;
+
+	old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+	effective = (!prog && parent) ?
+		rcu_dereference_protected(parent->bpf.effective[type],
+					  lockdep_is_held(&cgroup_mutex)) :
+		prog;
+
+	css_for_each_descendant_pre(pos, &cgrp->self) {
+		struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+		/* skip the subtree if the descendant has its own program */
+		if (desc->bpf.prog[type] && desc != cgrp)
+			pos = css_rightmost_descendant(pos);
+		else
+			rcu_assign_pointer(desc->bpf.effective[type],
+					   effective);
+	}
+
+	if (prog)
+		static_branch_inc(&cgroup_bpf_enabled_key);
+
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		static_branch_dec(&cgroup_bpf_enabled_key);
+	}
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socken sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	if (sk->sk_family != AF_INET &&
+	    sk->sk_family != AF_INET6)
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog) {
+		unsigned int offset = skb->data - skb_network_header(skb);
+
+		__skb_push(skb, offset);
+		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+		__skb_pull(skb, offset);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 85bc9be..2ee9ec3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
+
+		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
+	if (parent)
+		cgroup_bpf_inherit(cgrp, parent);
+
 	cgroup_propagate_control(cgrp);
 
 	/* @cgrp doesn't have dir yet so the following will only create csses */
@@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
 }
 subsys_initcall(cgroup_namespaces_init);
 
+#ifdef CONFIG_CGROUP_BPF
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	mutex_lock(&cgroup_mutex);
+	__cgroup_bpf_update(cgrp, parent, prog, type);
+	mutex_unlock(&cgroup_mutex);
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 3/6] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun, daniel, ast
  Cc: davem, kafai, fw, pablo, harald, netdev, sargun, cgroups,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel@zonque.org>

Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and
BPF_PROG_DETACH which allow attaching and detaching eBPF programs
to a target.

On the API level, the target could be anything that has an fd in
userspace, hence the name of the field in union bpf_attr is called
'target_fd'.

When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
expected to be a valid file descriptor of a cgroup v2 directory which
has the bpf controller enabled. These are the only use-cases
implemented by this patch at this point, but more can be added.

If a program of the given type already exists in the given cgroup,
the program is swapped automically, so userspace does not have to drop
an existing program first before installing a new one, which would
otherwise leave a gap in which no program is attached.

For more information on the propagation logic to subcgroups, please
refer to the bpf cgroup controller implementation.

The API is guarded by CAP_NET_ADMIN.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  8 +++++
 kernel/bpf/syscall.c     | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1f3e6f1..f31b655 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_PROG_ATTACH,
+	BPF_PROG_DETACH,
 };
 
 enum bpf_map_type {
@@ -150,6 +152,12 @@ union bpf_attr {
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
 	};
+
+	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+		__u32		target_fd;	/* container object to attach to */
+		__u32		attach_bpf_fd;	/* eBPF program to attach */
+		__u32		attach_type;
+	};
 } __attribute__((aligned(8)));
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..1814c01 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -822,6 +822,77 @@ static int bpf_obj_get(const union bpf_attr *attr)
 	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
 }
 
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (CHECK_ATTR(BPF_PROG_ATTACH))
+		return -EINVAL;
+
+	switch (attr->attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+		prog = bpf_prog_get_type(attr->attach_bpf_fd,
+					 BPF_PROG_TYPE_CGROUP_SKB);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		cgrp = cgroup_get_from_fd(attr->target_fd);
+		if (IS_ERR(cgrp)) {
+			bpf_prog_put(prog);
+			return PTR_ERR(cgrp);
+		}
+
+		cgroup_bpf_update(cgrp, prog, attr->attach_type);
+		cgroup_put(cgrp);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+	struct cgroup *cgrp;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (CHECK_ATTR(BPF_PROG_DETACH))
+		return -EINVAL;
+
+	switch (attr->attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+		cgrp = cgroup_get_from_fd(attr->target_fd);
+		if (IS_ERR(cgrp))
+			return PTR_ERR(cgrp);
+
+		cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+		cgroup_put(cgrp);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -888,6 +959,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
+
+#ifdef CONFIG_CGROUP_BPF
+	case BPF_PROG_ATTACH:
+		err = bpf_prog_attach(&attr);
+		break;
+	case BPF_PROG_DETACH:
+		err = bpf_prog_detach(&attr);
+		break;
+#endif
+
 	default:
 		err = -EINVAL;
 		break;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 1/6] bpf: add new prog type for cgroup socket filtering
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun, daniel, ast
  Cc: davem, kafai, fw, pablo, harald, netdev, sargun, cgroups,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel@zonque.org>

This program type is similar to BPF_PROG_TYPE_SOCKET_FILTER, except that
it does not allow BPF_LD_[ABS|IND] instructions and hooks up the
bpf_skb_load_bytes() helper.

Programs of this type will be attached to cgroups for network filtering
and accounting.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  9 +++++++++
 net/core/filter.c        | 23 +++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b..1f3e6f1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -96,8 +96,17 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_TRACEPOINT,
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
+	BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+enum bpf_attach_type {
+	BPF_CGROUP_INET_INGRESS,
+	BPF_CGROUP_INET_EGRESS,
+	__MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
 #define BPF_PSEUDO_MAP_FD	1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/net/core/filter.c b/net/core/filter.c
index 00351cd..e3813d6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2576,6 +2576,17 @@ xdp_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2938,6 +2949,12 @@ static const struct bpf_verifier_ops xdp_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops cg_skb_ops = {
+	.get_func_proto		= cg_skb_func_proto,
+	.is_valid_access	= sk_filter_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2958,12 +2975,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_XDP,
 };
 
+static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+	.ops	= &cg_skb_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SKB,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
+	bpf_register_prog_type(&cg_skb_type);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 4/6] net: filter: run cgroup eBPF ingress programs
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun-b10kYP2dOMg, daniel-FeC+5ew28dpmcu3hnIyYJQ,
	ast-b10kYP2dOMg
  Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, pablo-Cap9r6Oaw4JrovVCs/uTlw,
	harald-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	sargun-GaZTRHToo+CzQB+pC5nmwQ, cgroups-u79uwXL29TY76Z2rM5mHXA,
	Daniel Mack
In-Reply-To: <1479916350-28462-1-git-send-email-daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>

If the cgroup associated with the receiving socket has an eBPF
programs installed, run them from sk_filter_trim_cap().

eBPF programs used in this context are expected to either return 1 to
let the packet pass, or != 1 to drop them. The programs have access to
the skb through bpf_skb_load_bytes(), and the payload starts at the
network headers (L3).

Note that cgroup_bpf_run_filter() is stubbed out as static inline nop
for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if
the feature is unused.

Signed-off-by: Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
Acked-by: Alexei Starovoitov <ast-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 net/core/filter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index e3813d6..474b486 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -78,6 +78,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
 		return -ENOMEM;
 
+	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+	if (err)
+		return err;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v9 0/6] Add eBPF hooks for cgroups
From: Daniel Mack @ 2016-11-23 15:52 UTC (permalink / raw)
  To: htejun-b10kYP2dOMg, daniel-FeC+5ew28dpmcu3hnIyYJQ,
	ast-b10kYP2dOMg
  Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, pablo-Cap9r6Oaw4JrovVCs/uTlw,
	harald-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	sargun-GaZTRHToo+CzQB+pC5nmwQ, cgroups-u79uwXL29TY76Z2rM5mHXA,
	Daniel Mack

This is v9 of the patch set to allow eBPF programs for network
filtering and accounting to be attached to cgroups, so that they apply
to all sockets of all tasks placed in that cgroup. The logic also
allows to be extendeded for other cgroup based eBPF logic.

Again, only minor details are updated in this version.

Thanks,
Daniel

Changes from v8:

* Move the egress hooks into ip_finish_output() and ip6_finish_output()
  so they run after the netfilter hooks. For IPv4 multicast, add a new
  ip_mc_finish_output() callback that is invoked on success by
  netfilter, and call the eBPF program from there.

Changes from v7:

* Replace the static inline function cgroup_bpf_run_filter() with
  two specific macros for ingress and egress.  This addresses David
  Miller's concern regarding skb->sk vs. sk in the egress path.
  Thanks a lot to Daniel Borkmann and Alexei Starovoitov for the
  suggestions.

Changes from v6:

* Rebased to 4.9-rc2

* Add EXPORT_SYMBOL(__cgroup_bpf_run_filter). The kbuild test robot
  now succeeds in building this version of the patch set.

* Switch from bpf_prog_run_save_cb() to bpf_prog_run_clear_cb() to not
  tamper with the contents of skb->cb[]. Pointed out by Daniel
  Borkmann.

* Use sk_to_full_sk() in the egress path, as suggested by Daniel
  Borkmann.

* Renamed BPF_PROG_TYPE_CGROUP_SOCKET to BPF_PROG_TYPE_CGROUP_SKB, as
  requested by David Ahern.

* Added Alexei's Acked-by tags.

Changes from v5:

* The eBPF programs now operate on L3 rather than on L2 of the packets,
  and the egress hooks were moved from __dev_queue_xmit() to
  ip*_output().

* For BPF_PROG_TYPE_CGROUP_SOCKET, disallow direct access to the skb
  through BPF_LD_[ABS|IND] instructions, but hook up the
  bpf_skb_load_bytes() access helper instead. Thanks to Daniel Borkmann
  for the help.

Changes from v4:

* Plug an skb leak when dropping packets due to eBPF verdicts in
  __dev_queue_xmit(). Spotted by Daniel Borkmann.

* Check for sk_fullsock(sk) in __cgroup_bpf_run_filter() so we don't
  operate on timewait or request sockets. Suggested by Daniel Borkmann.

* Add missing @parent parameter in kerneldoc of __cgroup_bpf_update().
  Spotted by Rami Rosen.

* Include linux/jump_label.h from bpf-cgroup.h to fix a kbuild error.

Changes from v3:

* Dropped the _FILTER suffix from BPF_PROG_TYPE_CGROUP_SOCKET_FILTER,
  renamed BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS to
  BPF_CGROUP_INET_{IN,E}GRESS and alias BPF_MAX_ATTACH_TYPE to
  __BPF_MAX_ATTACH_TYPE, as suggested by Daniel Borkmann.

* Dropped the attach_flags member from the anonymous struct for BPF
  attach operations in union bpf_attr. They can be added later on via
  CHECK_ATTR. Requested by Daniel Borkmann and Alexei.

* Release old_prog at the end of __cgroup_bpf_update rather that at
  the beginning to fix a race gap between program updates and their
  users. Spotted by Daniel Borkmann.

* Plugged an skb leak when dropping packets on the egress path.
  Spotted by Daniel Borkmann.

* Add cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org to the loop, as suggested by Rami Rosen.

* Some minor coding style adoptions not worth mentioning in particular.

Changes from v2:

* Fixed the RCU locking details Tejun pointed out.

* Assert bpf_attr.flags == 0 in BPF_PROG_DETACH syscall handler.

Changes from v1:

* Moved all bpf specific cgroup code into its own file, and stub
  out related functions for !CONFIG_CGROUP_BPF as static inline nops.
  This way, the call sites are not cluttered with #ifdef guards while
  the feature remains compile-time configurable.

* Implemented the new scheme proposed by Tejun. Per cgroup, store one
  set of pointers that are pinned to the cgroup, and one for the
  programs that are effective. When a program is attached or detached,
  the change is propagated to all the cgroup's descendants. If a
  subcgroup has its own pinned program, skip the whole subbranch in
  order to allow delegation models.

* The hookup for egress packets is now done from __dev_queue_xmit().

* A static key is now used in both the ingress and egress fast paths
  to keep performance penalties close to zero if the feature is
  not in use.

* Overall cleanup to make the accessors use the program arrays.
  This should make it much easier to add new program types, which
  will then automatically follow the pinned vs. effective logic.

* Fixed locking issues, as pointed out by Eric Dumazet and Alexei
  Starovoitov. Changes to the program array are now done with
  xchg() and are protected by cgroup_mutex.

* eBPF programs are now expected to return 1 to let the packet pass,
  not >= 0. Pointed out by Alexei.

* Operation is now limited to INET sockets, so local AF_UNIX sockets
  are not affected. The enum members are renamed accordingly. In case
  other socket families should be supported, this can be extended in
  the future.

* The sample program learned to support both ingress and egress, and
  can now optionally make the eBPF program drop packets by making it
  return 0.

Daniel Mack (6):
  bpf: add new prog type for cgroup socket filtering
  cgroup: add support for eBPF programs
  bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
  net: filter: run cgroup eBPF ingress programs
  net: ipv4, ipv6: run cgroup eBPF egress programs
  samples: bpf: add userspace example for attaching eBPF programs to
    cgroups

Daniel Mack (6):
  bpf: add new prog type for cgroup socket filtering
  cgroup: add support for eBPF programs
  bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
  net: filter: run cgroup eBPF ingress programs
  net: ipv4, ipv6: run cgroup eBPF egress programs
  samples: bpf: add userspace example for attaching eBPF programs to
    cgroups

 include/linux/bpf-cgroup.h      |  79 +++++++++++++++++++
 include/linux/cgroup-defs.h     |   4 +
 include/uapi/linux/bpf.h        |  17 ++++
 init/Kconfig                    |  12 +++
 kernel/bpf/Makefile             |   1 +
 kernel/bpf/cgroup.c             | 167 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c            |  81 +++++++++++++++++++
 kernel/cgroup.c                 |  18 +++++
 net/core/filter.c               |  27 +++++++
 net/ipv4/ip_output.c            |  26 ++++++-
 net/ipv6/ip6_output.c           |   9 +++
 samples/bpf/Makefile            |   2 +
 samples/bpf/libbpf.c            |  21 +++++
 samples/bpf/libbpf.h            |   3 +
 samples/bpf/test_cgrp2_attach.c | 147 +++++++++++++++++++++++++++++++++++
 15 files changed, 612 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c
 create mode 100644 samples/bpf/test_cgrp2_attach.c

-- 
2.7.4

^ permalink raw reply

* Re: [net] rtnetlink: fix the wrong minimal dump size getting from rtnl_calcit()
From: Roopa Prabhu @ 2016-11-23 15:38 UTC (permalink / raw)
  To: Zhang Shengju; +Cc: netdev, davem
In-Reply-To: <1479795268-11801-1-git-send-email-zhangshengju@cmss.chinamobile.com>

On 11/21/16, 10:14 PM, Zhang Shengju wrote:
> For RT netlink, calcit() function should return the minimal size for
> netlink dump message. This will make sure that dump message for every
> network device can be stored.
>
> Currently, rtnl_calcit() function doesn't account the size of header of
> netlink message, this patch will fix it.
>
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> ---
>  net/core/rtnetlink.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index f4b9350..09e115b 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -2734,7 +2734,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
>  						           ext_filter_mask));
>  	}
>  
> -	return min_ifinfo_dump_size;
> +	return nlmsg_total_size(min_ifinfo_dump_size);
>  }
>  
>  static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)

This looks fine to me. However, looking at the git log, since this code has been
around for so long, it seems like it can go into net-next. But, that's just my 2c.
 

^ permalink raw reply

* Re: [PATCH] net: fix bogus cast in skb_pagelen() and use unsigned variables
From: Alexey Dobriyan @ 2016-11-23 15:35 UTC (permalink / raw)
  To: David Laight; +Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB02265B3@AcuExch.aculab.com>

On Wed, Nov 23, 2016 at 3:49 PM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexey Dobriyan
>> Sent: 19 November 2016 01:08
> ...
>> -     for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
>> +     for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
>>               len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
>
> Think I'd use:
>         for (i = skb_shinfo(skb)->nr_frags; i-- != 0; )

This kind of diverges from canonical loop form:

  for (init; temination condition: iterator)

by shifting everything into termination part.

  A

^ permalink raw reply

* Re: [PATCH ethtool v4 1/2] ethtool-copy.h:sync with net
From: John W. Linville @ 2016-11-23 15:24 UTC (permalink / raw)
  To: Allan W. Nielsen; +Cc: netdev, andrew, f.fainelli, raju.lakkaraju
In-Reply-To: <1479846737-18669-2-git-send-email-allan.nielsen@microsemi.com>

It looks like this patch somehow misses the following commit:

commit 85a624403c77c3f074931aefdfced59f61b668cb
Author: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date:   Thu Oct 13 16:13:55 2016 -0700

    ethtool: silence warning on bit loss
    
    Sparse was complaining when we went to prototype some code
    using ethtool_cmd_speed_set and SPEED_100000, which uses
    the upper 16 bits of __u32 speed for the first time.
    
    CHECK
    ...
    .../uapi/linux/ethtool.h:123:28: warning:
      cast truncates bits from constant value (186a0 becomes 86a0)
    
    The warning is actually bogus, as no bits are really lost, but
    we can get rid of the sparse warning with this one small change.
    
    Reported-by: Preethi Banala <preethi.banala@intel.com>
    Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

Plus, the referenced commit does not seem to exist in the net-next tree.

Please resubmit with the complete history and correct commit IDs -- thanks!

John

On Tue, Nov 22, 2016 at 09:32:16PM +0100, Allan W. Nielsen wrote:
> From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
> 
> This covers kernel changes upto:
> 
> commit f5a4732f85613b3fb43f8bc33a017e3db3b3605a
> Author: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
> Date:   Wed Nov 9 16:33:09 2016 +0530
> 
>     ethtool: (uapi) Add ETHTOOL_PHY_DOWNSHIFT to PHY tunables
> 
>     For operation in cabling environments that are incompatible with
>     1000BASE-T, PHY device may provide an automatic link speed downshift
>     operation. When enabled, the device automatically changes its 1000BASE-T
>     auto-negotiation to the next slower speed after a configured number of
>     failed attempts at 1000BASE-T.  This feature is useful in setting up in
>     networks using older cable installations that include only pairs A and B,
>     and not pairs C and D.
> 
>     Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
>     Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
> 
> Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
> Acked-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  ethtool-copy.h | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/ethtool-copy.h b/ethtool-copy.h
> index 70748f5..2e2448f 100644
> --- a/ethtool-copy.h
> +++ b/ethtool-copy.h
> @@ -247,6 +247,19 @@ struct ethtool_tunable {
>  	void	*data[0];
>  };
>  
> +#define DOWNSHIFT_DEV_DEFAULT_COUNT	0xff
> +#define DOWNSHIFT_DEV_DISABLE		0
> +
> +enum phy_tunable_id {
> +	ETHTOOL_PHY_ID_UNSPEC,
> +	ETHTOOL_PHY_DOWNSHIFT,
> +	/*
> +	 * Add your fresh new phy tunable attribute above and remember to update
> +	 * phy_tunable_strings[] in net/core/ethtool.c
> +	 */
> +	__ETHTOOL_PHY_TUNABLE_COUNT,
> +};
> +
>  /**
>   * struct ethtool_regs - hardware register dump
>   * @cmd: Command number = %ETHTOOL_GREGS
> @@ -547,6 +560,7 @@ struct ethtool_pauseparam {
>   * @ETH_SS_FEATURES: Device feature names
>   * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
>   * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
> + * @ETH_SS_PHY_TUNABLES: PHY tunable names
>   */
>  enum ethtool_stringset {
>  	ETH_SS_TEST		= 0,
> @@ -557,6 +571,7 @@ enum ethtool_stringset {
>  	ETH_SS_RSS_HASH_FUNCS,
>  	ETH_SS_TUNABLES,
>  	ETH_SS_PHY_STATS,
> +	ETH_SS_PHY_TUNABLES,
>  };
>  
>  /**
> @@ -1312,7 +1327,8 @@ struct ethtool_per_queue_op {
>  
>  #define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
>  #define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
> -
> +#define ETHTOOL_PHY_GTUNABLE	0x0000004e /* Get PHY tunable configuration */
> +#define ETHTOOL_PHY_STUNABLE	0x0000004f /* Set PHY tunable configuration */
>  
>  /* compatibility with older code */
>  #define SPARC_ETH_GSET		ETHTOOL_GSET
> -- 
> 2.7.3
> 
> 

-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* Re: [PATCH ethtool] ethtool: Fix the "advertise" parameter logic.
From: John W. Linville @ 2016-11-23 15:00 UTC (permalink / raw)
  To: Michael Chan; +Cc: netdev
In-Reply-To: <1479858947-25017-1-git-send-email-michael.chan@broadcom.com>

On Tue, Nov 22, 2016 at 06:55:47PM -0500, Michael Chan wrote:
> From: Michael Chan <mchan@broadcom.com>
> 
> The current code ignores the value of the advertise parameter.  For example,
> 
> ethtool -s ethx advertise 0x1000
> 
> The full_advertising_wanted parameter of 0x1000 is not passed to the kernel.
> The reason is that advertising_wanted is NULL in this case, and ethtool
> will think that the user has given no advertisement input and so it will
> proceed to pass all supported advertisement speeds to the kernel.
> 
> The older legacy ethtool with similar logic worked because
> advertising_wanted was an integer and could take on -1 and 0.  It would pass
> the full_advertising_wanted value if advertising_wanted == -1.
> 
> This fix is to pass all supported advertisement speeds only when both
> advertising_wanted == NULL && full_advertising_wanted == NULL.
> 
> Signed-off-by: Michael Chan <michael.chan@broadcom.com>

Thanks, Michael -- merging...

John
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* RE: [PATCH net 1/2] r8152: fix the sw rx checksum is unavailable
From: Hayes Wang @ 2016-11-23 15:12 UTC (permalink / raw)
  To: Mark Lord, netdev@vger.kernel.org
  Cc: nic_swsd, linux-kernel@vger.kernel.org, linux-usb@vger.kernel.org
In-Reply-To: <b61a0e10-c737-8912-3e4e-393f4eb32077@pobox.com>

Mark Lord [mlord@pobox.com]
[...]
> What does this code do:

> >static void r8153_set_rx_early_size(struct r8152 *tp)
> >{
> >        u32 mtu = tp->netdev->mtu;
> >        u32 ocp_data = (agg_buf_sz - mtu - VLAN_ETH_HLEN - VLAN_HLEN) / 4;
> >
> >        ocp_write_word(tp, MCU_TYPE_USB, USB_RX_EARLY_SIZE, ocp_data);
> >}

This only works for RTL8153. However, what you use is RTL8152.
It is like delay completion. It is used to reduce the loading of CPU
by letting a transfer contain more data to reduce the number of
transfers.

> How is ocp_data used by the hardware?
> Shouldn't the calculation also include sizeof(rx_desc) in there somewhere?

The algorithm is from our hw engineers, and it should be 

   (agg_buf_sz - packet size) / 8

You could refer to commit a59e6d815226 ("r8152: correct the rx early size").

^ permalink raw reply

* Re: [PATCH resend] ethtool: add register dump support for fjes driver
From: John W. Linville @ 2016-11-23 14:56 UTC (permalink / raw)
  To: Taku Izumi; +Cc: netdev
In-Reply-To: <1479257733-19277-1-git-send-email-izumi.taku@jp.fujitsu.com>

On Wed, Nov 16, 2016 at 09:55:32AM +0900, Taku Izumi wrote:
> This patch adds the register dump format for FUJITSU Extended
> Network device like the following:
> 
>    # ethtool -d es0
> 
> 0x0000: OWNER_EPID    (Owner EPID)                       0x00000001
> 0x0004: MAX_EP        (Maximum EP)                       0x00000008
> 0x0010: DCTL          (Device Control)                   0x00000000
> 0x0020: CR            (Command request)                  0x80000002
> 0x0024: CS            (Command status)                   0x80000002
> 0x0028: SHSTSAL       (Share status address Low)         0xE8215304
> 0x002C: SHSTSAH       (Share status address High)        0x00000007
> 0x0034: REQBL         (Request Buffer length)            0x00008028
> 0x0038: REQBAL        (Request Buffer Address Low)       0xEB0A0000
> 0x003C: REQBAH        (Request Buffer Address High)      0x00000007
> 0x0044: RESPBL        (Response Buffer Length)           0x00000018
> 0x0048: RESPBAL       (Response Buffer Address Low)      0xE41E1220
> 0x004C: RESPBAH       (Response Buffer Address High)     0x00000007
> 0x0080: IS            (Interrupt status)                 0x00000000
> 0x0084: IMS           (Interrupt mask set)               0x7FE00000
> 0x0088: IMC           (Interrupt mask clear)             0x001F0000
> 0x008C: IG            (Interrupt generator)              0x00010000
> 0x0090: ICTL          (Interrupt control)                0x00000000
> 
> Signed-off-by: Taku Izumi <izumi.taku@jp.fujitsu.com>

Thank you, Taku Izumi. I have merged this patch and will be pushing
it shortly.

I apologize for the delay.

John
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* Re: [PATCH net-next 1/1] ipv6: sr: add option to control lwtunnel support
From: Roopa Prabhu @ 2016-11-23 14:57 UTC (permalink / raw)
  To: David Lebrun
  Cc: Alexei Starovoitov, David Miller, netdev@vger.kernel.org,
	Lorenzo Colitti, Eric Dumazet
In-Reply-To: <5835613D.8070104@uclouvain.be>

On 11/23/16, 1:28 AM, David Lebrun wrote:
> On 11/23/2016 08:34 AM, Roopa Prabhu wrote:
>> I can't seem to reproduce the problem you are seeing. still trying..
>> I don't have CONFIG_LWTUNNEL set nor any of the other SEG6 configs.
>> My CONFIG_IPV6 is on and compiled as a module. I have also tried disabling it.
>> If you can send me the config, I can try again. Looking back at the patches,
>> I do see a few things below ..but they may not fix your problem directly.
>>
>> Though I had none of the ipv6 segment routing configs turned on,
>> I do see the "Segment Routing with IPv6" msg at bootup.
>> Was looking at david's patches again, and a few things (I had missed seeing the last version):
>>
>> In my review comment I was hinting at CONFIG_IPV6_SEG6 to cover all of ipv6 segment routing,
>> including the lwtunnel bits.
>>
>> something like below:
>>
>> config IPV6_SEG6
>>         bool "IPv6: Segment Routing Header encapsulation support"
>>         depends on LWTUNNEL && IPV6
>>
>> DavidL, do you see a problem doing it this way ?. with this 'seg6.o' will be part of CONFIG_IPV6_SEG6 and not
>> get initialized unless it is enabled..which seems like the right thing to do.
> Can't reproduce the bug either, with CONFIG_IPV6=y, LWTUNNEL=n and all
> SEG6 disabled. Alexei, your .config and dmesg log could help.
>
> Roopa, the reason why seg6.o is compiled by default is that it provides
> an interface to control HMAC structures, and that HMAC does not depends
> on lwtunnels and can be used in the extension header processing (which
> is compiled by default). I could indeed add another option to
> conditionnally compile seg6.o if HMAC is enabled etc, and I actually had
> something like that in the very first versions of the patch, but I
> received comments that too much options is not a good thing (and I agree
> with that).

okay then. I agree with not having too many option. I had just thought that it
could live with the existing CONFIG_IPV6_SEG6_LWTUNNEL if it was renamed.
had not  looked at the HMAC dependency.

^ permalink raw reply

* AF_VSOCK network namespace support
From: Stefan Hajnoczi @ 2016-11-23 14:55 UTC (permalink / raw)
  To: Jorgen Hansen; +Cc: netdev, imbrenda

[-- Attachment #1: Type: text/plain, Size: 1323 bytes --]

Hi Jorgen,
There are two use cases where network namespace support in AF_VSOCK
could be useful:

1. Claudio Imbrenda pointed out that a machine cannot act as both host
   and guest at the same time.  This is necessary for nested
   virtualization.  Currently only one transport (the host side or the
   guest side) can be registered at a time.

2. Users may wish to isolate the AF_VSOCK address namespace so that two
   VMs have completely independent CID and ports (they could even use
   the same CID and ports because they're in separate namespaces).  This
   ensures that a host service visible to VM1 is not automatically
   visible to VM2.

Network namespaces could solve both problems.

A drawback of namespaces is that existing configurations using network
namespaces for IPv4/6 or other purposes break if AF_VSOCK gains network
namespace support.  This is not a big problem for virtio-vsock if we
implement namespace support soon since there are no existing users.

I wonder how other address families have solved this transition to
network namespaces.  It's almost like we need fine-grained namespaces
instead of a blanket network namespace that applies across all address
families...

I'm playing around with the code now but wanted to get your thoughts in
case you've already considered these problems.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* [patch net-next v2 11/11] rocker: Register FIB notifier before creating ports
From: Jiri Pirko @ 2016-11-23 14:48 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1479911670-4525-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

Unlike mlxsw, rocker only supports the reflection of routes pointing to
its own netdevs. Therefore, instead of requesting a FIB dump during
init, simply register the FIB notifier before creating the ports.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/rocker/rocker_main.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 914e9e1..8c9c90a 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2804,6 +2804,9 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_alloc_ordered_workqueue;
 	}
 
+	rocker->fib_nb.notifier_call = rocker_router_fib_event;
+	register_fib_notifier(&rocker->fib_nb);
+
 	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
 
 	err = rocker_probe_ports(rocker);
@@ -2812,15 +2815,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_probe_ports;
 	}
 
-	rocker->fib_nb.notifier_call = rocker_router_fib_event;
-	register_fib_notifier(&rocker->fib_nb);
-
 	dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
 		 (int)sizeof(rocker->hw.id), &rocker->hw.id);
 
 	return 0;
 
 err_probe_ports:
+	unregister_fib_notifier(&rocker->fib_nb);
 	destroy_workqueue(rocker->rocker_owq);
 err_alloc_ordered_workqueue:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
@@ -2848,9 +2849,9 @@ static void rocker_remove(struct pci_dev *pdev)
 {
 	struct rocker *rocker = pci_get_drvdata(pdev);
 
+	rocker_remove_ports(rocker);
 	unregister_fib_notifier(&rocker->fib_nb);
 	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
-	rocker_remove_ports(rocker);
 	destroy_workqueue(rocker->rocker_owq);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v2 10/11] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Jiri Pirko @ 2016-11-23 14:48 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1479911670-4525-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

Make sure the device has a complete view of the FIB tables by invoking
their dump during module init.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 14bed1d..36a71d2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2027,6 +2027,21 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
+static void mlxsw_sp_router_fib_dump(struct mlxsw_sp *mlxsw_sp)
+{
+	while (!fib_notifier_dump(&mlxsw_sp->fib_nb)) {
+		/* Flush pending FIB notifications and then flush the
+		 * device's table before requesting another dump. Do
+		 * that with RTNL held, as FIB notification block is
+		 * already registered.
+		 */
+		mlxsw_core_flush_owq();
+		rtnl_lock();
+		mlxsw_sp_router_fib_flush(mlxsw_sp);
+		rtnl_unlock();
+	}
+}
+
 int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 {
 	int err;
@@ -2048,6 +2063,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 
 	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
 	register_fib_notifier(&mlxsw_sp->fib_nb);
+	mlxsw_sp_router_fib_dump(mlxsw_sp);
 	return 0;
 
 err_neigh_init:
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox