Netdev List
 help / color / mirror / Atom feed
* [patch net 1/2] mlxsw: spectrum_router: Fix handling of neighbour structure
From: Jiri Pirko @ 2016-11-10 11:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz
In-Reply-To: <1478775649-3386-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

__neigh_create function works in a different way than assumed.
It passes "n" as a parameter to ndo_neigh_construct. But this "n" might
be destroyed right away before __neigh_create() returns in case there is
already another neighbour struct in the hashtable with the same dev and
primary key. That is not expected by mlxsw_sp_router_neigh_construct()
and the stored "n" points to freed memory, eventually leading to crash.

Fix this by doing tight 1:1 coupling between neighbour struct and
internal driver neigh_entry. That allows to narrow down the key in
internal driver hashtable to do lookups by "n" only.

Fixes: 6cf3c971dc84 ("mlxsw: spectrum_router: Add private neigh table")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 95 ++++++++--------------
 1 file changed, 34 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 4573da2..2863012 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -600,15 +600,13 @@ static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
 }
 
 struct mlxsw_sp_neigh_key {
-	unsigned char addr[sizeof(struct in6_addr)];
-	struct net_device *dev;
+	struct neighbour *n;
 };
 
 struct mlxsw_sp_neigh_entry {
 	struct rhash_head ht_node;
 	struct mlxsw_sp_neigh_key key;
 	u16 rif;
-	struct neighbour *n;
 	bool offloaded;
 	struct delayed_work dw;
 	struct mlxsw_sp_port *mlxsw_sp_port;
@@ -646,19 +644,15 @@ mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp,
 static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work);
 
 static struct mlxsw_sp_neigh_entry *
-mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len,
-			    struct net_device *dev, u16 rif,
-			    struct neighbour *n)
+mlxsw_sp_neigh_entry_create(struct neighbour *n, u16 rif)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 
 	neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_ATOMIC);
 	if (!neigh_entry)
 		return NULL;
-	memcpy(neigh_entry->key.addr, addr, addr_len);
-	neigh_entry->key.dev = dev;
+	neigh_entry->key.n = n;
 	neigh_entry->rif = rif;
-	neigh_entry->n = n;
 	INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw);
 	INIT_LIST_HEAD(&neigh_entry->nexthop_list);
 	return neigh_entry;
@@ -671,13 +665,11 @@ mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp_neigh_entry *neigh_entry)
 }
 
 static struct mlxsw_sp_neigh_entry *
-mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, const void *addr,
-			    size_t addr_len, struct net_device *dev)
+mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n)
 {
-	struct mlxsw_sp_neigh_key key = {{ 0 } };
+	struct mlxsw_sp_neigh_key key;
 
-	memcpy(key.addr, addr, addr_len);
-	key.dev = dev;
+	key.n = n;
 	return rhashtable_lookup_fast(&mlxsw_sp->router.neigh_ht,
 				      &key, mlxsw_sp_neigh_ht_params);
 }
@@ -689,26 +681,20 @@ int mlxsw_sp_router_neigh_construct(struct net_device *dev,
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 	struct mlxsw_sp_rif *r;
-	u32 dip;
 	int err;
 
 	if (n->tbl != &arp_tbl)
 		return 0;
 
-	dip = ntohl(*((__be32 *) n->primary_key));
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip),
-						  n->dev);
-	if (neigh_entry) {
-		WARN_ON(neigh_entry->n != n);
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (neigh_entry)
 		return 0;
-	}
 
 	r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev);
 	if (WARN_ON(!r))
 		return -EINVAL;
 
-	neigh_entry = mlxsw_sp_neigh_entry_create(&dip, sizeof(dip), n->dev,
-						  r->rif, n);
+	neigh_entry = mlxsw_sp_neigh_entry_create(n, r->rif);
 	if (!neigh_entry)
 		return -ENOMEM;
 	err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
@@ -727,14 +713,11 @@ void mlxsw_sp_router_neigh_destroy(struct net_device *dev,
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_neigh_entry *neigh_entry;
-	u32 dip;
 
 	if (n->tbl != &arp_tbl)
 		return;
 
-	dip = ntohl(*((__be32 *) n->primary_key));
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip),
-						  n->dev);
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
 	if (!neigh_entry)
 		return;
 	mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry);
@@ -862,7 +845,7 @@ static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp)
 		 * is active regardless of the traffic.
 		 */
 		if (!list_empty(&neigh_entry->nexthop_list))
-			neigh_event_send(neigh_entry->n, NULL);
+			neigh_event_send(neigh_entry->key.n, NULL);
 	}
 	rtnl_unlock();
 }
@@ -908,9 +891,9 @@ static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work)
 	rtnl_lock();
 	list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list,
 			    nexthop_neighs_list_node) {
-		if (!(neigh_entry->n->nud_state & NUD_VALID) &&
+		if (!(neigh_entry->key.n->nud_state & NUD_VALID) &&
 		    !list_empty(&neigh_entry->nexthop_list))
-			neigh_event_send(neigh_entry->n, NULL);
+			neigh_event_send(neigh_entry->key.n, NULL);
 	}
 	rtnl_unlock();
 
@@ -927,7 +910,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry =
 		container_of(work, struct mlxsw_sp_neigh_entry, dw.work);
-	struct neighbour *n = neigh_entry->n;
+	struct neighbour *n = neigh_entry->key.n;
 	struct mlxsw_sp_port *mlxsw_sp_port = neigh_entry->mlxsw_sp_port;
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	char rauht_pl[MLXSW_REG_RAUHT_LEN];
@@ -1030,11 +1013,8 @@ int mlxsw_sp_router_netevent_event(struct notifier_block *unused,
 
 		mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 		dip = ntohl(*((__be32 *) n->primary_key));
-		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp,
-							  &dip,
-							  sizeof(__be32),
-							  dev);
-		if (WARN_ON(!neigh_entry) || WARN_ON(neigh_entry->n != n)) {
+		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+		if (WARN_ON(!neigh_entry)) {
 			mlxsw_sp_port_dev_put(mlxsw_sp_port);
 			return NOTIFY_DONE;
 		}
@@ -1343,33 +1323,26 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 				 struct fib_nh *fib_nh)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry;
-	u32 gwip = ntohl(fib_nh->nh_gw);
 	struct net_device *dev = fib_nh->nh_dev;
 	struct neighbour *n;
 	u8 nud_state;
 
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip,
-						  sizeof(gwip), dev);
-	if (!neigh_entry) {
-		__be32 gwipn = htonl(gwip);
-
-		n = neigh_create(&arp_tbl, &gwipn, dev);
+	/* Take a reference of neigh here ensuring that neigh would
+	 * not be detructed before the nexthop entry is finished.
+	 * The reference is taken either in neigh_lookup() or
+	 * in neith_create() in case n is not found.
+	 */
+	n = neigh_lookup(&arp_tbl, &fib_nh->nh_gw, dev);
+	if (!n) {
+		n = neigh_create(&arp_tbl, &fib_nh->nh_gw, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		neigh_event_send(n, NULL);
-		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip,
-							  sizeof(gwip), dev);
-		if (!neigh_entry) {
-			neigh_release(n);
-			return -EINVAL;
-		}
-	} else {
-		/* Take a reference of neigh here ensuring that neigh would
-		 * not be detructed before the nexthop entry is finished.
-		 * The second branch takes the reference in neith_create()
-		 */
-		n = neigh_entry->n;
-		neigh_clone(n);
+	}
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (!neigh_entry) {
+		neigh_release(n);
+		return -EINVAL;
 	}
 
 	/* If that is the first nexthop connected to that neigh, add to
@@ -1403,7 +1376,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp,
 	if (list_empty(&nh->neigh_entry->nexthop_list))
 		list_del(&nh->neigh_entry->nexthop_neighs_list_node);
 
-	neigh_release(neigh_entry->n);
+	neigh_release(neigh_entry->key.n);
 }
 
 static struct mlxsw_sp_nexthop_group *
@@ -1463,11 +1436,11 @@ static bool mlxsw_sp_nexthop_match(struct mlxsw_sp_nexthop *nh,
 
 	for (i = 0; i < fi->fib_nhs; i++) {
 		struct fib_nh *fib_nh = &fi->fib_nh[i];
-		u32 gwip = ntohl(fib_nh->nh_gw);
+		struct neighbour *n = nh->neigh_entry->key.n;
 
-		if (memcmp(nh->neigh_entry->key.addr,
-			   &gwip, sizeof(u32)) == 0 &&
-		    nh->neigh_entry->key.dev == fib_nh->nh_dev)
+		if (memcmp(n->primary_key, &fib_nh->nh_gw,
+			   sizeof(fib_nh->nh_gw)) == 0 &&
+		    n->dev == fib_nh->nh_dev)
 			return true;
 	}
 	return false;
-- 
2.7.4

^ permalink raw reply related

* [patch net 2/2] mlxsw: spectrum_router: Ignore FIB notification events for non-init namespaces
From: Jiri Pirko @ 2016-11-10 11:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz
In-Reply-To: <1478775649-3386-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Since now, the table with same id in multiple netnamespaces were squashed
to a single virtual router. That is not only incorrect, it also causes
error messages when trying to use RALUE register to do double remove
of FIB entries, like this one:

mlxsw_spectrum 0000:03:00.0: EMAD reg access failed (tid=facb831c00007b20,reg_id=8013(ralue),type=write,status=7(bad parameter))

Since we don't allow ports to change namespaces (NETIF_F_NETNS_LOCAL),
and the inrastructure is not yet prepared to handle netnamespaces, just
ignore FIB notification events for non-init namespaces. That is clear to
do since we don't need to offload them.

Fixes: b45f64d16d45 ("mlxsw: spectrum_router: Use FIB notifications instead of switchdev calls")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 2863012..6610672 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1931,6 +1931,9 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct fib_entry_notifier_info *fen_info = ptr;
 	int err;
 
+	if (fen_info->info.net != &init_net)
+		return NOTIFY_DONE;
+
 	switch (event) {
 	case FIB_EVENT_ENTRY_ADD:
 		err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
-- 
2.7.4

^ permalink raw reply related

* [patch net-next] mlxsw: spectrum_router: Add FIB abort warning
From: Jiri Pirko @ 2016-11-10 11:10 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz

From: Jiri Pirko <jiri@mellanox.com>

Add a warning that the abort mechanism was triggered for device.
Also avoid going through the procedure if abort was already done.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 348c773..df31f38 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1877,6 +1877,9 @@ static void mlxsw_sp_router_fib4_abort(struct mlxsw_sp *mlxsw_sp)
 	int i;
 	int err;
 
+	if (mlxsw_sp->router.aborted)
+		return;
+	dev_warn(mlxsw_sp->bus_info->dev, "FIB abort triggered. Note that FIB entries are no longer being offloaded to this device.\n");
 	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
 		vr = &mlxsw_sp->router.vrs[i];
 		if (!vr->used)
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH] usbnet: prevent device rpm suspend in usbnet_probe function
From: Bjørn Mork @ 2016-11-10 11:09 UTC (permalink / raw)
  To: Kai-Heng Feng; +Cc: Oliver Neukum, Alan Stern, linux-kernel, linux-usb, netdev
In-Reply-To: <CAAd53p5W4xpSYsFZ00deQDh-gZBijn_ApreE9NmC=_B_MAQaug@mail.gmail.com>

Kai-Heng Feng <kai.heng.feng@canonical.com> writes:
> On Wed, Nov 9, 2016 at 8:32 PM, Bjørn Mork <bjorn@mork.no> wrote:
>> Oliver Neukum <oneukum@suse.com> writes:
>>
>>> On Tue, 2016-11-08 at 13:44 -0500, Alan Stern wrote:
>>>
>>>> These problems could very well be caused by running at SuperSpeed
>>>> (USB-3) instead of high speed (USB-2).
>
> Yes, it's running at SuperSpeed, on a Kabylake laptop.
>
> It does not have this issue on a Broadwell laptop, also running at SuperSpeed.

Then I must join Oliver, being very surprised by where in the stack you
attempt to fix the issue.  What you write above indicates a problem in
pci bridge or usb host controller, doesn't it?


Bjørn

^ permalink raw reply

* Re: [PATCH net-next] ipv6: sr: fix IPv6 initialization failure without lwtunnels
From: kbuild test robot @ 2016-11-10 11:20 UTC (permalink / raw)
  To: David Lebrun; +Cc: kbuild-all, netdev, lorenzo, davem, David Lebrun
In-Reply-To: <1478771715-23137-1-git-send-email-david.lebrun@uclouvain.be>

[-- Attachment #1: Type: text/plain, Size: 3159 bytes --]

Hi David,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/David-Lebrun/ipv6-sr-fix-IPv6-initialization-failure-without-lwtunnels/20161110-175753
config: i386-randconfig-s1-201645 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from net/ipv6/seg6_iptunnel.c:24:0:
   include/net/seg6.h: In function 'seg6_pernet':
   include/net/seg6.h:52:12: error: 'struct net' has no member named 'ipv6'; did you mean 'ipv4'?
     return net->ipv6.seg6_data;
               ^~
   net/ipv6/seg6_iptunnel.c: In function 'seg6_output':
>> net/ipv6/seg6_iptunnel.c:289:3: error: implicit declaration of function 'dst_cache_set_ip6' [-Werror=implicit-function-declaration]
      dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
      ^~~~~~~~~~~~~~~~~
   In file included from net/ipv6/seg6_iptunnel.c:24:0:
   include/net/seg6.h: In function 'seg6_pernet':
   include/net/seg6.h:53:1: warning: control reaches end of non-void function [-Wreturn-type]
    }
    ^
   cc1: some warnings being treated as errors

vim +/dst_cache_set_ip6 +289 net/ipv6/seg6_iptunnel.c

6c8702c6 David Lebrun 2016-11-08  273  		struct flowi6 fl6;
6c8702c6 David Lebrun 2016-11-08  274  
6c8702c6 David Lebrun 2016-11-08  275  		fl6.daddr = hdr->daddr;
6c8702c6 David Lebrun 2016-11-08  276  		fl6.saddr = hdr->saddr;
6c8702c6 David Lebrun 2016-11-08  277  		fl6.flowlabel = ip6_flowinfo(hdr);
6c8702c6 David Lebrun 2016-11-08  278  		fl6.flowi6_mark = skb->mark;
6c8702c6 David Lebrun 2016-11-08  279  		fl6.flowi6_proto = hdr->nexthdr;
6c8702c6 David Lebrun 2016-11-08  280  
6c8702c6 David Lebrun 2016-11-08  281  		dst = ip6_route_output(net, NULL, &fl6);
6c8702c6 David Lebrun 2016-11-08  282  		if (dst->error) {
6c8702c6 David Lebrun 2016-11-08  283  			err = dst->error;
6c8702c6 David Lebrun 2016-11-08  284  			dst_release(dst);
6c8702c6 David Lebrun 2016-11-08  285  			goto drop;
6c8702c6 David Lebrun 2016-11-08  286  		}
6c8702c6 David Lebrun 2016-11-08  287  
6c8702c6 David Lebrun 2016-11-08  288  #ifdef CONFIG_DST_CACHE
6c8702c6 David Lebrun 2016-11-08 @289  		dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
6c8702c6 David Lebrun 2016-11-08  290  #endif
6c8702c6 David Lebrun 2016-11-08  291  	}
6c8702c6 David Lebrun 2016-11-08  292  
6c8702c6 David Lebrun 2016-11-08  293  	skb_dst_drop(skb);
6c8702c6 David Lebrun 2016-11-08  294  	skb_dst_set(skb, dst);
6c8702c6 David Lebrun 2016-11-08  295  
6c8702c6 David Lebrun 2016-11-08  296  	return dst_output(net, sk, skb);
6c8702c6 David Lebrun 2016-11-08  297  drop:

:::::: The code at line 289 was first introduced by commit
:::::: 6c8702c60b88651072460f3f4026c7dfe2521d12 ipv6: sr: add support for SRH encapsulation and injection with lwtunnels

:::::: TO: David Lebrun <david.lebrun@uclouvain.be>
:::::: CC: David S. Miller <davem@davemloft.net>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27503 bytes --]

^ permalink raw reply

* [PATCH] net: bpqether.h: remove if_ether.h guard
From: Baruch Siach @ 2016-11-10 11:21 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-hams, netdev, Baruch Siach

__LINUX_IF_ETHER_H is not defined anywhere, and if_ether.h can keep itself from
double inclusion, though it uses a single underscore prefix.

Signed-off-by: Baruch Siach <baruch@tkos.co.il>
---
 include/uapi/linux/bpqether.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/uapi/linux/bpqether.h b/include/uapi/linux/bpqether.h
index a6c35e1a89ad..05865edaefda 100644
--- a/include/uapi/linux/bpqether.h
+++ b/include/uapi/linux/bpqether.h
@@ -5,9 +5,7 @@
  * 	Defines for the BPQETHER pseudo device driver
  */
 
-#ifndef __LINUX_IF_ETHER_H
 #include <linux/if_ether.h>
-#endif
 
 #define SIOCSBPQETHOPT		(SIOCDEVPRIVATE+0)	/* reserved */
 #define SIOCSBPQETHADDR		(SIOCDEVPRIVATE+1)
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH] usbnet: prevent device rpm suspend in usbnet_probe function
From: Oliver Neukum @ 2016-11-10 11:22 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: Kai-Heng Feng, Alan Stern, linux-kernel, linux-usb, netdev,
	Mathias Nyman
In-Reply-To: <87a8d7h9q1.fsf@miraculix.mork.no>

On Thu, 2016-11-10 at 12:09 +0100, Bjørn Mork wrote:
> Kai-Heng Feng <kai.heng.feng@canonical.com> writes:
> > On Wed, Nov 9, 2016 at 8:32 PM, Bjørn Mork <bjorn@mork.no> wrote:
> >> Oliver Neukum <oneukum@suse.com> writes:
> >>
> >>> On Tue, 2016-11-08 at 13:44 -0500, Alan Stern wrote:
> >>>
> >>>> These problems could very well be caused by running at SuperSpeed
> >>>> (USB-3) instead of high speed (USB-2).
> >
> > Yes, it's running at SuperSpeed, on a Kabylake laptop.
> >
> > It does not have this issue on a Broadwell laptop, also running at SuperSpeed.
> 
> Then I must join Oliver, being very surprised by where in the stack you
> attempt to fix the issue.  What you write above indicates a problem in
> pci bridge or usb host controller, doesn't it?

Indeed. And this means we need an XHCI specialist.
Mathias, we have a failure specific to one implementation of XHCI.

	Regards
		Oliver

^ permalink raw reply

* [patch net-next 2/8] act_ife: Change to use ife module
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Use the encode/decode functionality from the ife module instead of using
implementation inside the act_ife.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/tc_act/tc_ife.h        |   3 -
 include/uapi/linux/tc_act/tc_ife.h |  10 +---
 net/sched/Kconfig                  |   1 +
 net/sched/act_ife.c                | 109 +++++++++++--------------------------
 4 files changed, 34 insertions(+), 89 deletions(-)

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index 9fd2bea0..30ba459 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -6,7 +6,6 @@
 #include <linux/rtnetlink.h>
 #include <linux/module.h>
 
-#define IFE_METAHDRLEN 2
 struct tcf_ife_info {
 	struct tc_action common;
 	u8 eth_dst[ETH_ALEN];
@@ -45,8 +44,6 @@ struct tcf_meta_ops {
 
 int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
 int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
-			const void *dval);
 int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp);
 int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp);
 int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
index cd18360..7c28178 100644
--- a/include/uapi/linux/tc_act/tc_ife.h
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
+#include <linux/ife.h>
 
 #define TCA_ACT_IFE 25
 /* Flag bits for now just encoding/decoding; mutually exclusive */
@@ -28,13 +29,4 @@ enum {
 };
 #define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
 
-#define IFE_META_SKBMARK 1
-#define IFE_META_HASHID 2
-#define	IFE_META_PRIO 3
-#define	IFE_META_QMAP 4
-#define	IFE_META_TCINDEX 5
-/*Can be overridden at runtime by module option*/
-#define	__IFE_META_MAX 6
-#define IFE_META_MAX (__IFE_META_MAX - 1)
-
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a7..24f7cac 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -763,6 +763,7 @@ config NET_ACT_SKBMOD
 config NET_ACT_IFE
         tristate "Inter-FE action based on IETF ForCES InterFE LFB"
         depends on NET_CLS_ACT
+        select NET_IFE
         ---help---
 	  Say Y here to allow for sourcing and terminating metadata
 	  For details refer to netdev01 paper:
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 95c463c..5c2478a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,6 +32,7 @@
 #include <uapi/linux/tc_act/tc_ife.h>
 #include <net/tc_act/tc_ife.h>
 #include <linux/etherdevice.h>
+#include <net/ife.h>
 
 #define IFE_TAB_MASK 15
 
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
 	[TCA_IFE_TYPE] = { .type = NLA_U16},
 };
 
-/* Caller takes care of presenting data in network order
-*/
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
-{
-	u32 *tlv = (u32 *)(skbdata);
-	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
-	char *dptr = (char *)tlv + NLA_HDRLEN;
-	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
-
-	*tlv = htonl(htlv);
-	memset(dptr, 0, totlen - NLA_HDRLEN);
-	memcpy(dptr, dval, dlen);
-
-	return totlen;
-}
-EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
-
 int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
 {
 	u16 edata = 0;
@@ -637,69 +621,60 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
 	return 0;
 }
 
-struct ifeheadr {
-	__be16 metalen;
-	u8 tlv_data[];
-};
-
-struct meta_tlvhdr {
-	__be16 type;
-	__be16 len;
-};
-
 static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_ife_info *ife = to_ife(a);
+	u32 at = G_TC_AT(skb->tc_verd);
 	int action = ife->tcf_action;
-	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
-	int ifehdrln = (int)ifehdr->metalen;
-	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+	u8 *ifehdr_end;
+	u8 *tlv_data;
+	u16 metalen;
 
 	spin_lock(&ife->tcf_lock);
 	bstats_update(&ife->tcf_bstats, skb);
 	tcf_lastuse_update(&ife->tcf_tm);
 	spin_unlock(&ife->tcf_lock);
 
-	ifehdrln = ntohs(ifehdrln);
-	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+	if (!(at & AT_EGRESS))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	tlv_data = ife_decode(skb, &metalen);
+	if (unlikely(!tlv_data)) {
 		spin_lock(&ife->tcf_lock);
 		ife->tcf_qstats.drops++;
 		spin_unlock(&ife->tcf_lock);
 		return TC_ACT_SHOT;
 	}
 
-	skb_set_mac_header(skb, ifehdrln);
-	__skb_pull(skb, ifehdrln);
-	skb->protocol = eth_type_trans(skb, skb->dev);
-	ifehdrln -= IFE_METAHDRLEN;
-
-	while (ifehdrln > 0) {
-		u8 *tlvdata = (u8 *)tlv;
-		u16 mtype = tlv->type;
-		u16 mlen = tlv->len;
-		u16 alen;
+	ifehdr_end = tlv_data + metalen;
+	for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
+		u8 *curr_data;
+		u16 mtype;
+		u16 dlen;
 
-		mtype = ntohs(mtype);
-		mlen = ntohs(mlen);
-		alen = NLA_ALIGN(mlen);
+		curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
 
-		if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN),
-				       (void *)(tlvdata + NLA_HDRLEN))) {
+		if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
 			/* abuse overlimits to count when we receive metadata
 			 * but dont have an ops for it
 			 */
-			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
-					    mtype, mlen);
+			pr_info_ratelimited("Unknown metaid %d dlen %d\n",
+					    mtype, dlen);
 			ife->tcf_qstats.overlimits++;
 		}
+	}
 
-		tlvdata += alen;
-		ifehdrln -= alen;
-		tlv = (struct meta_tlvhdr *)tlvdata;
+	if (WARN_ON(tlv_data != ifehdr_end)) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
 	}
 
+	skb->protocol = eth_type_trans(skb, skb->dev);
 	skb_reset_network_header(skb);
+
 	return action;
 }
 
@@ -727,7 +702,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ethhdr *oethh;	/* outer ether header */
-	struct ethhdr *iethh;	/* inner eth header */
 	struct tcf_meta_info *e;
 	/*
 	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,10 +709,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	u16 metalen = ife_get_sz(skb, ife);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
-	unsigned int skboff = skb->dev->hard_header_len;
+	unsigned int skboff = 0;
 	u32 at = G_TC_AT(skb->tc_verd);
 	int new_len = skb->len + hdrm;
 	bool exceed_mtu = false;
+	void *ife_meta;
 	int err;
 
 	if (at & AT_EGRESS) {
@@ -766,27 +741,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		return TC_ACT_SHOT;
 	}
 
-	err = skb_cow_head(skb, hdrm);
-	if (unlikely(err)) {
-		ife->tcf_qstats.drops++;
-		spin_unlock(&ife->tcf_lock);
-		return TC_ACT_SHOT;
-	}
-
 	if (!(at & AT_EGRESS))
 		skb_push(skb, skb->dev->hard_header_len);
 
-	iethh = (struct ethhdr *)skb->data;
-	__skb_push(skb, hdrm);
-	memcpy(skb->data, iethh, skb->mac_len);
-	skb_reset_mac_header(skb);
-	oethh = eth_hdr(skb);
-
-	/*total metadata length */
-	metalen += IFE_METAHDRLEN;
-	metalen = htons(metalen);
-	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
-	skboff += IFE_METAHDRLEN;
+	ife_meta = ife_encode(skb, metalen);
 
 	/* XXX: we dont have a clever way of telling encode to
 	 * not repeat some of the computations that are done by
@@ -794,7 +752,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	list_for_each_entry(e, &ife->metalist, metalist) {
 		if (e->ops->encode) {
-			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+			err = e->ops->encode(skb, (void *)(ife_meta + skboff),
 					     e);
 		}
 		if (err < 0) {
@@ -805,15 +763,12 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		}
 		skboff += err;
 	}
+	oethh = (struct ethhdr *)skb->data;
 
 	if (!is_zero_ether_addr(ife->eth_src))
 		ether_addr_copy(oethh->h_source, ife->eth_src);
-	else
-		ether_addr_copy(oethh->h_source, iethh->h_source);
 	if (!is_zero_ether_addr(ife->eth_dst))
 		ether_addr_copy(oethh->h_dest, ife->eth_dst);
-	else
-		ether_addr_copy(oethh->h_dest, iethh->h_dest);
 	oethh->h_proto = htons(ife->eth_type);
 
 	if (!(at & AT_EGRESS))
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 3/8] net: ife: Introduce new metadata tlv types
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

IFE_META_IN_IFINDEX: Allow to pass input ifindex value as part of the
ife metadata
IFE_META_OUT_IFINDEX: Allow to pass output ifindex value as part of the
ife metadata
IFE_META_ORIG_SIZE: Allow to pass the original packet size as part of
the ife metadata. Can be used in case that the packet is truncated
IFE_META_SIZE: Allow to pass the size of the encapsulated packet as
part of the ife metadata

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/uapi/linux/ife.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/uapi/linux/ife.h b/include/uapi/linux/ife.h
index 2954da3..ebdd785 100644
--- a/include/uapi/linux/ife.h
+++ b/include/uapi/linux/ife.h
@@ -9,6 +9,10 @@ enum {
 	IFE_META_PRIO,
 	IFE_META_QMAP,
 	IFE_META_TCINDEX,
+	IFE_META_IN_IFINDEX,
+	IFE_META_OUT_IFINDEX,
+	IFE_META_ORIGSIZE,
+	IFE_META_SIZE,
 	__IFE_META_MAX
 };
 
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 1/8] Introduce ife encapsulation module
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

This module is responsible for the ife encapsulation protocol
encode/decode logics. That module can:
 - ife_encode: encode skb and reserve space for the ife meta header
 - ife_decode: decode skb and extract the meta header size
 - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife
   header space.
 - ife_tlv_meta_decode - decodes one tlv entry from the packet
 - ife_tlv_meta_next - advance to the next tlv

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 MAINTAINERS               |   7 +++
 include/net/ife.h         |  52 +++++++++++++++++
 include/uapi/linux/Kbuild |   1 +
 include/uapi/linux/ife.h  |  18 ++++++
 net/Kconfig               |   1 +
 net/Makefile              |   1 +
 net/ife/Kconfig           |  16 ++++++
 net/ife/Makefile          |   5 ++
 net/ife/ife.c             | 144 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 245 insertions(+)
 create mode 100644 include/net/ife.h
 create mode 100644 include/uapi/linux/ife.h
 create mode 100644 net/ife/Kconfig
 create mode 100644 net/ife/Makefile
 create mode 100644 net/ife/ife.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e5c17a9..cc3e640 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6114,6 +6114,13 @@ F:	include/net/cfg802154.h
 F:	include/net/ieee802154_netdev.h
 F:	Documentation/networking/ieee802154.txt
 
+IFE PROTOCOL
+M:	Yotam Gigi <yotamg@mellanox.com>
+M:	Jamal Hadi Salim <jhs@mojatatu.com>
+F:	net/ife
+F:	include/net/ife.h
+F:	include/uapi/linux/ife.h
+
 IGORPLUG-USB IR RECEIVER
 M:	Sean Young <sean@mess.org>
 L:	linux-media@vger.kernel.org
diff --git a/include/net/ife.h b/include/net/ife.h
new file mode 100644
index 0000000..a75d4e0
--- /dev/null
+++ b/include/net/ife.h
@@ -0,0 +1,52 @@
+#ifndef __NET_IFE_H
+#define __NET_IFE_H
+
+#include <uapi/linux/ife.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <uapi/linux/ife.h>
+
+#if IS_ENABLED(CONFIG_NET_IFE)
+
+void *ife_encode(struct sk_buff *skb, u16 metalen);
+void *ife_decode(struct sk_buff *skb, u16 *metalen);
+
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+
+void *ife_tlv_meta_next(void *skbdata);
+
+#else
+
+static inline void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+	return NULL;
+}
+
+static inline void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+	return NULL;
+}
+
+static inline void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen,
+					u16 *totlen)
+{
+	return NULL;
+}
+
+static inline int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval)
+{
+	return 0;
+}
+
+static inline void *ife_tlv_meta_next(void *skbdata)
+{
+	return NULL;
+}
+
+#endif
+
+#endif /* __NET_IFE_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index cd2be1c..eabf838 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -191,6 +191,7 @@ header-y += if_tun.h
 header-y += if_tunnel.h
 header-y += if_vlan.h
 header-y += if_x25.h
+header-y += ife.h
 header-y += igmp.h
 header-y += ila.h
 header-y += in6.h
diff --git a/include/uapi/linux/ife.h b/include/uapi/linux/ife.h
new file mode 100644
index 0000000..2954da3
--- /dev/null
+++ b/include/uapi/linux/ife.h
@@ -0,0 +1,18 @@
+#ifndef __UAPI_IFE_H
+#define __UAPI_IFE_H
+
+#define IFE_METAHDRLEN 2
+
+enum {
+	IFE_META_SKBMARK = 1,
+	IFE_META_HASHID,
+	IFE_META_PRIO,
+	IFE_META_QMAP,
+	IFE_META_TCINDEX,
+	__IFE_META_MAX
+};
+
+/*Can be overridden at runtime by module option*/
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..3cf29b1 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -393,6 +393,7 @@ source "net/9p/Kconfig"
 source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
+source "net/ife/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2..4ddc67e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_NET_IFE)		+= ife/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_MPLS)		+= mpls/
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 0000000..31e48b6
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
+#
+# IFE subsystem configuration
+#
+
+menuconfig NET_IFE
+	depends on NET
+        tristate "Inter-FE based on IETF ForCES InterFE LFB"
+	default n
+	help
+	  Say Y here to add support of IFE encapsulation protocol
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this support as a module, choose M here: the module will
+	  be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 0000000..2a90d97
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the IFE encapsulation protocol
+#
+
+obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 0000000..ee3221a
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,144 @@
+/*
+ * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
+ * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
+ * Copyright (c) 2016 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+	/* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	 * where ORIGDATA = original ethernet header ...
+	 */
+	int hdrm = metalen + IFE_METAHDRLEN;
+	int total_push = hdrm + skb->dev->hard_header_len;
+	struct ethhdr *iethh;	/* inner ether header */
+	int skboff = 0;
+	int err;
+
+	err = skb_cow_head(skb, total_push);
+	if (unlikely(err))
+		return NULL;
+
+	iethh = (struct ethhdr *) skb->data;
+
+	__skb_push(skb, total_push);
+	memcpy(skb->data, iethh, skb->dev->hard_header_len);
+	skb_reset_mac_header(skb);
+	skboff += skb->dev->hard_header_len;
+
+	/* total metadata length */
+	metalen += IFE_METAHDRLEN;
+	metalen = htons(metalen);
+	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+	skboff += IFE_METAHDRLEN;
+
+	return skb->data + skboff;
+}
+EXPORT_SYMBOL_GPL(ife_encode);
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+	struct ifeheadr *ifehdr;
+	int total_pull;
+	u16 ifehdrln;
+
+	ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+	ifehdrln = ifehdr->metalen;
+	ifehdrln = ntohs(ifehdrln);
+	total_pull = skb->dev->hard_header_len + ifehdrln;
+
+	if (unlikely(ifehdrln < 2))
+		return NULL;
+
+	if (unlikely(!pskb_may_pull(skb, total_pull)))
+		return NULL;
+
+	skb_set_mac_header(skb, total_pull);
+	__skb_pull(skb, total_pull);
+	*metalen = ifehdrln - IFE_METAHDRLEN;
+
+	return &ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_decode);
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+/* Caller takes care of presenting data in network order
+ */
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+
+	*dlen = ntohs(tlv->len) - NLA_HDRLEN;
+	*attrtype = ntohs(tlv->type);
+
+	if (totlen)
+		*totlen = nla_total_size(*dlen);
+
+	return skbdata + sizeof(struct meta_tlvhdr);
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
+
+void *ife_tlv_meta_next(void *skbdata)
+{
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+	u16 tlvlen = tlv->len;
+
+	tlvlen = ntohs(tlvlen);
+	tlvlen = NLA_ALIGN(tlvlen);
+
+	return skbdata + tlvlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
+
+/* Caller takes care of presenting data in network order
+ */
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	u32 *tlv = (u32 *) (skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *) tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 6/8] tc: sample: Add sequence number and sampler_id fields
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Add support for sequence numbering to the sampled packets. The sequence
number counts the sampled packets and enables to check whether sampled
packets have been dropped. The sequence state is kept per sample action
instance.

Add unique id (sampler_id) for each sample action instance, and add that
sampler_id to the packet. This way, a user application can verify sample
drops, by tracking the sequence numbers from each sampler id.

Because of the fact that hw and sw can be triggered by the same tc action
(if the user does not specify both skip_sw and skip_hw), each should
have a different sampler_id, as each generates a different sequence
numbering. To allow that, the macros TC_SAMPLE_HW_ID and TC_SAMPLE_SW_ID,
which, given a sampler_id, calculates the hw sample_id and sw sample_id.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/ife.h              |  5 +++--
 include/net/tc_act/tc_sample.h |  9 +++++++++
 include/uapi/linux/ife.h       |  2 ++
 net/ife/ife.c                  | 14 ++++++++++++--
 net/sched/act_sample.c         | 10 +++++++++-
 5 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/net/ife.h b/include/net/ife.h
index 5fbcfb3..3ae4f0d 100644
--- a/include/net/ife.h
+++ b/include/net/ife.h
@@ -19,7 +19,8 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
 void *ife_tlv_meta_next(void *skbdata);
 
 struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
-				    int in_ifindex, int out_ifindex);
+				    int in_ifindex, int out_ifindex,
+				    u32 sampler_id, u32 seq);
 
 #else
 
@@ -52,7 +53,7 @@ static inline void *ife_tlv_meta_next(void *skbdata)
 
 static inline struct ethhdr *
 ife_packet_info_pack(struct sk_buff *skb, int orig_size, int in_ifindex,
-		     int out_ifindex)
+		     int out_ifindex, u32 sampler_id, u32 seq)
 {
 	return NULL;
 }
diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index 73154b3..98eafda 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -14,9 +14,13 @@ struct tcf_sample {
 	u8			eth_dst[ETH_ALEN];
 	u8			eth_src[ETH_ALEN];
 	u16			eth_type;
+	u32			sampler_id;
+	u32			seq;
 	struct list_head	tcfm_list;
 };
 #define to_sample(a) ((struct tcf_sample *)a)
+#define TC_SAMPLE_SW_ID(id) ((id << 1) + 0)
+#define TC_SAMPLE_HW_ID(id) ((id << 1) + 1)
 
 static inline bool is_tcf_sample(const struct tc_action *a)
 {
@@ -62,4 +66,9 @@ static inline void tcf_sample_eth_src_addr(const struct tc_action *a, u8 *src)
 	ether_addr_copy(src, to_sample(a)->eth_src);
 }
 
+static inline u32 tcf_sample_sampler_id(const struct tc_action *a)
+{
+	return to_sample(a)->sampler_id;
+}
+
 #endif /* __NET_TC_SAMPLE_H */
diff --git a/include/uapi/linux/ife.h b/include/uapi/linux/ife.h
index ebdd785..2007818 100644
--- a/include/uapi/linux/ife.h
+++ b/include/uapi/linux/ife.h
@@ -13,6 +13,8 @@ enum {
 	IFE_META_OUT_IFINDEX,
 	IFE_META_ORIGSIZE,
 	IFE_META_SIZE,
+	IFE_META_SAMPLER_ID,
+	IFE_META_SEQ,
 	__IFE_META_MAX
 };
 
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 756af46..3ebd6ab 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -139,7 +139,8 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
 EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
 
 struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
-				    int in_ifindex, int out_ifindex)
+				    int in_ifindex, int out_ifindex,
+				    u32 sampler_id, u32 seq)
 {
 	int curr_size;
 	void *ifetlv;
@@ -148,7 +149,9 @@ struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
 	curr_size = skb->len;
 
 	metalen = nla_total_size(sizeof(orig_size)) +
-		  nla_total_size(sizeof(curr_size));
+		  nla_total_size(sizeof(curr_size)) +
+		  nla_total_size(sizeof(sampler_id)) +
+		  nla_total_size(sizeof(seq));
 
 	if (in_ifindex)
 		metalen += nla_total_size(sizeof(in_ifindex));
@@ -159,6 +162,8 @@ struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
 	out_ifindex = htonl(out_ifindex);
 	orig_size = htonl(orig_size);
 	curr_size = htonl(curr_size);
+	sampler_id = htonl(sampler_id);
+	seq = htonl(seq);
 
 	ifetlv = ife_encode(skb, metalen);
 	if (!ifetlv)
@@ -179,6 +184,11 @@ struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
 	ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_SIZE,
 				      sizeof(curr_size), &curr_size);
 
+	ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_SAMPLER_ID,
+				      sizeof(sampler_id), &sampler_id);
+
+	ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_SEQ, sizeof(seq), &seq);
+
 	return (struct ethhdr *) skb->data;
 }
 EXPORT_SYMBOL(ife_packet_info_pack);
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 6596f4b..3530401 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -27,6 +27,7 @@
 
 #define SAMPLE_TAB_MASK     7
 static int sample_net_id;
+static u32 samplers;
 static struct tc_action_ops act_sample_ops;
 
 static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
@@ -77,6 +78,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
 	s->mark = nla_get_u32(tb[TCA_SAMPLE_MARK]);
 	s->eth_type = nla_get_u16(tb[TCA_SAMPLE_ETH_TYPE]);
+	s->sampler_id = samplers++;
 
 	s->truncate = tb[TCA_SAMPLE_TRUNC_SIZE];
 	if (tb[TCA_SAMPLE_TRUNC_SIZE])
@@ -119,6 +121,7 @@ static int tcf_sample(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_sample *s = to_sample(a);
 	static struct ethhdr *ethhdr;
 	struct sk_buff *skb2;
+	u32 sampler_id;
 	int orig_size;
 	int retval;
 	u32 at;
@@ -145,8 +148,12 @@ static int tcf_sample(struct sk_buff *skb, const struct tc_action *a,
 			skb_push(skb2, skb2->mac_len);
 
 		orig_size = skb->len + skb->dev->hard_header_len;
+		sampler_id = TC_SAMPLE_SW_ID(s->sampler_id);
+
 		ethhdr = ife_packet_info_pack(skb2, orig_size,
-					      skb->dev->ifindex, 0);
+					      skb->dev->ifindex, 0,
+					      sampler_id, s->seq++);
+
 		if (!ethhdr)
 			goto out;
 
@@ -267,6 +274,7 @@ static struct pernet_operations sample_net_ops = {
 
 static int __init sample_init_module(void)
 {
+	samplers = 0;
 	return tcf_register_action(&act_sample_ops, &sample_net_ops);
 }
 
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 5/8] Introduce sample tc action
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

This action allow the user to sample traffic matched by tc classifier.
The sampling consists of choosing packets randomly, truncating them,
adding some informative metadata regarding the interface and the original
packet size and mark them with specific mark, to allow further tc rules to
match and process. The marked sample packets are then injected into the
device ingress qdisc using netif_receive_skb.

The packets metadata is packed using the ife encapsulation protocol, and
the outer packet's ethernet dest, source and eth_type, along with the
rate, mark and the optional truncation size can be configured from
userspace.

Example:
To sample ingress traffic from interface eth1, and redirect the sampled
the sampled packets to interface dummy0, one may use the commands:

tc qdisc add dev eth1 handle ffff: ingress

tc filter add dev eth1 parent ffff: \
	   matchall action sample rate 12 mark 17

tc filter add parent ffff: dev eth1 protocol all \
	   u32 match mark 17 0xff \
	   action mirred egress redirect dev dummy0

Where the first command adds an ingress qdisc and the second starts
sampling every 12'th packet on dev eth1 and marks the sampled packets with
17. The third command catches the sampled packets, which are marked with
17, and redirects them to dev dummy0.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/tc_act/tc_sample.h        |  65 ++++++++
 include/uapi/linux/tc_act/Kbuild      |   1 +
 include/uapi/linux/tc_act/tc_sample.h |  29 ++++
 net/sched/Kconfig                     |  13 ++
 net/sched/Makefile                    |   1 +
 net/sched/act_sample.c                | 283 ++++++++++++++++++++++++++++++++++
 6 files changed, 392 insertions(+)
 create mode 100644 include/net/tc_act/tc_sample.h
 create mode 100644 include/uapi/linux/tc_act/tc_sample.h
 create mode 100644 net/sched/act_sample.c

diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
new file mode 100644
index 0000000..73154b3
--- /dev/null
+++ b/include/net/tc_act/tc_sample.h
@@ -0,0 +1,65 @@
+#ifndef __NET_TC_SAMPLE_H
+#define __NET_TC_SAMPLE_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_sample.h>
+
+struct tcf_sample {
+	struct tc_action	common;
+	u32			rate;
+	u32			mark;
+	bool			truncate;
+	u32			trunc_size;
+	u32			packet_counter;
+	u8			eth_dst[ETH_ALEN];
+	u8			eth_src[ETH_ALEN];
+	u16			eth_type;
+	struct list_head	tcfm_list;
+};
+#define to_sample(a) ((struct tcf_sample *)a)
+
+static inline bool is_tcf_sample(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	return a->ops && a->ops->type == TCA_ACT_SAMPLE;
+#else
+	return false;
+#endif
+}
+
+static inline __u32 tcf_sample_mark(const struct tc_action *a)
+{
+	return to_sample(a)->mark;
+}
+
+static inline __u32 tcf_sample_rate(const struct tc_action *a)
+{
+	return to_sample(a)->rate;
+}
+
+static inline bool tcf_sample_truncate(const struct tc_action *a)
+{
+	return to_sample(a)->truncate;
+}
+
+static inline int tcf_sample_trunc_size(const struct tc_action *a)
+{
+	return to_sample(a)->trunc_size;
+}
+
+static inline u16 tcf_sample_eth_type(const struct tc_action *a)
+{
+	return to_sample(a)->eth_type;
+}
+
+static inline void tcf_sample_eth_dst_addr(const struct tc_action *a, u8 *dst)
+{
+	ether_addr_copy(dst, to_sample(a)->eth_dst);
+}
+
+static inline void tcf_sample_eth_src_addr(const struct tc_action *a, u8 *src)
+{
+	ether_addr_copy(src, to_sample(a)->eth_src);
+}
+
+#endif /* __NET_TC_SAMPLE_H */
diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index e3969bd..6c6b8d6 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -4,6 +4,7 @@ header-y += tc_defact.h
 header-y += tc_gact.h
 header-y += tc_ipt.h
 header-y += tc_mirred.h
+header-y += tc_sample.h
 header-y += tc_nat.h
 header-y += tc_pedit.h
 header-y += tc_skbedit.h
diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h
new file mode 100644
index 0000000..44ee9d0
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_sample.h
@@ -0,0 +1,29 @@
+#ifndef __LINUX_TC_SAMPLE_H
+#define __LINUX_TC_SAMPLE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+
+#define TCA_ACT_SAMPLE 26
+
+struct tc_sample {
+	tc_gen;
+};
+
+enum {
+	TCA_SAMPLE_UNSPEC,
+	TCA_SAMPLE_PARMS,
+	TCA_SAMPLE_TM,
+	TCA_SAMPLE_RATE,
+	TCA_SAMPLE_MARK,
+	TCA_SAMPLE_TRUNC_SIZE,
+	TCA_SAMPLE_ETH_DST,
+	TCA_SAMPLE_ETH_SRC,
+	TCA_SAMPLE_ETH_TYPE,
+	TCA_SAMPLE_PAD,
+	__TCA_SAMPLE_MAX
+};
+#define TCA_SAMPLE_MAX (__TCA_SAMPLE_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 24f7cac..c54ea6b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,19 @@ config NET_ACT_MIRRED
 	  To compile this code as a module, choose M here: the
 	  module will be called act_mirred.
 
+config NET_ACT_SAMPLE
+        tristate "Traffic Sampling"
+        depends on NET_CLS_ACT
+        select NET_IFE
+        ---help---
+	  Say Y here to allow packet sampling tc action. The packet sample
+	  action consists of statistically duplicating packets, truncating them
+	  and adding descriptive metadata to them. The duplicated packets are
+	  then marked to allow further processing using tc.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_sample.
+
 config NET_ACT_IPT
         tristate "IPtables targets"
         depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda36..7b915d2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT)	+= act_api.o
 obj-$(CONFIG_NET_ACT_POLICE)	+= act_police.o
 obj-$(CONFIG_NET_ACT_GACT)	+= act_gact.o
 obj-$(CONFIG_NET_ACT_MIRRED)	+= act_mirred.o
+obj-$(CONFIG_NET_ACT_SAMPLE)	+= act_sample.o
 obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
 obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 0000000..6596f4b
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,283 @@
+/*
+ * net/sched/act_sample.c - Packet samplig tc action
+ * Copyright (c) 2016 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/ife.h>
+
+#include <linux/if_arp.h>
+
+#define SAMPLE_TAB_MASK     7
+static int sample_net_id;
+static struct tc_action_ops act_sample_ops;
+
+static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
+	[TCA_SAMPLE_PARMS]	= { .len = sizeof(struct tc_sample) },
+};
+
+static int tcf_sample_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a, int ovr,
+			   int bind)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
+	struct tc_sample *parm;
+	struct tcf_sample *s;
+	int ret;
+	bool exists = false;
+
+	if (!nla)
+		return -EINVAL;
+	ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
+	if (ret < 0)
+		return ret;
+	if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
+	    !tb[TCA_SAMPLE_MARK] || !tb[TCA_SAMPLE_ETH_TYPE])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SAMPLE_PARMS]);
+
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_sample_ops, bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+	s = to_sample(*a);
+
+	ASSERT_RTNL();
+	s->tcf_action = parm->action;
+	s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+	s->mark = nla_get_u32(tb[TCA_SAMPLE_MARK]);
+	s->eth_type = nla_get_u16(tb[TCA_SAMPLE_ETH_TYPE]);
+
+	s->truncate = tb[TCA_SAMPLE_TRUNC_SIZE];
+	if (tb[TCA_SAMPLE_TRUNC_SIZE])
+		s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
+
+	s->packet_counter = 0;
+
+	if (tb[TCA_SAMPLE_ETH_SRC])
+		ether_addr_copy(s->eth_src, nla_data(tb[TCA_SAMPLE_ETH_SRC]));
+	else
+		eth_zero_addr(s->eth_src);
+	if (tb[TCA_SAMPLE_ETH_DST])
+		ether_addr_copy(s->eth_dst, nla_data(tb[TCA_SAMPLE_ETH_DST]));
+	else
+		eth_zero_addr(s->eth_dst);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+	return ret;
+}
+
+static bool dev_ok_push(struct net_device *dev)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int tcf_sample(struct sk_buff *skb, const struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct tcf_sample *s = to_sample(a);
+	static struct ethhdr *ethhdr;
+	struct sk_buff *skb2;
+	int orig_size;
+	int retval;
+	u32 at;
+
+	tcf_lastuse_update(&s->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+
+	rcu_read_lock();
+	retval = READ_ONCE(s->tcf_action);
+
+	if (++s->packet_counter % s->rate == 0) {
+		skb2 = skb_copy(skb, GFP_ATOMIC);
+		if (!skb2)
+			goto out;
+
+		if (s->truncate)
+			skb_trim(skb2, s->trunc_size);
+
+		at = G_TC_AT(skb->tc_verd);
+		skb2->mac_len = skb->mac_len;
+
+		/* on ingress, the mac header gets poped, so push it back */
+		if (!(at & AT_EGRESS) && dev_ok_push(skb->dev))
+			skb_push(skb2, skb2->mac_len);
+
+		orig_size = skb->len + skb->dev->hard_header_len;
+		ethhdr = ife_packet_info_pack(skb2, orig_size,
+					      skb->dev->ifindex, 0);
+		if (!ethhdr)
+			goto out;
+
+		ethhdr->h_proto = htons(s->eth_type);
+		if (!is_zero_ether_addr(s->eth_src))
+			ether_addr_copy(ethhdr->h_source, s->eth_src);
+		if (!is_zero_ether_addr(s->eth_dst))
+			ether_addr_copy(ethhdr->h_dest, s->eth_dst);
+
+		skb2->mark = s->mark;
+		netif_receive_skb(skb2);
+
+		/* mirror is always swallowed */
+		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+	}
+out:
+	rcu_read_unlock();
+
+	return retval;
+}
+
+static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_sample *s = to_sample(a);
+	struct tc_sample opt = {
+		.index      = s->tcf_index,
+		.action     = s->tcf_action,
+		.refcnt     = s->tcf_refcnt - ref,
+		.bindcnt    = s->tcf_bindcnt - bind,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &s->tcf_tm);
+	if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_MARK, s->mark))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_SAMPLE_ETH_TYPE, s->eth_type))
+		goto nla_put_failure;
+
+	if (s->truncate)
+		if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
+			goto nla_put_failure;
+
+	if (!is_zero_ether_addr(s->eth_src))
+		if (nla_put(skb, TCA_SAMPLE_ETH_SRC, ETH_ALEN, s->eth_src))
+			goto nla_put_failure;
+
+	if (!is_zero_ether_addr(s->eth_dst))
+		if (nla_put(skb, TCA_SAMPLE_ETH_DST, ETH_ALEN, s->eth_dst))
+			goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_sample_ops = {
+	.kind	= "sample",
+	.type	= TCA_ACT_SAMPLE,
+	.owner	= THIS_MODULE,
+	.act	= tcf_sample,
+	.dump	= tcf_sample_dump,
+	.init	= tcf_sample_init,
+	.walk	= tcf_sample_walker,
+	.lookup	= tcf_sample_search,
+	.size	= sizeof(struct tcf_sample),
+};
+
+static __net_init int sample_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
+}
+
+static void __net_exit sample_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations sample_net_ops = {
+	.init = sample_init_net,
+	.exit = sample_exit_net,
+	.id   = &sample_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init sample_init_module(void)
+{
+	return tcf_register_action(&act_sample_ops, &sample_net_ops);
+}
+
+static void __exit sample_cleanup_module(void)
+{
+	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
+}
+
+module_init(sample_init_module);
+module_exit(sample_cleanup_module);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Packet sampling action");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 7/8] mlxsw: reg: add the Monitoring Packet Sampling Configuration Register
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

The MPSC register allows to configure ingress packet sampling on specific
port of the mlxsw device. The sampled packets are then trapped via
PKT_SAMPLE trap.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 38 +++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index a61ce34..0936b4a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -4771,6 +4771,44 @@ static inline void mlxsw_reg_mlcr_pack(char *payload, u8 local_port,
 					   MLXSW_REG_MLCR_DURATION_MAX : 0);
 }
 
+/* MPSC - Monitoring Packet Sampling Configuration Register
+ * --------------------------------------------------------
+ * MPSC Register is used to configure the Packet Sampling mechanism.
+ */
+#define MLXSW_REG_MPSC_ID 0x9080
+#define MLXSW_REG_MPSC_LEN 0x14
+
+MLXSW_REG_DEFINE(mpsc, MLXSW_REG_MPSC_ID, MLXSW_REG_MPSC_LEN);
+
+/* reg_mpsc_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mpsc, local_port, 0x00, 16, 8);
+
+/* reg_mpsc_e
+ * Enable sampling on port local_port
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpsc, e, 0x04, 30, 1);
+
+/* reg_mpsc_rate
+ * Sampling rate = 1 out of rate packets (with randomization around
+ * the point). Valid values are: 1 to 3.5*10^9
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpsc, rate, 0x08, 0, 32);
+
+static inline void mlxsw_reg_mpsc_pack(char *payload, u8 local_port, bool e,
+				       u32 rate)
+{
+	MLXSW_REG_ZERO(mpsc, payload);
+	mlxsw_reg_mpsc_local_port_set(payload, local_port);
+	mlxsw_reg_mpsc_e_set(payload, e);
+	mlxsw_reg_mpsc_rate_set(payload, rate);
+}
+
 /* SBPR - Shared Buffer Pools Register
  * -----------------------------------
  * The SBPR configures and retrieves the shared buffer pools and configuration.
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 8/8] mlxsw: packet sample: Add packet sample offloading support
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Using the MPSC regiter, add the functions that configure port packets
sampling in hardware and the necessary datatypes in the mlxsw_sp_port
struct. In addition, add the necessary trap for sampled packets and
integrate with matchall offloading to allow offloading of the sample tc
action.

The current offload support is for the tc command:

tc filter add dev <DEV> parent ffff:   \
	  matchall   \
	  action sample rate <RATE> mark <MARK> [trunc <SIZE>]   \
	  		[src <SADDR>] [dst <DADDR>] [type <TYPE>]

Where only ingress qdiscs are supported, and only a combination of
matchall classifier and sample action will lead to activating hardware
packet sampling.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 120 +++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  12 +++
 drivers/net/ethernet/mellanox/mlxsw/trap.h     |   1 +
 3 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index a5433e4..6cbf67c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -57,6 +57,8 @@
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
 #include <net/netevent.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/ife.h>
 
 #include "spectrum.h"
 #include "pci.h"
@@ -467,6 +469,16 @@ static void mlxsw_sp_span_mirror_remove(struct mlxsw_sp_port *from,
 	mlxsw_sp_span_inspected_port_unbind(from, span_entry, type);
 }
 
+static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    bool enable, u32 rate)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char mpsc_pl[MLXSW_REG_MPSC_LEN];
+
+	mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl);
+}
+
 static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port,
 					  bool is_up)
 {
@@ -1221,6 +1233,49 @@ mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port,
 	return err;
 }
 
+static int
+mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct tc_cls_matchall_offload *cls,
+				      const struct tc_action *a,
+				      bool ingress)
+{
+	struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
+	int sampler_id;
+	int err;
+
+	if (mlxsw_sp_port->sample.enable) {
+		netdev_err(mlxsw_sp_port->dev, "Sample already active\n");
+		return -EEXIST;
+	}
+
+	err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, tcf_sample_rate(a));
+	if (err)
+		return err;
+
+	sampler_id = tcf_sample_sampler_id(a);
+
+	mlxsw_sp_port->sample.enable = true;
+	mlxsw_sp_port->sample.mark = tcf_sample_mark(a);
+	mlxsw_sp_port->sample.truncate = tcf_sample_truncate(a);
+	mlxsw_sp_port->sample.trunc_size = tcf_sample_trunc_size(a);
+	mlxsw_sp_port->sample.eth_type = tcf_sample_eth_type(a);
+	mlxsw_sp_port->sample.sampler_id = TC_SAMPLE_HW_ID(sampler_id);
+	tcf_sample_eth_dst_addr(a, mlxsw_sp_port->sample.eth_dst);
+	tcf_sample_eth_src_addr(a, mlxsw_sp_port->sample.eth_src);
+
+	netdev_dbg(mlxsw_sp_port->dev, "Activate hardware sample\n");
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+
+	mall_tc_entry->cookie = cls->cookie;
+	mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE;
+	list_add_tail(&mall_tc_entry->list, &mlxsw_sp_port->mall_tc_list);
+
+	return 0;
+}
+
 static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 					  __be16 protocol,
 					  struct tc_cls_matchall_offload *cls,
@@ -1236,17 +1291,19 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 	}
 
 	tcf_exts_to_list(cls->exts, &actions);
-	list_for_each_entry(a, &actions, list) {
-		if (!is_tcf_mirred_egress_mirror(a) ||
-		    protocol != htons(ETH_P_ALL)) {
-			return -ENOTSUPP;
-		}
+	a = list_first_entry(&actions, struct tc_action, list);
 
+	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL))
 		err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, cls,
 							    a, ingress);
-		if (err)
-			return err;
-	}
+	else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL))
+		err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, cls,
+							    a, ingress);
+	else
+		return -ENOTSUPP;
+
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -1274,6 +1331,10 @@ static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 
 		mlxsw_sp_span_mirror_remove(mlxsw_sp_port, to_port, span_type);
 		break;
+	case MLXSW_SP_PORT_MALL_SAMPLE:
+		mlxsw_sp_port->sample.enable = false;
+		mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1);
+		break;
 	default:
 		WARN_ON(1);
 	}
@@ -2774,6 +2835,46 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
 	return mlxsw_sp_rx_listener_func(skb, local_port, priv);
 }
 
+static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port,
+					     void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	static struct ethhdr *ethhdr;
+	int orig_size;
+
+	if (unlikely(!mlxsw_sp_port)) {
+		dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n",
+				     local_port);
+		return;
+	}
+
+	skb->dev = mlxsw_sp_port->dev;
+	skb->mac_len = skb->dev->hard_header_len;
+	skb->mark = mlxsw_sp_port->sample.mark;
+
+	orig_size = skb->len;
+
+	if (mlxsw_sp_port->sample.truncate)
+		skb_trim(skb, mlxsw_sp_port->sample.trunc_size);
+
+	ethhdr = ife_packet_info_pack(skb, orig_size, skb->dev->ifindex, 0,
+				      mlxsw_sp_port->sample.sampler_id,
+				      mlxsw_sp_port->sample.seq++);
+	if (!ethhdr)
+		return;
+
+	if (!is_zero_ether_addr(mlxsw_sp_port->sample.eth_src))
+		ether_addr_copy(ethhdr->h_source,
+				mlxsw_sp_port->sample.eth_src);
+	if (!is_zero_ether_addr(mlxsw_sp_port->sample.eth_dst))
+		ether_addr_copy(ethhdr->h_dest, mlxsw_sp_port->sample.eth_dst);
+	ethhdr->h_proto = htons(mlxsw_sp_port->sample.eth_type);
+
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	netif_receive_skb(skb);
+}
+
 #define MLXSW_SP_RXL(_func, _trap_id, _action)			\
 	{							\
 		.func = _func,					\
@@ -2784,6 +2885,9 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
 
 static const struct mlxsw_rx_listener mlxsw_sp_rx_listener[] = {
 	MLXSW_SP_RXL(mlxsw_sp_rx_listener_func, FDB_MC, TRAP_TO_CPU),
+	MLXSW_SP_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE,
+		     MIRROR_TO_CPU),
+
 	/* Traps for specific L2 packet types, not trapped as FDB MC */
 	MLXSW_SP_RXL(mlxsw_sp_rx_listener_func, STP, TRAP_TO_CPU),
 	MLXSW_SP_RXL(mlxsw_sp_rx_listener_func, LACP, TRAP_TO_CPU),
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 04a2bc7..7f2e6fc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -229,6 +229,7 @@ struct mlxsw_sp_span_entry {
 
 enum mlxsw_sp_port_mall_action_type {
 	MLXSW_SP_PORT_MALL_MIRROR,
+	MLXSW_SP_PORT_MALL_SAMPLE,
 };
 
 struct mlxsw_sp_port_mall_mirror_tc_entry {
@@ -361,6 +362,17 @@ struct mlxsw_sp_port {
 		struct rtnl_link_stats64 *cache;
 		struct delayed_work update_dw;
 	} hw_stats;
+	struct {
+		bool enable;
+		u32 mark;
+		bool truncate;
+		u32 trunc_size;
+		u8 eth_src[ETH_ALEN];
+		u8 eth_dst[ETH_ALEN];
+		u16 eth_type;
+		u32 sampler_id;
+		u32 seq;
+	} sample;
 };
 
 struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h
index ed8e301..eebd135 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/trap.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h
@@ -54,6 +54,7 @@ enum {
 	MLXSW_TRAP_ID_IGMP_V2_REPORT = 0x32,
 	MLXSW_TRAP_ID_IGMP_V2_LEAVE = 0x33,
 	MLXSW_TRAP_ID_IGMP_V3_REPORT = 0x34,
+	MLXSW_TRAP_ID_PKT_SAMPLE = 0x38,
 	MLXSW_TRAP_ID_ARPBC = 0x50,
 	MLXSW_TRAP_ID_ARPUC = 0x51,
 	MLXSW_TRAP_ID_MTUERROR = 0x52,
-- 
2.7.4

^ permalink raw reply related

* [patch net-next 0/8] Add support for offloading packet-sampling
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa

From: Jiri Pirko <jiri@mellanox.com>

Add the sample tc action, which allows to sample packet matching
a classifier. The sample action peeks randomly packets, duplicates them,
truncates them and adds informative metadata on the packet, for example,
the input interface and the original packet length. The sampled packets
are marked to allow matching them and redirecting them to a specific
collector device.

The sampled packets metadata is packed using ife encapsulation. To do
that, this patch-set extracts ife logics from the tc_ife action into an
independent ife module, and uses that functionality to pack the metadata.
To include all the needed metadata, this patch-set introduces some new
IFE_META tlv types.

In addition, Add the support for offloading the macthall-sample tc command
in the Mellanox mlxsw driver, for ingress qdiscs.

Userspace examples for that code can be found in:
 - https://github.com/yotamgi/host-sflow: sflow client that uses the sample
   with combination of tap device to sample packets and send to a
   centralized sflow collector.
 - https://github.com/yotamgi/libife: a library for manipulating ife
   packets in userspace. The library is used in the host-sflow program to
   parse the sampled packets.

---
rfc1->v1:
 - Change ifindex sampled packets metadatum to in_ifindex and out_ifindex
 - Add sequence number metadatum to sampled packets
 - Add sampler_id metadatum to sampled packets
 - Make the user kernel interface extensible
 - Move the sampling helper function to the ife module
 - Fix ife header to be safe when CONFIG_NET_IFE is not set
 - Made the sampled packets eth_type field mandatory other then optional
 - Change the IFE_META* fields to be enum
 - Remove the ife_packet_info struct and pass the parameters directly to
   the ife_packet_ifo_pack function
 - Couple of more styling and cosmetic issues

Yotam Gigi (8):
  Introduce ife encapsulation module
  act_ife: Change to use ife module
  net: ife: Introduce new metadata tlv types
  net: ife: Introduce packet info packing method
  Introduce sample tc action
  tc: sample: Add sequence number and sampler_id fields
  mlxsw: reg: add the Monitoring Packet Sampling Configuration Register
  mlxsw: packet sample: Add packet sample offloading support

 MAINTAINERS                                    |   7 +
 drivers/net/ethernet/mellanox/mlxsw/reg.h      |  38 ++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 120 +++++++++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  12 +
 drivers/net/ethernet/mellanox/mlxsw/trap.h     |   1 +
 include/net/ife.h                              |  63 ++++++
 include/net/tc_act/tc_ife.h                    |   3 -
 include/net/tc_act/tc_sample.h                 |  74 +++++++
 include/uapi/linux/Kbuild                      |   1 +
 include/uapi/linux/ife.h                       |  24 ++
 include/uapi/linux/tc_act/Kbuild               |   1 +
 include/uapi/linux/tc_act/tc_ife.h             |  10 +-
 include/uapi/linux/tc_act/tc_sample.h          |  29 +++
 net/Kconfig                                    |   1 +
 net/Makefile                                   |   1 +
 net/ife/Kconfig                                |  16 ++
 net/ife/Makefile                               |   5 +
 net/ife/ife.c                                  | 199 +++++++++++++++++
 net/sched/Kconfig                              |  14 ++
 net/sched/Makefile                             |   1 +
 net/sched/act_ife.c                            | 109 +++------
 net/sched/act_sample.c                         | 291 +++++++++++++++++++++++++
 22 files changed, 923 insertions(+), 97 deletions(-)
 create mode 100644 include/net/ife.h
 create mode 100644 include/net/tc_act/tc_sample.h
 create mode 100644 include/uapi/linux/ife.h
 create mode 100644 include/uapi/linux/tc_act/tc_sample.h
 create mode 100644 net/ife/Kconfig
 create mode 100644 net/ife/Makefile
 create mode 100644 net/ife/ife.c
 create mode 100644 net/sched/act_sample.c

-- 
2.7.4

^ permalink raw reply

* [patch net-next 4/8] net: ife: Introduce packet info packing method
From: Jiri Pirko @ 2016-11-10 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa
In-Reply-To: <1478776988-5400-1-git-send-email-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Add the ife_packet_info_pack function which encodes some informative
metadata into a packet using the ife encapsulation. The informative
metadata consists of:
 - The packet input/output ifindices
 - The packet original size, in case it was truncated

This method is useful for anyone that wants to pass informative metadata
on a packet alongside with the packet itself. A good usage can be packet
sampling.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/ife.h | 10 ++++++++++
 net/ife/ife.c     | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/include/net/ife.h b/include/net/ife.h
index a75d4e0..5fbcfb3 100644
--- a/include/net/ife.h
+++ b/include/net/ife.h
@@ -18,6 +18,9 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
 
 void *ife_tlv_meta_next(void *skbdata);
 
+struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
+				    int in_ifindex, int out_ifindex);
+
 #else
 
 static inline void *ife_encode(struct sk_buff *skb, u16 metalen)
@@ -47,6 +50,13 @@ static inline void *ife_tlv_meta_next(void *skbdata)
 	return NULL;
 }
 
+static inline struct ethhdr *
+ife_packet_info_pack(struct sk_buff *skb, int orig_size, int in_ifindex,
+		     int out_ifindex)
+{
+	return NULL;
+}
+
 #endif
 
 #endif /* __NET_IFE_H */
diff --git a/net/ife/ife.c b/net/ife/ife.c
index ee3221a..756af46 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -138,6 +138,51 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
 }
 EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
 
+struct ethhdr *ife_packet_info_pack(struct sk_buff *skb, int orig_size,
+				    int in_ifindex, int out_ifindex)
+{
+	int curr_size;
+	void *ifetlv;
+	u16 metalen;
+
+	curr_size = skb->len;
+
+	metalen = nla_total_size(sizeof(orig_size)) +
+		  nla_total_size(sizeof(curr_size));
+
+	if (in_ifindex)
+		metalen += nla_total_size(sizeof(in_ifindex));
+	if (out_ifindex)
+		metalen += nla_total_size(sizeof(out_ifindex));
+
+	in_ifindex = htonl(in_ifindex);
+	out_ifindex = htonl(out_ifindex);
+	orig_size = htonl(orig_size);
+	curr_size = htonl(curr_size);
+
+	ifetlv = ife_encode(skb, metalen);
+	if (!ifetlv)
+		return NULL;
+
+	if (in_ifindex)
+		ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_IN_IFINDEX,
+					      sizeof(in_ifindex), &in_ifindex);
+
+	if (out_ifindex)
+		ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_OUT_IFINDEX,
+					      sizeof(out_ifindex),
+					      &out_ifindex);
+
+	ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_ORIGSIZE,
+				      sizeof(orig_size), &orig_size);
+
+	ifetlv += ife_tlv_meta_encode(ifetlv, IFE_META_SIZE,
+				      sizeof(curr_size), &curr_size);
+
+	return (struct ethhdr *) skb->data;
+}
+EXPORT_SYMBOL(ife_packet_info_pack);
+
 MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
 MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
 MODULE_DESCRIPTION("Inter-FE LFB action");
-- 
2.7.4

^ permalink raw reply related

* [patch net v2 0/2] mlxsw: Couple of router fixes
From: Jiri Pirko @ 2016-11-10 11:31 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz

From: Jiri Pirko <jiri@mellanox.com>

---
v1->v2:
- patch2:
 - use net_eq

Jiri Pirko (2):
  mlxsw: spectrum_router: Fix handling of neighbour structure
  mlxsw: spectrum_router: Ignore FIB notification events for non-init
    namespaces

 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 98 ++++++++--------------
 1 file changed, 37 insertions(+), 61 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [patch net v2 1/2] mlxsw: spectrum_router: Fix handling of neighbour structure
From: Jiri Pirko @ 2016-11-10 11:31 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz
In-Reply-To: <1478777465-6567-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

__neigh_create function works in a different way than assumed.
It passes "n" as a parameter to ndo_neigh_construct. But this "n" might
be destroyed right away before __neigh_create() returns in case there is
already another neighbour struct in the hashtable with the same dev and
primary key. That is not expected by mlxsw_sp_router_neigh_construct()
and the stored "n" points to freed memory, eventually leading to crash.

Fix this by doing tight 1:1 coupling between neighbour struct and
internal driver neigh_entry. That allows to narrow down the key in
internal driver hashtable to do lookups by "n" only.

Fixes: 6cf3c971dc84 ("mlxsw: spectrum_router: Add private neigh table")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 95 ++++++++--------------
 1 file changed, 34 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 4573da2..2863012 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -600,15 +600,13 @@ static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
 }
 
 struct mlxsw_sp_neigh_key {
-	unsigned char addr[sizeof(struct in6_addr)];
-	struct net_device *dev;
+	struct neighbour *n;
 };
 
 struct mlxsw_sp_neigh_entry {
 	struct rhash_head ht_node;
 	struct mlxsw_sp_neigh_key key;
 	u16 rif;
-	struct neighbour *n;
 	bool offloaded;
 	struct delayed_work dw;
 	struct mlxsw_sp_port *mlxsw_sp_port;
@@ -646,19 +644,15 @@ mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp,
 static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work);
 
 static struct mlxsw_sp_neigh_entry *
-mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len,
-			    struct net_device *dev, u16 rif,
-			    struct neighbour *n)
+mlxsw_sp_neigh_entry_create(struct neighbour *n, u16 rif)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 
 	neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_ATOMIC);
 	if (!neigh_entry)
 		return NULL;
-	memcpy(neigh_entry->key.addr, addr, addr_len);
-	neigh_entry->key.dev = dev;
+	neigh_entry->key.n = n;
 	neigh_entry->rif = rif;
-	neigh_entry->n = n;
 	INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw);
 	INIT_LIST_HEAD(&neigh_entry->nexthop_list);
 	return neigh_entry;
@@ -671,13 +665,11 @@ mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp_neigh_entry *neigh_entry)
 }
 
 static struct mlxsw_sp_neigh_entry *
-mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, const void *addr,
-			    size_t addr_len, struct net_device *dev)
+mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n)
 {
-	struct mlxsw_sp_neigh_key key = {{ 0 } };
+	struct mlxsw_sp_neigh_key key;
 
-	memcpy(key.addr, addr, addr_len);
-	key.dev = dev;
+	key.n = n;
 	return rhashtable_lookup_fast(&mlxsw_sp->router.neigh_ht,
 				      &key, mlxsw_sp_neigh_ht_params);
 }
@@ -689,26 +681,20 @@ int mlxsw_sp_router_neigh_construct(struct net_device *dev,
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 	struct mlxsw_sp_rif *r;
-	u32 dip;
 	int err;
 
 	if (n->tbl != &arp_tbl)
 		return 0;
 
-	dip = ntohl(*((__be32 *) n->primary_key));
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip),
-						  n->dev);
-	if (neigh_entry) {
-		WARN_ON(neigh_entry->n != n);
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (neigh_entry)
 		return 0;
-	}
 
 	r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev);
 	if (WARN_ON(!r))
 		return -EINVAL;
 
-	neigh_entry = mlxsw_sp_neigh_entry_create(&dip, sizeof(dip), n->dev,
-						  r->rif, n);
+	neigh_entry = mlxsw_sp_neigh_entry_create(n, r->rif);
 	if (!neigh_entry)
 		return -ENOMEM;
 	err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
@@ -727,14 +713,11 @@ void mlxsw_sp_router_neigh_destroy(struct net_device *dev,
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_neigh_entry *neigh_entry;
-	u32 dip;
 
 	if (n->tbl != &arp_tbl)
 		return;
 
-	dip = ntohl(*((__be32 *) n->primary_key));
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip),
-						  n->dev);
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
 	if (!neigh_entry)
 		return;
 	mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry);
@@ -862,7 +845,7 @@ static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp)
 		 * is active regardless of the traffic.
 		 */
 		if (!list_empty(&neigh_entry->nexthop_list))
-			neigh_event_send(neigh_entry->n, NULL);
+			neigh_event_send(neigh_entry->key.n, NULL);
 	}
 	rtnl_unlock();
 }
@@ -908,9 +891,9 @@ static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work)
 	rtnl_lock();
 	list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list,
 			    nexthop_neighs_list_node) {
-		if (!(neigh_entry->n->nud_state & NUD_VALID) &&
+		if (!(neigh_entry->key.n->nud_state & NUD_VALID) &&
 		    !list_empty(&neigh_entry->nexthop_list))
-			neigh_event_send(neigh_entry->n, NULL);
+			neigh_event_send(neigh_entry->key.n, NULL);
 	}
 	rtnl_unlock();
 
@@ -927,7 +910,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry =
 		container_of(work, struct mlxsw_sp_neigh_entry, dw.work);
-	struct neighbour *n = neigh_entry->n;
+	struct neighbour *n = neigh_entry->key.n;
 	struct mlxsw_sp_port *mlxsw_sp_port = neigh_entry->mlxsw_sp_port;
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	char rauht_pl[MLXSW_REG_RAUHT_LEN];
@@ -1030,11 +1013,8 @@ int mlxsw_sp_router_netevent_event(struct notifier_block *unused,
 
 		mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 		dip = ntohl(*((__be32 *) n->primary_key));
-		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp,
-							  &dip,
-							  sizeof(__be32),
-							  dev);
-		if (WARN_ON(!neigh_entry) || WARN_ON(neigh_entry->n != n)) {
+		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+		if (WARN_ON(!neigh_entry)) {
 			mlxsw_sp_port_dev_put(mlxsw_sp_port);
 			return NOTIFY_DONE;
 		}
@@ -1343,33 +1323,26 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 				 struct fib_nh *fib_nh)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry;
-	u32 gwip = ntohl(fib_nh->nh_gw);
 	struct net_device *dev = fib_nh->nh_dev;
 	struct neighbour *n;
 	u8 nud_state;
 
-	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip,
-						  sizeof(gwip), dev);
-	if (!neigh_entry) {
-		__be32 gwipn = htonl(gwip);
-
-		n = neigh_create(&arp_tbl, &gwipn, dev);
+	/* Take a reference of neigh here ensuring that neigh would
+	 * not be detructed before the nexthop entry is finished.
+	 * The reference is taken either in neigh_lookup() or
+	 * in neith_create() in case n is not found.
+	 */
+	n = neigh_lookup(&arp_tbl, &fib_nh->nh_gw, dev);
+	if (!n) {
+		n = neigh_create(&arp_tbl, &fib_nh->nh_gw, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		neigh_event_send(n, NULL);
-		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip,
-							  sizeof(gwip), dev);
-		if (!neigh_entry) {
-			neigh_release(n);
-			return -EINVAL;
-		}
-	} else {
-		/* Take a reference of neigh here ensuring that neigh would
-		 * not be detructed before the nexthop entry is finished.
-		 * The second branch takes the reference in neith_create()
-		 */
-		n = neigh_entry->n;
-		neigh_clone(n);
+	}
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (!neigh_entry) {
+		neigh_release(n);
+		return -EINVAL;
 	}
 
 	/* If that is the first nexthop connected to that neigh, add to
@@ -1403,7 +1376,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp,
 	if (list_empty(&nh->neigh_entry->nexthop_list))
 		list_del(&nh->neigh_entry->nexthop_neighs_list_node);
 
-	neigh_release(neigh_entry->n);
+	neigh_release(neigh_entry->key.n);
 }
 
 static struct mlxsw_sp_nexthop_group *
@@ -1463,11 +1436,11 @@ static bool mlxsw_sp_nexthop_match(struct mlxsw_sp_nexthop *nh,
 
 	for (i = 0; i < fi->fib_nhs; i++) {
 		struct fib_nh *fib_nh = &fi->fib_nh[i];
-		u32 gwip = ntohl(fib_nh->nh_gw);
+		struct neighbour *n = nh->neigh_entry->key.n;
 
-		if (memcmp(nh->neigh_entry->key.addr,
-			   &gwip, sizeof(u32)) == 0 &&
-		    nh->neigh_entry->key.dev == fib_nh->nh_dev)
+		if (memcmp(n->primary_key, &fib_nh->nh_gw,
+			   sizeof(fib_nh->nh_gw)) == 0 &&
+		    n->dev == fib_nh->nh_dev)
 			return true;
 	}
 	return false;
-- 
2.7.4

^ permalink raw reply related

* [patch net v2 2/2] mlxsw: spectrum_router: Ignore FIB notification events for non-init namespaces
From: Jiri Pirko @ 2016-11-10 11:31 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, eladr, yotamg, nogahf, ogerlitz
In-Reply-To: <1478777465-6567-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Since now, the table with same id in multiple netnamespaces were squashed
to a single virtual router. That is not only incorrect, it also causes
error messages when trying to use RALUE register to do double remove
of FIB entries, like this one:

mlxsw_spectrum 0000:03:00.0: EMAD reg access failed (tid=facb831c00007b20,reg_id=8013(ralue),type=write,status=7(bad parameter))

Since we don't allow ports to change namespaces (NETIF_F_NETNS_LOCAL),
and the infrastructure is not yet prepared to handle netnamespaces, just
ignore FIB notification events for non-init namespaces. That is clear to
do since we don't need to offload them.

Fixes: b45f64d16d45 ("mlxsw: spectrum_router: Use FIB notifications instead of switchdev calls")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
---
v1->v2:
- use net_eq
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 2863012..040737e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1931,6 +1931,9 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct fib_entry_notifier_info *fen_info = ptr;
 	int err;
 
+	if (!net_eq(fen_info->info.net, &init_net))
+		return NOTIFY_DONE;
+
 	switch (event) {
 	case FIB_EVENT_ENTRY_ADD:
 		err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
-- 
2.7.4

^ permalink raw reply related

* [mm PATCH v3 00/23] Add support for DMA writable pages being writable by the network stack
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: netdev, linux-kernel

The first 19 patches in the set add support for the DMA attribute
DMA_ATTR_SKIP_CPU_SYNC on multiple platforms/architectures.  This is needed
so that we can flag the calls to dma_map/unmap_page so that we do not
invalidate cache lines that do not currently belong to the device.  Instead
we have to take care of this in the driver via a call to
sync_single_range_for_cpu prior to freeing the Rx page.

Patch 20 adds support for dma_map_page_attrs and dma_unmap_page_attrs so
that we can unmap and map a page using the DMA_ATTR_SKIP_CPU_SYNC
attribute.

Patch 21 adds support for freeing a page that has multiple references being
held by a single caller.  This way we can free page fragments that were
allocated by a given driver.

The last 2 patches use these updates in the igb driver, and lay the
groundwork to allow for us to reimplement the use of build_skb.

v1: Minor fixes based on issues found by kernel build bot
    Few minor changes for issues found on code review
    Added Acked-by for patches that were acked and not changed

v2: Added a few more Acked-by
    Submitting patches to mm instead of net-next

v3: Added Acked-by for PowerPC architecture
    Dropped first 3 patches which were accepted into swiotlb tree
    Dropped comments describing swiotlb changes.

---

Alexander Duyck (23):
      arch/arc: Add option to skip sync on DMA mapping
      arch/arm: Add option to skip sync on DMA map and unmap
      arch/avr32: Add option to skip sync on DMA map
      arch/blackfin: Add option to skip sync on DMA map
      arch/c6x: Add option to skip sync on DMA map and unmap
      arch/frv: Add option to skip sync on DMA map
      arch/hexagon: Add option to skip DMA sync as a part of mapping
      arch/m68k: Add option to skip DMA sync as a part of mapping
      arch/metag: Add option to skip DMA sync as a part of map and unmap
      arch/microblaze: Add option to skip DMA sync as a part of map and unmap
      arch/mips: Add option to skip DMA sync as a part of map and unmap
      arch/nios2: Add option to skip DMA sync as a part of map and unmap
      arch/openrisc: Add option to skip DMA sync as a part of mapping
      arch/parisc: Add option to skip DMA sync as a part of map and unmap
      arch/powerpc: Add option to skip DMA sync as a part of mapping
      arch/sh: Add option to skip DMA sync as a part of mapping
      arch/sparc: Add option to skip DMA sync as a part of map and unmap
      arch/tile: Add option to skip DMA sync as a part of map and unmap
      arch/xtensa: Add option to skip DMA sync as a part of mapping
      dma: Add calls for dma_map_page_attrs and dma_unmap_page_attrs
      mm: Add support for releasing multiple instances of a page
      igb: Update driver to make use of DMA_ATTR_SKIP_CPU_SYNC
      igb: Update code to better handle incrementing page count


 arch/arc/mm/dma.c                         |    5 ++
 arch/arm/common/dmabounce.c               |   16 ++++--
 arch/avr32/mm/dma-coherent.c              |    7 ++-
 arch/blackfin/kernel/dma-mapping.c        |    8 +++
 arch/c6x/kernel/dma.c                     |   14 ++++-
 arch/frv/mb93090-mb00/pci-dma-nommu.c     |   14 ++++-
 arch/frv/mb93090-mb00/pci-dma.c           |    9 +++
 arch/hexagon/kernel/dma.c                 |    6 ++
 arch/m68k/kernel/dma.c                    |    8 +++
 arch/metag/kernel/dma.c                   |   16 +++++-
 arch/microblaze/kernel/dma.c              |   10 +++-
 arch/mips/loongson64/common/dma-swiotlb.c |    2 -
 arch/mips/mm/dma-default.c                |    8 ++-
 arch/nios2/mm/dma-mapping.c               |   26 +++++++---
 arch/openrisc/kernel/dma.c                |    3 +
 arch/parisc/kernel/pci-dma.c              |   20 ++++++--
 arch/powerpc/kernel/dma.c                 |    9 +++
 arch/sh/kernel/dma-nommu.c                |    7 ++-
 arch/sparc/kernel/iommu.c                 |    4 +-
 arch/sparc/kernel/ioport.c                |    4 +-
 arch/tile/kernel/pci-dma.c                |   12 ++++-
 arch/xtensa/kernel/pci-dma.c              |    7 ++-
 drivers/net/ethernet/intel/igb/igb.h      |    7 ++-
 drivers/net/ethernet/intel/igb/igb_main.c |   77 +++++++++++++++++++----------
 include/linux/dma-mapping.h               |   20 +++++---
 include/linux/gfp.h                       |    2 +
 mm/page_alloc.c                           |   14 +++++
 27 files changed, 246 insertions(+), 89 deletions(-)

--

^ permalink raw reply

* [mm PATCH v3 01/23] arch/arc: Add option to skip sync on DMA mapping
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: Vineet Gupta, linux-kernel, netdev
In-Reply-To: <20161110113027.76501.63030.stgit@ahduyck-blue-test.jf.intel.com>

This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
avoid invoking cache line invalidation if the driver will just handle it
later via a sync_for_cpu or sync_for_device call.

Acked-by: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 arch/arc/mm/dma.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 20afc65..6303c34 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -133,7 +133,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 		unsigned long attrs)
 {
 	phys_addr_t paddr = page_to_phys(page) + offset;
-	_dma_cache_sync(paddr, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+
 	return plat_phys_to_dma(dev, paddr);
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [mm PATCH v3 02/23] arch/arm: Add option to skip sync on DMA map and unmap
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: netdev, Russell King, linux-kernel
In-Reply-To: <20161110113027.76501.63030.stgit@ahduyck-blue-test.jf.intel.com>

The use of DMA_ATTR_SKIP_CPU_SYNC was not consistent across all of the DMA
APIs in the arch/arm folder.  This change is meant to correct that so that
we get consistent behavior.

Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 arch/arm/common/dmabounce.c |   16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 3012816..75055df 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size)
 }
 
 static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
-		enum dma_data_direction dir)
+				    enum dma_data_direction dir,
+				    unsigned long attrs)
 {
 	struct dmabounce_device_info *device_info = dev->archdata.dmabounce;
 	struct safe_buffer *buf;
@@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 		__func__, buf->ptr, virt_to_dma(dev, buf->ptr),
 		buf->safe, buf->safe_dma_addr);
 
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n",
 			__func__, ptr, buf->safe, size);
 		memcpy(buf->safe, ptr, size);
@@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 }
 
 static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
-		size_t size, enum dma_data_direction dir)
+				size_t size, enum dma_data_direction dir,
+				unsigned long attrs)
 {
 	BUG_ON(buf->size != size);
 	BUG_ON(buf->direction != dir);
@@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
 
 	DO_STATS(dev->archdata.dmabounce->bounce_count++);
 
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		void *ptr = buf->ptr;
 
 		dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n",
@@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page,
 		return DMA_ERROR_CODE;
 	}
 
-	return map_single(dev, page_address(page) + offset, size, dir);
+	return map_single(dev, page_address(page) + offset, size, dir, attrs);
 }
 
 /*
@@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t
 		return;
 	}
 
-	unmap_single(dev, buf, size, dir);
+	unmap_single(dev, buf, size, dir, attrs);
 }
 
 static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [mm PATCH v3 03/23] arch/avr32: Add option to skip sync on DMA map
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: netdev, linux-kernel, Hans-Christian Noren Egtvedt
In-Reply-To: <20161110113027.76501.63030.stgit@ahduyck-blue-test.jf.intel.com>

The use of DMA_ATTR_SKIP_CPU_SYNC was not consistent across all of the DMA
APIs in the arch/arm folder.  This change is meant to correct that so that
we get consistent behavior.

Acked-by: Hans-Christian Noren Egtvedt <egtvedt@samfundet.no>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 arch/avr32/mm/dma-coherent.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 58610d0..54534e5 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page,
 {
 	void *cpu_addr = page_address(page) + offset;
 
-	dma_cache_sync(dev, cpu_addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, cpu_addr, size, direction);
 	return virt_to_bus(cpu_addr);
 }
 
@@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 		sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
 		virt = sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_cache_sync(dev, virt, sg->length, direction);
 	}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [mm PATCH v3 04/23] arch/blackfin: Add option to skip sync on DMA map
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: netdev, linux-kernel, Steven Miao
In-Reply-To: <20161110113027.76501.63030.stgit@ahduyck-blue-test.jf.intel.com>

The use of DMA_ATTR_SKIP_CPU_SYNC was not consistent across all of the DMA
APIs in the arch/arm folder.  This change is meant to correct that so that
we get consistent behavior.

Cc: Steven Miao <realmz6@gmail.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 arch/blackfin/kernel/dma-mapping.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index 53fbbb6..a27a74a 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list,
 
 	for_each_sg(sg_list, sg, nents, i) {
 		sg->dma_address = (dma_addr_t) sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
 	}
 
@@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = (dma_addr_t)(page_address(page) + offset);
 
-	_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_sync(handle, size, dir);
+
 	return handle;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [mm PATCH v3 05/23] arch/c6x: Add option to skip sync on DMA map and unmap
From: Alexander Duyck @ 2016-11-10 11:34 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: netdev, linux-kernel, Mark Salter
In-Reply-To: <20161110113027.76501.63030.stgit@ahduyck-blue-test.jf.intel.com>

This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
avoid invoking cache line invalidation if the driver will just handle it
later via a sync_for_cpu or sync_for_device call.

Acked-by: Mark Salter <msalter@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 arch/c6x/kernel/dma.c |   14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index db4a6a3..6752df3 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = virt_to_phys(page_address(page) + offset);
 
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
 static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
 }
 
 static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		c6x_dma_sync(sg->dma_address, sg->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			c6x_dma_sync(sg->dma_address, sg->length, dir);
 	}
 
 	return nents;
@@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 	int i;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sglist, sg, nents, i)
 		c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
-
 }
 
 static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox