Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 net 3/5] l2tp: fix racy socket lookup in l2tp_ip and l2tp_ip6 bind()
From: Guillaume Nault @ 2016-11-29 12:09 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1480419658.git.g.nault@alphalink.fr>

It's not enough to check for sockets bound to same address at the
beginning of l2tp_ip{,6}_bind(): even if no socket is found at that
time, a socket with the same address could be bound before we take
the l2tp lock again.

This patch moves the lookup right before inserting the new socket, so
that no change can ever happen to the list between address lookup and
socket insertion.

Care is taken to avoid side effects on the socket in case of failure.
That is, modifications of the socket are done after the lookup, when
binding is guaranteed to succeed, and before releasing the l2tp lock,
so that concurrent lookups will always see fully initialised sockets.

For l2tp_ip, 'ret' is set to -EINVAL before checking the SOCK_ZAPPED
bit. Error code was mistakenly set to -EADDRINUSE on error by commit
32c231164b76 ("l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()").
Using -EINVAL restores original behaviour.

For l2tp_ip6, the lookup is now always done with the correct bound
device. Before this patch, when binding to a link-local address, the
lookup was done with the original sk->sk_bound_dev_if, which was later
overwritten with addr->l2tp_scope_id. Lookup is now performed with the
final sk->sk_bound_dev_if value.

Finally, the (addr_len >= sizeof(struct sockaddr_in6)) check has been
dropped: addr is a sockaddr_l2tpip6 not sockaddr_in6 and addr_len has
already been checked at this point (this part of the code seems to have
been copy-pasted from net/ipv6/raw.c).

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip.c  | 27 ++++++++++++---------------
 net/l2tp/l2tp_ip6.c | 43 ++++++++++++++++++++-----------------------
 2 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 4d1c942..b517c33 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -257,15 +257,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr->l2tp_family != AF_INET)
 		return -EINVAL;
 
-	ret = -EADDRINUSE;
-	read_lock_bh(&l2tp_ip_lock);
-	if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
-				  sk->sk_bound_dev_if, addr->l2tp_conn_id))
-		goto out_in_use;
-
-	read_unlock_bh(&l2tp_ip_lock);
-
 	lock_sock(sk);
+
+	ret = -EINVAL;
 	if (!sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
@@ -282,14 +276,22 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
 		inet->inet_saddr = 0;  /* Use device */
-	sk_dst_reset(sk);
 
+	write_lock_bh(&l2tp_ip_lock);
+	if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
+				  sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
+		write_unlock_bh(&l2tp_ip_lock);
+		ret = -EADDRINUSE;
+		goto out;
+	}
+
+	sk_dst_reset(sk);
 	l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
 
-	write_lock_bh(&l2tp_ip_lock);
 	sk_add_bind_node(sk, &l2tp_ip_bind_table);
 	sk_del_node_init(sk);
 	write_unlock_bh(&l2tp_ip_lock);
+
 	ret = 0;
 	sock_reset_flag(sk, SOCK_ZAPPED);
 
@@ -297,11 +299,6 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	release_sock(sk);
 
 	return ret;
-
-out_in_use:
-	read_unlock_bh(&l2tp_ip_lock);
-
-	return ret;
 }
 
 static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index e3fc778..5f2ae61 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -267,6 +267,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
 	struct net *net = sock_net(sk);
 	__be32 v4addr = 0;
+	int bound_dev_if;
 	int addr_type;
 	int err;
 
@@ -285,13 +286,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr_type & IPV6_ADDR_MULTICAST)
 		return -EADDRNOTAVAIL;
 
-	err = -EADDRINUSE;
-	read_lock_bh(&l2tp_ip6_lock);
-	if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
-				   sk->sk_bound_dev_if, addr->l2tp_conn_id))
-		goto out_in_use;
-	read_unlock_bh(&l2tp_ip6_lock);
-
 	lock_sock(sk);
 
 	err = -EINVAL;
@@ -301,28 +295,25 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (sk->sk_state != TCP_CLOSE)
 		goto out_unlock;
 
+	bound_dev_if = sk->sk_bound_dev_if;
+
 	/* Check if the address belongs to the host. */
 	rcu_read_lock();
 	if (addr_type != IPV6_ADDR_ANY) {
 		struct net_device *dev = NULL;
 
 		if (addr_type & IPV6_ADDR_LINKLOCAL) {
-			if (addr_len >= sizeof(struct sockaddr_in6) &&
-			    addr->l2tp_scope_id) {
-				/* Override any existing binding, if another
-				 * one is supplied by user.
-				 */
-				sk->sk_bound_dev_if = addr->l2tp_scope_id;
-			}
+			if (addr->l2tp_scope_id)
+				bound_dev_if = addr->l2tp_scope_id;
 
 			/* Binding to link-local address requires an
-			   interface */
-			if (!sk->sk_bound_dev_if)
+			 * interface.
+			 */
+			if (!bound_dev_if)
 				goto out_unlock_rcu;
 
 			err = -ENODEV;
-			dev = dev_get_by_index_rcu(sock_net(sk),
-						   sk->sk_bound_dev_if);
+			dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if);
 			if (!dev)
 				goto out_unlock_rcu;
 		}
@@ -337,13 +328,22 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 	rcu_read_unlock();
 
-	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+	write_lock_bh(&l2tp_ip6_lock);
+	if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if,
+				   addr->l2tp_conn_id)) {
+		write_unlock_bh(&l2tp_ip6_lock);
+		err = -EADDRINUSE;
+		goto out_unlock;
+	}
+
+	inet->inet_saddr = v4addr;
+	inet->inet_rcv_saddr = v4addr;
+	sk->sk_bound_dev_if = bound_dev_if;
 	sk->sk_v6_rcv_saddr = addr->l2tp_addr;
 	np->saddr = addr->l2tp_addr;
 
 	l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
 
-	write_lock_bh(&l2tp_ip6_lock);
 	sk_add_bind_node(sk, &l2tp_ip6_bind_table);
 	sk_del_node_init(sk);
 	write_unlock_bh(&l2tp_ip6_lock);
@@ -356,10 +356,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rcu_read_unlock();
 out_unlock:
 	release_sock(sk);
-	return err;
 
-out_in_use:
-	read_unlock_bh(&l2tp_ip6_lock);
 	return err;
 }
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH v2 net 5/5] l2tp: fix address test in __l2tp_ip6_bind_lookup()
From: Guillaume Nault @ 2016-11-29 12:09 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1480419658.git.g.nault@alphalink.fr>

The '!(addr && ipv6_addr_equal(addr, laddr))' part of the conditional
matches if addr is NULL or if addr != laddr.
But the intend of __l2tp_ip6_bind_lookup() is to find a sockets with
the same address, so the ipv6_addr_equal() condition needs to be
inverted.

For better clarity and consistency with the rest of the expression, the
(!X || X == Y) notation is used instead of !(X && X != Y).

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 4a86440..aa821cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -72,7 +72,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
 
 		if ((l2tp->conn_id == tunnel_id) &&
 		    net_eq(sock_net(sk), net) &&
-		    !(addr && ipv6_addr_equal(addr, laddr)) &&
+		    (!addr || ipv6_addr_equal(addr, laddr)) &&
 		    (!sk->sk_bound_dev_if || !dif ||
 		     sk->sk_bound_dev_if == dif))
 			goto found;
-- 
2.10.2

^ permalink raw reply related

* [PATCH v2 net 2/5] l2tp: hold socket before dropping lock in l2tp_ip{,6}_recv()
From: Guillaume Nault @ 2016-11-29 12:09 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1480419658.git.g.nault@alphalink.fr>

Socket must be held while under the protection of the l2tp lock; there
is no guarantee that sk remains valid after the read_unlock_bh() call.

Same issue for l2tp_ip and l2tp_ip6.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip.c  | 11 ++++++-----
 net/l2tp/l2tp_ip6.c | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 1f57094..4d1c942 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -183,14 +183,15 @@ static int l2tp_ip_recv(struct sk_buff *skb)
 
 		read_lock_bh(&l2tp_ip_lock);
 		sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id);
+		if (!sk) {
+			read_unlock_bh(&l2tp_ip_lock);
+			goto discard;
+		}
+
+		sock_hold(sk);
 		read_unlock_bh(&l2tp_ip_lock);
 	}
 
-	if (sk == NULL)
-		goto discard;
-
-	sock_hold(sk);
-
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_put;
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index af9abff..e3fc778 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -198,14 +198,15 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
 		read_lock_bh(&l2tp_ip6_lock);
 		sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
 					    0, tunnel_id);
+		if (!sk) {
+			read_unlock_bh(&l2tp_ip6_lock);
+			goto discard;
+		}
+
+		sock_hold(sk);
 		read_unlock_bh(&l2tp_ip6_lock);
 	}
 
-	if (sk == NULL)
-		goto discard;
-
-	sock_hold(sk);
-
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_put;
 
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH net 3/5] l2tp: fix racy socket lookup in l2tp_ip and l2tp_ip6 bind()
From: Guillaume Nault @ 2016-11-29 12:12 UTC (permalink / raw)
  To: kbuild test robot; +Cc: kbuild-all, netdev, James Chapman, Chris Elston
In-Reply-To: <201611290515.BXajf2gX%fengguang.wu@intel.com>

On Tue, Nov 29, 2016 at 05:10:40AM +0800, kbuild test robot wrote:
> Hi Guillaume,
> 
> [auto build test WARNING on net/master]
> 
> url:    https://github.com/0day-ci/linux/commits/Guillaume-Nault/l2tp-fixes-for-l2tp_ip-and-l2tp_ip6-socket-handling/20161129-043208
> config: x86_64-rhel (attached as .config)
> compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64 
> 
> Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
> http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
> 
> All warnings (new ones prefixed by >>):
> 
>    net/l2tp/l2tp_ip.c: In function 'l2tp_ip_bind':
> >> net/l2tp/l2tp_ip.c:299:9: warning: 'ret' may be used uninitialized in this function [-Wmaybe-uninitialized]
>      return ret;
>             ^~~
> 
I should have seen this earlier, sorry.
This is now fixed in v2.

^ permalink raw reply

* Re: [PATCH v3 net-next 1/2] net: ethernet: slicoss: add slicoss gigabit ethernet driver
From: Markus Böhme @ 2016-11-29 12:15 UTC (permalink / raw)
  To: Lino Sanfilippo, davem, charrer, liodot, gregkh, andrew
  Cc: devel, netdev, linux-kernel
In-Reply-To: <9a789bda-7d98-fa2c-8759-e046c6dd5145@gmx.de>

On 11/28/2016 09:46 PM, Lino Sanfilippo wrote:
> Hi Markus,
> 
> On 27.11.2016 18:59, Markus Böhme wrote:
>> Hello Lino,
>>
>> just some things barely worth mentioning:
>>
> 
>>
>> I found a bunch of unused #defines in slic.h. I cannot judge if they are
>> worth keeping:
>>
>> 	SLIC_VRHSTATB_LONGE
>> 	SLIC_VRHSTATB_PREA
>> 	SLIC_ISR_IO
>> 	SLIC_ISR_PING_MASK
>> 	SLIC_GIG_SPEED_MASK
>> 	SLIC_GMCR_RESET
>> 	SLIC_XCR_RESET
>> 	SLIC_XCR_XMTEN
>> 	SLIC_XCR_PAUSEEN
>> 	SLIC_XCR_LOADRNG
>> 	SLIC_REG_DBAR
>> 	SLIC_REG_PING
>> 	SLIC_REG_DUMP_CMD
>> 	SLIC_REG_DUMP_DATA
>> 	SLIC_REG_WRHOSTID
>> 	SLIC_REG_LOW_POWER
>> 	SLIC_REG_RESET_IFACE
>> 	SLIC_REG_ADDR_UPPER
>> 	SLIC_REG_HBAR64
>> 	SLIC_REG_DBAR64
>> 	SLIC_REG_CBAR64
>> 	SLIC_REG_RBAR64
>> 	SLIC_REG_WRVLANID
>> 	SLIC_REG_READ_XF_INFO
>> 	SLIC_REG_WRITE_XF_INFO
>> 	SLIC_REG_TICKS_PER_SEC
>>
>> These device IDs are not used, either, but maybe it's good to keep them
>> for documentation purposes:
>>
>> 	PCI_SUBDEVICE_ID_ALACRITECH_1000X1_2
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SES1001T
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2002XT
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2001XT
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2104ET
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2102ET
>>
> 
> I left these defines in for both documentation and to avoid gaps in
> register ranges. I would like to keep this as it is.

Seems reasonable.

[...]

>>> +static int slic_init_tx_queue(struct slic_device *sdev)
>>> +{
>>> +	struct slic_tx_queue *txq = &sdev->txq;
>>> +	struct slic_tx_buffer *buff;
>>> +	struct slic_tx_desc *desc;
>>> +	int err;
>>> +	int i;
>>
>> You could make i unsigned...
>>
> 
>>> +
>>> +	txq->len = SLIC_NUM_TX_DESCS;
>>> +	txq->put_idx = 0;
>>> +	txq->done_idx = 0;
>>> +
>>> +	txq->txbuffs = kcalloc(txq->len, sizeof(*buff), GFP_KERNEL);
>>> +	if (!txq->txbuffs)
>>> +		return -ENOMEM;
>>> +
>>> +	txq->dma_pool = dma_pool_create("slic_pool", &sdev->pdev->dev,
>>> +					sizeof(*desc), SLIC_TX_DESC_ALIGN,
>>> +					4096);
>>> +	if (!txq->dma_pool) {
>>> +		err = -ENOMEM;
>>> +		netdev_err(sdev->netdev, "failed to create dma pool\n");
>>> +		goto free_buffs;
>>> +	}
>>> +
>>> +	for (i = 0; i < txq->len; i++) {
>>
>> ...to fix a signed/unsigned comparison warning here, but...
>>
>>> +		buff = &txq->txbuffs[i];
>>> +		desc = dma_pool_zalloc(txq->dma_pool, GFP_KERNEL,
>>> +				       &buff->desc_paddr);
>>> +		if (!desc) {
>>> +			netdev_err(sdev->netdev,
>>> +				   "failed to alloc pool chunk (%i)\n", i);
>>> +			err = -ENOMEM;
>>> +			goto free_descs;
>>> +		}
>>> +
>>> +		desc->hnd = cpu_to_le32((u32)(i + 1));
>>> +		desc->cmd = SLIC_CMD_XMT_REQ;
>>> +		desc->flags = 0;
>>> +		desc->type = cpu_to_le32(SLIC_CMD_TYPE_DUMB);
>>> +		buff->desc = desc;
>>> +	}
>>> +
>>> +	return 0;
>>> +
>>> +free_descs:
>>> +	while (i--) {
>>
>> ...this would require reworking this logic to prevent an endless loop,
>> so probably not worth bothering, considering that txq->len is well
>> within the positive signed range.
> 
> AFAICS the logic does not have to be changed. The while loop will also work
> fine if "i" is unsigned.
> 

My bad! Of course you are right.


Regards,
Markus

^ permalink raw reply

* [PATCH V2 net-next] net: hns: Fix to conditionally convey RX checksum flag to stack
From: Salil Mehta @ 2016-11-29 13:09 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, mehta.salil.lnk, netdev, linux-kernel,
	linuxarm

This patch introduces the RX checksum function to check the
status of the hardware calculated checksum and its error and
appropriately convey status to the upper stack in skb->ip_summed
field.

We only support checksum for IPv4, UDP(over IPv4 or IPv6),
TCP(over IPv4 or IPv6) and SCTP but we support many L3(IPv4, IPv6,
MPLS, PPPoE etc) and L4(TCP, UDP, GRE, SCTP, IGMP, ICMP etc.)
protocols. We want to filter out L3 and L4 protocols early on for
which checksum is not supported.

Our present hardware RX Descriptor lacks L3/L4 "Checksum
Status & Error" bit (indicating whether checksum was calculated
and if there was an error encountered) for the supported protocol
received in the packet. Therefore, we do the following:
1. Filter the protocols for which checksum is not supported.
2. Check if there were any errors encountered in L3 or L4
   protocols. These errors might not just be Checksum errors but
   could be related to version, length of IPv4, UDP, TCP etc.
   2a. If L3 Errors amd L4 Errors exists, then return as our RX
       descriptor lacks Status-and-Error bits for checksum so
       cannot identify specifically if error was because of
       checksum error or other error for this packet.
   2b. If above errors do not exists, then we set checksum
       un-necessary for upper layers.

Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
Change Log:
Patch V2: Addressed the comment by David Miller
          Link: https://www.spinics.net/lists/netdev/msg406697.html
Patch V1: This patch is a result of the comments given by
          David Miller <davem@davemloft.net>
          Link: https://lkml.org/lkml/2016/6/15/27
---
 drivers/net/ethernet/hisilicon/hns/hnae.h     |    2 +
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |   71 ++++++++++++++++++++++---
 2 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 09602f1..8016854 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -99,6 +99,8 @@ enum hnae_led_state {
 #define HNS_RX_FLAG_L3ID_IPV6 0x1
 #define HNS_RX_FLAG_L4ID_UDP 0x0
 #define HNS_RX_FLAG_L4ID_TCP 0x1
+#define HNS_RX_FLAG_L4ID_SCTP 0x3
+
 
 #define HNS_TXD_ASID_S 0
 #define HNS_TXD_ASID_M (0xff << HNS_TXD_ASID_S)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 255fede..6450d0e 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -565,6 +565,66 @@ static void get_rx_desc_bnum(u32 bnum_flag, int *out_bnum)
 				   HNS_RXD_BUFNUM_M, HNS_RXD_BUFNUM_S);
 }
 
+static void hns_nic_rx_checksum(struct hns_nic_ring_data *ring_data,
+				struct sk_buff *skb, u32 flag)
+{
+	struct net_device *netdev = ring_data->napi.dev;
+	u32 l3id;
+	u32 l4id;
+
+	/* check if RX checksum offload is enabled */
+	if (unlikely(!(netdev->features & NETIF_F_RXCSUM)))
+		return;
+
+	/* We only support checksum for IPv4,UDP(over IPv4 or IPv6), TCP(over
+	 * IPv4 or IPv6) and SCTP but we support many L3(IPv4, IPv6, MPLS,
+	 * PPPoE etc) and L4(TCP, UDP, GRE, SCTP, IGMP, ICMP etc.) protocols.
+	 * We want to filter out L3 and L4 protocols early on for which checksum
+	 * is not supported.
+	 *
+	 * Our present hardware RX Descriptor lacks L3/L4 "Checksum Status &
+	 * Error" bit (indicating whether checksum was calculated and if there
+	 * was an error encountered) for the supported protocol received in the
+	 * packet. Therefore, we do the following:
+	 * 1. Filter the protocols for which checksum is not supported.
+	 * 2. Check if there were any errors encountered in L3 or L4 protocols.
+	 *    These errors might not just be Checksum errors but could be
+	 *    related to version, length of IPv4, UDP, TCP etc.
+	 *    2a. If L3 Errors amd L4 Errors exists, then return as our RX
+	 *        descriptor lacks Status-and-Error bits for checksum so cannot
+	 *        identify specifically if error was because of checksum error
+	 *        or other error for this packet.
+	 *    2b. If above errors do not exists, then we set checksum
+	 *        un-necessary for upper layers.
+	 */
+	l3id = hnae_get_field(flag, HNS_RXD_L3ID_M, HNS_RXD_L3ID_S);
+	l4id = hnae_get_field(flag, HNS_RXD_L4ID_M, HNS_RXD_L4ID_S);
+	if ((l3id != HNS_RX_FLAG_L3ID_IPV4) &&
+	    ((l3id != HNS_RX_FLAG_L3ID_IPV6) ||
+	     (l4id != HNS_RX_FLAG_L4ID_UDP)) &&
+	    ((l3id != HNS_RX_FLAG_L3ID_IPV6) ||
+	     (l4id != HNS_RX_FLAG_L4ID_TCP)) &&
+	    (l4id != HNS_RX_FLAG_L4ID_SCTP))
+		return;
+
+	/* We do not support checksum of fragmented packets */
+	if (unlikely(hnae_get_bit(flag, HNS_RXD_FRAG_B)))
+		return;
+
+	/* Check if there are any L3/L4 errors encountered during RX checksum */
+	if (unlikely(hnae_get_bit(flag, HNS_RXD_L3E_B) ||
+		     hnae_get_bit(flag, HNS_RXD_L4E_B))) {
+		/* we dont have any way to detect if this is checksum error or
+		 * any specific protocol error. Therefore, return no checksum
+		 * all protocol errors.
+		 */
+		return;
+	}
+
+	/* Now, this is a packet with valid RX checksum */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
 static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
 			       struct sk_buff **out_skb, int *out_bnum)
 {
@@ -683,13 +743,10 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
 	ring->stats.rx_pkts++;
 	ring->stats.rx_bytes += skb->len;
 
-	if (unlikely(hnae_get_bit(bnum_flag, HNS_RXD_L3E_B) ||
-		     hnae_get_bit(bnum_flag, HNS_RXD_L4E_B))) {
-		ring->stats.l3l4_csum_err++;
-		return 0;
-	}
-
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	/* indicate to upper stack if our hardware has already calculated
+	 * the RX checksum
+	 */
+	hns_nic_rx_checksum(ring_data, skb, bnum_flag);
 
 	return 0;
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 net-next 0/4] bpf: BPF for lightweight tunnel encapsulation
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes

This series implements BPF program invocation from dst entries via the
lightweight tunnels infrastructure. The BPF program can be attached to
lwtunnel_input(), lwtunnel_output() or lwtunnel_xmit() and see an L3
skb as context. Programs attached to input and output are read-only.
Programs attached to lwtunnel_xmit() can modify and redirect, push headers
and redirect packets.

The facility can be used to:
 - Collect statistics and generate sampling data for a subset of traffic
   based on the dst utilized by the packet thus allowing to extend the
   existing realms.
 - Apply additional per route/dst filters to prohibit certain outgoing
   or incoming packets based on BPF filters. In particular, this allows
   to maintain per dst custom state across multiple packets in BPF maps
   and apply filters based on statistics and behaviour observed over time.
 - Attachment of L2 headers at transmit where resolving the L2 address
   is not required.
 - Possibly many more.

v2 -> v3:
 - Added real world sample lwt_len_hist_kern.c which demonstrates how to
   collect a histogram on packet sizes for all packets flowing through
   a number of routes.
 - Restricted output to be read-only. Since the header can no longer
   be modified, the rerouting functionality has been removed again.
 - Added test case which cover destructive modification of packet data.

v1 -> v2:
 - Added new BPF_LWT_REROUTE return code for program to indicate
   that new route lookup should be performed. Suggested by Tom.
 - New sample to illustrate rerouting
 - New patch 05: Recursion limit for lwtunnel_output for the case
   when user creates circular dst redirection. Also resolves the
   issue for ILA.
 - Fix to ensure headroom for potential future L2 header is still
   guaranteed


Thomas Graf (4):
  route: Set orig_output when redirecting to lwt on locally generated
    traffic
  route: Set lwtstate for local traffic and cached input dsts
  bpf: BPF for lightweight tunnel infrastructure
  bpf: Add tests and samples for LWT-BPF

 include/linux/filter.h          |   2 +-
 include/uapi/linux/bpf.h        |  32 +++-
 include/uapi/linux/lwtunnel.h   |  23 +++
 kernel/bpf/verifier.c           |  14 +-
 net/Kconfig                     |   8 +
 net/core/Makefile               |   1 +
 net/core/filter.c               | 148 ++++++++++++++-
 net/core/lwt_bpf.c              | 397 ++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c             |   2 +
 net/ipv4/route.c                |  37 ++--
 samples/bpf/Makefile            |   4 +
 samples/bpf/bpf_helpers.h       |   4 +
 samples/bpf/lwt_len_hist.sh     |  37 ++++
 samples/bpf/lwt_len_hist_kern.c |  82 +++++++++
 samples/bpf/lwt_len_hist_user.c |  76 ++++++++
 samples/bpf/test_lwt_bpf.c      | 247 +++++++++++++++++++++++++
 samples/bpf/test_lwt_bpf.sh     | 385 ++++++++++++++++++++++++++++++++++++++
 17 files changed, 1482 insertions(+), 17 deletions(-)
 create mode 100644 net/core/lwt_bpf.c
 create mode 100755 samples/bpf/lwt_len_hist.sh
 create mode 100644 samples/bpf/lwt_len_hist_kern.c
 create mode 100644 samples/bpf/lwt_len_hist_user.c
 create mode 100644 samples/bpf/test_lwt_bpf.c
 create mode 100755 samples/bpf/test_lwt_bpf.sh

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next v3 1/4] route: Set orig_output when redirecting to lwt on locally generated traffic
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

orig_output for IPv4 was only set for dsts which hit an input route.
Set it consistently for locally generated traffic as well to allow
lwt to continue the dst_output() path as configured by the nexthop.

Fixes: 2536862311d ("lwt: Add support to redirect dst.input")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/ipv4/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d37fc6f..0c39b62 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2154,8 +2154,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate))
+	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_output = rth->dst.output;
 		rth->dst.output = lwtunnel_output;
+	}
 
 	return rth;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 2/4] route: Set lwtstate for local traffic and cached input dsts
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

A route on the output path hitting a RTN_LOCAL route will keep the dst
associated on its way through the loopback device. On the receive path,
the dst_input() call will thus invoke the input handler of the route
created in the output path. Thus, lwt redirection for input must be done
for dsts allocated in the otuput path as well.

Also, if a route is cached in the input path, the allocated dst should
respect lwtunnel configuration on the nexthop as well.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/ipv4/route.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0c39b62..f4c4d7a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1602,6 +1602,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 	spin_unlock_bh(&fnhe_lock);
 }
 
+static void set_lwt_redirect(struct rtable *rth)
+{
+	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_output = rth->dst.output;
+		rth->dst.output = lwtunnel_output;
+	}
+
+	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_input = rth->dst.input;
+		rth->dst.input = lwtunnel_input;
+	}
+}
+
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   const struct fib_result *res,
@@ -1691,14 +1704,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_output = rth->dst.output;
-		rth->dst.output = lwtunnel_output;
-	}
-	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_input = rth->dst.input;
-		rth->dst.input = lwtunnel_input;
-	}
+	set_lwt_redirect(rth);
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -1925,8 +1931,18 @@ out:	return err;
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+
 	if (do_cache) {
-		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+		struct fib_nh *nh = &FIB_RES_NH(res);
+
+		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+			WARN_ON(rth->dst.input == lwtunnel_input);
+			rth->dst.lwtstate->orig_input = rth->dst.input;
+			rth->dst.input = lwtunnel_input;
+		}
+
+		if (unlikely(!rt_cache_route(nh, rth))) {
 			rth->dst.flags |= DST_NOCACHE;
 			rt_add_uncached_list(rth);
 		}
@@ -2154,10 +2170,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_output = rth->dst.output;
-		rth->dst.output = lwtunnel_output;
-	}
+	set_lwt_redirect(rth);
 
 	return rth;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Registers new BPF program types which correspond to the LWT hooks:
  - BPF_PROG_TYPE_LWT_IN   => dst_input()
  - BPF_PROG_TYPE_LWT_OUT  => dst_output()
  - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the
capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and
   may only read the data of an skb. This prevent modification and
   possible invalidation of already validated packet headers on receive
   and the construction of illegal headers while the IP headers are
   still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet
   content as well as prepending an L2 header via a newly introduced
   helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
   after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/filter.h        |   2 +-
 include/uapi/linux/bpf.h      |  32 +++-
 include/uapi/linux/lwtunnel.h |  23 +++
 kernel/bpf/verifier.c         |  14 +-
 net/Kconfig                   |   8 +
 net/core/Makefile             |   1 +
 net/core/filter.c             | 148 +++++++++++++++-
 net/core/lwt_bpf.c            | 397 ++++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c           |   2 +
 9 files changed, 621 insertions(+), 6 deletions(-)
 create mode 100644 net/core/lwt_bpf.c

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a2..7ba6446 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@ struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d..81c02c5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,9 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_push()
+ *     Add room to beginning of skb and adjusts MAC header offset accordingly.
+ *     Extends/reallocaes for needed skb headeroom automatically.
+ *     May change skb data pointer and will thus invalidate any check done
+ *     for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -453,7 +466,8 @@ union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_push),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 453cc62..937f660 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	LWT_BPF_XMIT_HEADROOM,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 128
+
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8740c5f..8135cb1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
-				       const struct bpf_call_arg_meta *meta)
+				       const struct bpf_call_arg_meta *meta,
+				       enum bpf_access_type t)
 {
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_OUT:
+		/* dst_input() and dst_output() can't write for now */
+		if (t == BPF_WRITE)
+			return false;
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_LWT_XMIT:
 		if (meta)
 			return meta->pkt_access;
 
@@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+	if (type == PTR_TO_PACKET &&
+	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..a100500 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -402,6 +402,14 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config LWTUNNEL_BPF
+	bool "Execute BPF program as route nexthop action"
+	depends on LWTUNNEL
+	default y if LWTUNNEL=y
+	---help---
+	  Allows to run BPF programs as a nexthop action following a route
+	  lookup for incoming and outgoing packets.
+
 config DST_CACHE
 	bool
 	default n
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2..f6761b6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262..1dafe37 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2188,6 +2188,50 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static bool len_fits_mtu(struct sk_buff *skb, u32 len)
+{
+	u32 new_len = skb->len + len;
+
+	if (skb_is_gso(skb))
+		return skb_gso_validate_mtu(skb, skb->dev->mtu - len);
+
+	return new_len <= skb->dev->mtu && new_len >= skb->len;
+}
+
+BPF_CALL_3(bpf_skb_push, struct sk_buff *, skb, __u32, len, u64, flags)
+{
+	if (!len_fits_mtu(skb, len))
+		return -ERANGE;
+
+	if (flags)
+		return -EINVAL;
+
+	if (len > 0) {
+		int ret;
+
+		ret = skb_cow(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		__skb_push(skb, len);
+		memset(skb->data, 0, len);
+	}
+
+	skb_reset_mac_header(skb);
+	bpf_compute_data_end(skb);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_push_proto = {
+	.func		= bpf_skb_push,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -2197,7 +2241,8 @@ bool bpf_helper_changes_skb_data(void *func)
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace)
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_skb_push)
 		return true;
 
 	return false;
@@ -2639,6 +2684,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
+	case BPF_FUNC_clone_redirect:
+		return &bpf_clone_redirect_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_push:
+		return &bpf_skb_push_proto;
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
+	case BPF_FUNC_l3_csum_replace:
+		return &bpf_l3_csum_replace_proto;
+	case BPF_FUNC_l4_csum_replace:
+		return &bpf_l4_csum_replace_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
+	default:
+		return lwt_inout_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -3007,6 +3114,27 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops lwt_in_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_out_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+	.get_func_proto		= lwt_xmit_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3160,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+	.ops	= &lwt_in_ops,
+	.type	= BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+	.ops	= &lwt_out_ops,
+	.type	= BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+	.ops	= &lwt_xmit_ops,
+	.type	= BPF_PROG_TYPE_LWT_XMIT,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3182,9 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&lwt_in_type);
+	bpf_register_prog_type(&lwt_out_type);
+	bpf_register_prog_type(&lwt_xmit_type);
 
 	return 0;
 }
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 0000000..fdbf6f4
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
+struct bpf_lwt {
+	struct bpf_lwt_prog in;
+	struct bpf_lwt_prog out;
+	struct bpf_lwt_prog xmit;
+	int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+		       struct dst_entry *dst, bool can_redirect)
+{
+	int ret;
+
+	/* Preempt disable is needed to protect per-cpu redirect_info between
+	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+	 * access to maps strictly require a rcu_read_lock() for protection,
+	 * mixing with BH RCU lock doesn't work.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	bpf_compute_data_end(skb);
+	ret = BPF_PROG_RUN(lwt->prog, skb);
+	rcu_read_unlock();
+
+	switch (ret) {
+	case BPF_OK:
+		break;
+
+	case BPF_REDIRECT:
+		if (!can_redirect) {
+			WARN_ONCE(1, "Illegal redirect return code in prog %s\n",
+				  lwt->name ? : "<unknown>");
+			ret = BPF_OK;
+		} else {
+			ret = skb_do_redirect(skb);
+			if (ret == 0)
+				ret = BPF_REDIRECT;
+		}
+		break;
+
+	case BPF_DROP:
+		kfree_skb(skb);
+		ret = -EPERM;
+		break;
+
+	default:
+		WARN_ONCE(1, "Illegal LWT BPF return value %u, expect packet loss\n",
+			  ret);
+		kfree_skb(skb);
+		ret = -EINVAL;
+		break;
+	}
+
+	preempt_enable();
+
+	return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->in.prog) {
+		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_input)) {
+		WARN_ONCE(1, "orig_input not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->out.prog) {
+		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_output)) {
+		WARN_ONCE(1, "orig_output not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+	int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+	if (skb_headroom(skb) < hh_len) {
+		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->xmit.prog) {
+		int ret;
+
+		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+		switch (ret) {
+		case BPF_OK:
+			/* If the L3 header was expanded, headroom might be too
+			 * small for L2 header now, expand as needed.
+			 */
+			ret = xmit_check_hhlen(skb);
+			if (unlikely(ret))
+				return ret;
+
+			return LWTUNNEL_XMIT_CONTINUE;
+		case BPF_REDIRECT:
+			return LWTUNNEL_XMIT_DONE;
+		default:
+			return ret;
+		}
+	}
+
+	return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+	if (prog->prog)
+		bpf_prog_put(prog->prog);
+
+	kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	bpf_lwt_prog_destroy(&bpf->in);
+	bpf_lwt_prog_destroy(&bpf->out);
+	bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+	[LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				.len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+			  enum bpf_prog_type type)
+{
+	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+		return -EINVAL;
+
+	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+	if (!prog->name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+	p = bpf_prog_get_type(fd, type);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	prog->prog = p;
+
+	return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
+	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
+			   struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[LWT_BPF_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct bpf_lwt *bpf;
+	int ret;
+
+	if (family != AF_INET && family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*bpf));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->type = LWTUNNEL_ENCAP_BPF;
+	bpf = bpf_lwt_lwtunnel(newts);
+
+	if (tb[LWT_BPF_IN]) {
+		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+				     BPF_PROG_TYPE_LWT_IN);
+		if (ret  < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_OUT]) {
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+				     BPF_PROG_TYPE_LWT_OUT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT]) {
+		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+				     BPF_PROG_TYPE_LWT_XMIT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT_HEADROOM]) {
+		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+		if (headroom > LWT_BPF_MAX_HEADROOM) {
+			ret = -ERANGE;
+			goto errout;
+		}
+
+		newts->headroom = headroom;
+	}
+
+	bpf->family = family;
+	*ts = newts;
+
+	return 0;
+
+errout:
+	bpf_destroy_state(newts);
+	kfree(newts);
+	return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+			     struct bpf_lwt_prog *prog)
+{
+	struct nlattr *nest;
+
+	if (!prog->prog)
+		return 0;
+
+	nest = nla_nest_start(skb, attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (prog->name &&
+	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int nest_len = nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+		       0;
+
+	return nest_len + /* LWT_BPF_IN */
+	       nest_len + /* LWT_BPF_OUT */
+	       nest_len + /* LWT_BPF_XMIT */
+	       0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests which
+	 * results in a new bpf_prog instance. Comparing names for now.
+	 */
+	if (!a->name && !b->name)
+		return 0;
+
+	if (!a->name || !b->name)
+		return 1;
+
+	return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+	.build_state	= bpf_build_state,
+	.destroy_state	= bpf_destroy_state,
+	.input		= bpf_input,
+	.output		= bpf_output,
+	.xmit		= bpf_xmit,
+	.fill_encap	= bpf_fill_encap_info,
+	.get_encap_size = bpf_encap_nlsize,
+	.cmp_encap	= bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 03976e9..a5d4e86 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "ILA";
 	case LWTUNNEL_ENCAP_SEG6:
 		return "SEG6";
+	case LWTUNNEL_ENCAP_BPF:
+		return "BPF";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 4/4] bpf: Add tests and samples for LWT-BPF
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Adds a series of test to verify the functionality of attaching
BPF programs at LWT hooks.

Also adds a sample which collects a histogram of packet sizes which
pass through an LWT hook.

$ ./lwt_len_hist.sh
Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.253.2 () port 0 AF_INET : demo
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    39857.69
       1 -> 1        : 0        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 0        |                                      |
      32 -> 63       : 22       |                                      |
      64 -> 127      : 98       |                                      |
     128 -> 255      : 213      |                                      |
     256 -> 511      : 1444251  |********                              |
     512 -> 1023     : 660610   |***                                   |
    1024 -> 2047     : 535241   |**                                    |
    2048 -> 4095     : 19       |                                      |
    4096 -> 8191     : 180      |                                      |
    8192 -> 16383    : 5578023  |************************************* |
   16384 -> 32767    : 632099   |***                                   |
   32768 -> 65535    : 6575     |                                      |

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 samples/bpf/Makefile            |   4 +
 samples/bpf/bpf_helpers.h       |   4 +
 samples/bpf/lwt_len_hist.sh     |  37 ++++
 samples/bpf/lwt_len_hist_kern.c |  82 +++++++++
 samples/bpf/lwt_len_hist_user.c |  76 ++++++++
 samples/bpf/test_lwt_bpf.c      | 247 ++++++++++++++++++++++++++
 samples/bpf/test_lwt_bpf.sh     | 385 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 835 insertions(+)
 create mode 100755 samples/bpf/lwt_len_hist.sh
 create mode 100644 samples/bpf/lwt_len_hist_kern.c
 create mode 100644 samples/bpf/lwt_len_hist_user.c
 create mode 100644 samples/bpf/test_lwt_bpf.c
 create mode 100755 samples/bpf/test_lwt_bpf.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 22b6407e..3161f82 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -29,6 +29,7 @@ hostprogs-y += test_current_task_under_cgroup
 hostprogs-y += trace_event
 hostprogs-y += sampleip
 hostprogs-y += tc_l2_redirect
+hostprogs-y += lwt_len_hist
 
 test_lru_dist-objs := test_lru_dist.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
@@ -59,6 +60,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
 trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
 tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
+lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -89,6 +91,7 @@ always += xdp2_kern.o
 always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
+always += lwt_len_hist_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(objtree)/tools/testing/selftests/bpf/
@@ -117,6 +120,7 @@ HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
 HOSTLOADLIBES_tc_l2_redirect += -l elf
+HOSTLOADLIBES_lwt_len_hist += -l elf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 90f44bd..f34e417 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -80,6 +80,8 @@ struct bpf_map_def {
 	unsigned int map_flags;
 };
 
+static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =
+	(void *) BPF_FUNC_skb_load_bytes;
 static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) =
 	(void *) BPF_FUNC_skb_store_bytes;
 static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
@@ -88,6 +90,8 @@ static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flag
 	(void *) BPF_FUNC_l4_csum_replace;
 static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
 	(void *) BPF_FUNC_skb_under_cgroup;
+static int (*bpf_skb_push)(void *, int len, int flags) =
+	(void *) BPF_FUNC_skb_push;
 
 #if defined(__x86_64__)
 
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
new file mode 100755
index 0000000..3a8ee52
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+NS1=lwt_ns1
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function cleanup {
+        ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null
+        ip link del $VETH0 2> /dev/null
+        ip link del $VETH1 2> /dev/null
+	ip netns exec $NS1 killall netserver
+        ip netns delete $NS1 2> /dev/null
+}
+
+cleanup
+
+ip netns add $NS1
+ip link add $VETH0 type veth peer name $VETH1
+ip link set dev $VETH0 up
+ip addr add 192.168.253.1/24 dev $VETH0
+ip link set $VETH1 netns $NS1
+ip netns exec $NS1 ip link set dev $VETH1 up
+ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1
+ip netns exec $NS1 netserver
+
+echo 1 > ${TRACE_ROOT}/tracing_on
+cp /dev/null ${TRACE_ROOT}/trace
+ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0
+netperf -H 192.168.253.2 -t TCP_STREAM
+cat ${TRACE_ROOT}/trace | grep -v '^#'
+./lwt_len_hist
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+
+exit 0
diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c
new file mode 100644
index 0000000..df75383
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_kern.c
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include "bpf_helpers.h"
+
+# define printk(fmt, ...)						\
+		({							\
+			char ____fmt[] = fmt;				\
+			bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				     ##__VA_ARGS__);			\
+		})
+
+struct bpf_elf_map {
+	__u32 type;
+	__u32 size_key;
+	__u32 size_value;
+	__u32 max_elem;
+	__u32 flags;
+	__u32 id;
+	__u32 pinning;
+};
+
+struct bpf_elf_map SEC("maps") lwt_len_hist_map = {
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+	.size_key = sizeof(__u64),
+	.size_value = sizeof(__u64),
+	.pinning = 2,
+	.max_elem = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+SEC("len_hist")
+int do_len_hist(struct __sk_buff *skb)
+{
+	__u64 *value, key, init_val = 1;
+
+	key = log2l(skb->len);
+
+	value = bpf_map_lookup_elem(&lwt_len_hist_map, &key);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	else
+		bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY);
+
+	return BPF_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
new file mode 100644
index 0000000..05d783f
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -0,0 +1,76 @@
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+int main(int argc, char **argv)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map";
+	uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {};
+	uint64_t key = 0, next_key, max_key = 0;
+	char starstr[MAX_STARS];
+	int i, map_fd;
+
+	map_fd = bpf_obj_get(map_filename);
+	if (map_fd < 0) {
+		fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+			map_filename, strerror(errno), errno);
+		return -1;
+	}
+
+	while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+		if (next_key >= MAX_INDEX) {
+			fprintf(stderr, "Key %lu out of bounds\n", next_key);
+			continue;
+		}
+
+		bpf_lookup_elem(map_fd, &next_key, values);
+
+		sum = 0;
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+
+		data[next_key] = sum;
+		if (sum && next_key > max_key)
+			max_key = next_key;
+
+		if (sum > max_value)
+			max_value = sum;
+
+		key = next_key;
+	}
+
+	for (i = 1; i <= max_key + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+
+	close(map_fd);
+
+	return 0;
+}
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c
new file mode 100644
index 0000000..9c1773e
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.c
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include "bpf_helpers.h"
+#include <string.h>
+
+# define printk(fmt, ...)						\
+		({							\
+			char ____fmt[] = fmt;				\
+			bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				     ##__VA_ARGS__);			\
+		})
+
+#define CB_MAGIC 1234
+
+/* Test: Pass all packets through */
+SEC("nop")
+int do_nop(struct __sk_buff *skb)
+{
+	return BPF_OK;
+}
+
+/* Test: Verify context information can be accessed */
+SEC("test_ctx")
+int do_test_ctx(struct __sk_buff *skb)
+{
+	skb->cb[0] = CB_MAGIC;
+	printk("len %d hash %d protocol %d\n", skb->len, skb->hash,
+	       skb->protocol);
+	printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0],
+	       skb->ingress_ifindex, skb->ifindex);
+
+	return BPF_OK;
+}
+
+/* Test: Ensure skb->cb[] buffer is cleared */
+SEC("test_cb")
+int do_test_cb(struct __sk_buff *skb)
+{
+	printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1],
+	       skb->cb[2]);
+	printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]);
+
+	return BPF_OK;
+}
+
+/* Test: Verify skb data can be read */
+SEC("test_data")
+int do_test_data(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct iphdr *iph = data;
+
+	if (data + sizeof(*iph) > data_end) {
+		printk("packet truncated\n");
+		return BPF_DROP;
+	}
+
+	printk("src: %x dst: %x\n", iph->saddr, iph->daddr);
+
+	return BPF_OK;
+}
+
+#define IP_CSUM_OFF offsetof(struct iphdr, check)
+#define IP_DST_OFF offsetof(struct iphdr, daddr)
+#define IP_SRC_OFF offsetof(struct iphdr, saddr)
+#define IP_PROTO_OFF offsetof(struct iphdr, protocol)
+#define TCP_CSUM_OFF offsetof(struct tcphdr, check)
+#define UDP_CSUM_OFF offsetof(struct udphdr, check)
+#define IS_PSEUDO 0x10
+
+static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip,
+			  uint32_t new_ip, int rw_daddr)
+{
+	int ret, off = 0, flags = IS_PSEUDO;
+	uint8_t proto;
+
+	ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1);
+	if (ret < 0) {
+		printk("bpf_l4_csum_replace failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		off = TCP_CSUM_OFF;
+		break;
+
+	case IPPROTO_UDP:
+		off = UDP_CSUM_OFF;
+		flags |= BPF_F_MARK_MANGLED_0;
+		break;
+
+	case IPPROTO_ICMPV6:
+		off = offsetof(struct icmp6hdr, icmp6_cksum);
+		break;
+	}
+
+	if (off) {
+		ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip,
+					  flags | sizeof(new_ip));
+		if (ret < 0) {
+			printk("bpf_l4_csum_replace failed: %d\n");
+			return BPF_DROP;
+		}
+	}
+
+	ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+	if (ret < 0) {
+		printk("bpf_l3_csum_replace failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	if (rw_daddr)
+		ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
+	else
+		ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+
+	if (ret < 0) {
+		printk("bpf_skb_store_bytes() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	return BPF_OK;
+}
+
+/* Test: Verify skb data can be modified */
+SEC("test_rewrite")
+int do_test_rewrite(struct __sk_buff *skb)
+{
+	uint32_t old_ip, new_ip = 0x3fea8c0;
+	int ret;
+
+	ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4);
+	if (ret < 0) {
+		printk("bpf_skb_load_bytes failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	if (old_ip == 0x2fea8c0) {
+		printk("out: rewriting from %x to %x\n", old_ip, new_ip);
+		return rewrite(skb, old_ip, new_ip, 1);
+	}
+
+	return BPF_OK;
+}
+
+static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	uint64_t smac = SRC_MAC, dmac = DST_MAC;
+	int ret, ifindex = DST_IFINDEX;
+	struct ethhdr ehdr;
+
+	ret = bpf_skb_push(skb, 14, 0);
+	if (ret < 0) {
+		printk("skb_push() failed: %d\n", ret);
+	}
+
+	ehdr.h_proto = __constant_htons(ETH_P_IP);
+	memcpy(&ehdr.h_source, &smac, 6);
+	memcpy(&ehdr.h_dest, &dmac, 6);
+
+	ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
+	if (ret < 0) {
+		printk("skb_store_bytes() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	ret = bpf_redirect(ifindex, 0);
+	if (ret < 0) {
+		printk("bpf_redirect() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	return BPF_REDIRECT;
+}
+
+SEC("push_ll_and_redirect_silent")
+int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
+{
+	return __do_push_ll_and_redirect(skb);
+}
+
+SEC("push_ll_and_redirect")
+int do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	int ret, ifindex = DST_IFINDEX;
+
+	ret = __do_push_ll_and_redirect(skb);
+	if (ret >= 0)
+		printk("redirected to %d\n", ifindex);
+
+	return ret;
+}
+
+SEC("fill_garbage")
+int do_fill_garbage(struct __sk_buff *skb)
+{
+	uint64_t f = 0xFFFFFFFFFFFFFFFF;
+
+	bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
+
+	printk("Set initial 96 bytes of header to FF\n");
+
+	return BPF_OK;
+}
+
+/* Drop all packets */
+SEC("drop_all")
+int do_drop_all(struct __sk_buff *skb)
+{
+	printk("dropping with: %d\n", BPF_DROP);
+	return BPF_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
new file mode 100755
index 0000000..d016961
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -0,0 +1,385 @@
+#!/bin/bash
+
+# Uncomment to see generated bytecode
+#VERBOSE=verbose
+
+NS1=lwt_ns1
+NS2=lwt_ns2
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+VETH2=tst_lwt2a
+VETH3=tst_lwt2b
+IPVETH0="192.168.254.1"
+IPVETH1="192.168.254.2"
+IPVETH1b="192.168.254.3"
+
+IPVETH2="192.168.111.1"
+IPVETH3="192.168.111.2"
+
+IP_LOCAL="192.168.99.1"
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function lookup_mac()
+{
+	set +x
+	if [ ! -z "$2" ]; then
+		MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}')
+	else
+		MAC=$(ip link show $1 | grep ether | awk '{print $2}')
+	fi
+	MAC="${MAC//:/}"
+	echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}"
+	set -x
+}
+
+function cleanup {
+        set +ex
+        rm test_lwt_bpf.o 2> /dev/null
+        ip link del $VETH0 2> /dev/null
+        ip link del $VETH1 2> /dev/null
+        ip link del $VETH2 2> /dev/null
+        ip link del $VETH3 2> /dev/null
+	ip netns exec $NS1 killall netserver
+        ip netns delete $NS1 2> /dev/null
+        ip netns delete $NS2 2> /dev/null
+        set -ex
+}
+
+function setup_one_veth {
+        ip netns add $1
+        ip link add $2 type veth peer name $3
+        ip link set dev $2 up
+        ip addr add $4/24 dev $2
+        ip link set $3 netns $1
+        ip netns exec $1 ip link set dev $3 up
+        ip netns exec $1 ip addr add $5/24 dev $3
+
+	if [ "$6" ]; then
+		ip netns exec $1 ip addr add $6/32 dev $3
+	fi
+}
+
+function get_trace {
+	set +x
+        cat ${TRACE_ROOT}/trace | grep -v '^#'
+	set -x
+}
+
+function cleanup_routes {
+	ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true
+	ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true
+}
+
+function install_test {
+	cleanup_routes
+	cp /dev/null ${TRACE_ROOT}/trace
+
+	OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE"
+
+	if [ "$1" == "in" ];  then
+		ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo
+	else
+		ip route add ${IPVETH1}/32 $OPTS dev $VETH0
+	fi
+}
+
+function remove_prog {
+	if [ "$1" == "in" ];  then
+		ip route del table local local ${IP_LOCAL}/32 dev lo
+	else
+		ip route del ${IPVETH1}/32 dev $VETH0
+	fi
+}
+
+function filter_trace {
+	# Add newline to allow starting EXPECT= variables on newline
+	NL=$'\n'
+	echo "${NL}$*" | sed -e 's/^.*: : //g'
+}
+
+function expect_fail {
+	set +x
+	echo "FAIL:"
+	echo "Expected: $1"
+	echo "Got: $2"
+	set -x
+	exit 1
+}
+
+function match_trace {
+	set +x
+	RET=0
+	TRACE=$1
+	EXPECT=$2
+	GOT="$(filter_trace "$TRACE")"
+
+	[ "$GOT" != "$EXPECT" ] && {
+		expect_fail "$EXPECT" "$GOT"
+		RET=1
+	}
+	set -x
+	return $RET
+}
+
+function test_start {
+	set +x
+	echo "----------------------------------------------------------------"
+	echo "Starting test: $*"
+	echo "----------------------------------------------------------------"
+	set -x
+}
+
+function failure {
+	get_trace
+	echo "FAIL: $*"
+	exit 1
+}
+
+function test_ctx_xmit {
+	test_start "test_ctx on lwt xmit"
+	install_test xmit test_ctx
+	ping -c 3 $IPVETH1 || {
+		failure "test_ctx xmit: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1
+	remove_prog xmit
+}
+
+function test_ctx_out {
+	test_start "test_ctx on lwt out"
+	install_test out test_ctx
+	ping -c 3 $IPVETH1 || {
+		failure "test_ctx out: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0" || exit 1
+	remove_prog out
+}
+
+function test_ctx_in {
+	test_start "test_ctx on lwt in"
+	install_test in test_ctx
+	ping -c 3 $IP_LOCAL || {
+		failure "test_ctx out: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1" || exit 1
+	remove_prog in
+}
+
+function test_data {
+	test_start "test_data on lwt $1"
+	install_test $1 test_data
+	ping -c 3 $IPVETH1 || {
+		failure "test_data ${1}: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0" || exit 1
+	remove_prog $1
+}
+
+function test_data_in {
+	test_start "test_data on lwt in"
+	install_test in test_data
+	ping -c 3 $IP_LOCAL || {
+		failure "test_data in: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0" || exit 1
+	remove_prog in
+}
+
+function test_cb {
+	test_start "test_cb on lwt $1"
+	install_test $1 test_cb
+	ping -c 3 $IPVETH1 || {
+		failure "test_cb ${1}: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+	remove_prog $1
+}
+
+function test_cb_in {
+	test_start "test_cb on lwt in"
+	install_test in test_cb
+	ping -c 3 $IP_LOCAL || {
+		failure "test_cb in: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+	remove_prog in
+}
+
+function test_drop_all {
+	test_start "test_drop_all on lwt $1"
+	install_test $1 drop_all
+	ping -c 3 $IPVETH1 && {
+		failure "test_drop_all ${1}: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+	remove_prog $1
+}
+
+function test_drop_all_in {
+	test_start "test_drop_all on lwt in"
+	install_test in drop_all
+	ping -c 3 $IP_LOCAL && {
+		failure "test_drop_all in: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+	remove_prog in
+}
+
+function test_push_ll_and_redirect {
+	test_start "test_push_ll_and_redirect on lwt xmit"
+	install_test xmit push_ll_and_redirect
+	ping -c 3 $IPVETH1 || {
+		failure "Redirected packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+	remove_prog xmit
+}
+
+function test_rewrite {
+	test_start "test_rewrite on lwt xmit"
+	install_test xmit test_rewrite
+	ping -c 3 $IPVETH1 || {
+		failure "Rewritten packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" "
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0" || exit 1
+	remove_prog out
+}
+
+function test_fill_garbage {
+	test_start "test_fill_garbage on lwt xmit"
+	install_test xmit fill_garbage
+	ping -c 3 $IPVETH1 && {
+		failure "test_drop_all ${1}: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF" || exit 1
+	remove_prog xmit
+}
+
+function test_netperf_nop {
+	test_start "test_netperf_nop on lwt xmit"
+	install_test xmit nop
+	netperf -H $IPVETH1 -t TCP_STREAM || {
+		failure "packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" ""|| exit 1
+	remove_prog xmit
+}
+
+function test_netperf_redirect {
+	test_start "test_netperf_redirect on lwt xmit"
+	install_test xmit push_ll_and_redirect_silent
+	netperf -H $IPVETH1 -t TCP_STREAM || {
+		failure "Rewritten packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" ""|| exit 1
+	remove_prog xmit
+}
+
+cleanup
+setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b
+setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3
+ip netns exec $NS1 netserver
+echo 1 > ${TRACE_ROOT}/tracing_on
+
+DST_MAC=$(lookup_mac $VETH1 $NS1)
+SRC_MAC=$(lookup_mac $VETH0)
+DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex)
+
+CLANG_OPTS="-O2 -target bpf -I ../include/"
+CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX"
+clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o
+
+test_ctx_xmit
+test_ctx_out
+test_ctx_in
+test_data "xmit"
+test_data "out"
+test_data_in
+test_cb "xmit"
+test_cb "out"
+test_cb_in
+test_drop_all "xmit"
+test_drop_all "out"
+test_drop_all_in
+test_rewrite
+test_push_ll_and_redirect
+test_fill_garbage
+test_netperf_nop
+test_netperf_redirect
+
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+exit 0
-- 
2.7.4

^ permalink raw reply related

* RE: The ubufs->refcount maybe be subtracted twice when tun_get_user failed
From: wangyunjian @ 2016-11-29 13:27 UTC (permalink / raw)
  To: Jason Wang, mst@redhat.com, netdev@vger.kernel.org; +Cc: caihe
In-Reply-To: <30e4c704-70b4-d318-68bd-742eb5f9bfc7@redhat.com>

Sorry, I didn't describe it clearly. In fact, the second subtraction happens in the function handle_tx,
when tun_get_user fails and zcopy_used is ture. Fllowing the steps:

static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                                  void *msg_control, struct iov_iter *from,
                                  int noblock) {
           ...

           if (zerocopy)
                     err = zerocopy_sg_from_iter(skb, from);
           else {
                     err = skb_copy_datagram_from_iter(skb, 0, from, len);
                     if (!err && msg_control) {
                              struct ubuf_info *uarg = msg_control;
                              uarg->callback(uarg, false);                       --> step 1, the ubufs->refcount is subtracted frist.
                     }
           }

           if (err) {
                     this_cpu_inc(tun->pcpu_stats->rx_dropped);
                     kfree_skb(skb);
                     return -EFAULT;
           }

           err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
           if (err) {
                     this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
                     kfree_skb(skb);
                     return -EINVAL;                                         -->step 2, return err.
           }
}

static void handle_tx(struct vhost_net *net)
{
	...
	/* TODO: Check specific error and bomb out unless ENOBUFS? */
	err = sock->ops->sendmsg(sock, &msg, len);
	if (unlikely(err < 0)) {
		if (zcopy_used) {
			vhost_net_ubuf_put(ubufs);                                        --> step 3, the ubufs->refcount will be subtracted twice, when sendmsg execution err.
			nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
				% UIO_MAXIOV;
		}
		vhost_discard_vq_desc(vq, 1);
		break;
	}
	...
}

-----Original Message-----
From: Jason Wang [mailto:jasowang@redhat.com] 
Sent: Tuesday, November 29, 2016 6:07 PM
To: wangyunjian; mst@redhat.com; netdev@vger.kernel.org
Cc: caihe
Subject: Re: The ubufs->refcount maybe be subtracted twice when tun_get_user failed



On 2016年11月29日 17:30, wangyunjian wrote:
> In function tun_get_user , the ubufs->refcount may be subtracted twice, when msg_control is true and zerocopy is false.
>
> About the below code:
>
> static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                                  void *msg_control, struct iov_iter *from,
>                                  int noblock) {
>           ...
>
>           if (zerocopy)
>                     err = zerocopy_sg_from_iter(skb, from);
>           else {
>                     err = skb_copy_datagram_from_iter(skb, 0, from, len);
>                     if (!err && msg_control) {
>                              struct ubuf_info *uarg = msg_control;
>                              uarg->callback(uarg, false);                       --> the ubufs->refcount is subtracted frist.
>                     }
>           }
>
>           if (err) {
>                     this_cpu_inc(tun->pcpu_stats->rx_dropped);
>                     kfree_skb(skb);
>                     return -EFAULT;
>           }
>
>           err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
>           if (err) {
>                     this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
>                     kfree_skb(skb);
>                     return -EINVAL;                                   -->here, the ubufs->refcount will be subtracted twice, when virtio_net_hdr_to_skb execution err.
>           }

Just make sure I understand the problem. Since we don't set SKBTX_DEV_ZEROCOPY here if zerocopy is false, callback won't be even trigged in skb_release_data(). So we are in fact safe here?

Thanks

> switch (tun->flags & TUN_TYPE_MASK) {
>           case IFF_TUN:
>                     if (tun->flags & IFF_NO_PI) {
>                              switch (skb->data[0] & 0xf0) {
>                              case 0x40:
>                                       pi.proto = htons(ETH_P_IP);
>                                       break;
>                              case 0x60:
>                                       pi.proto = htons(ETH_P_IPV6);
>                                       break;
>                              default:
>                                       this_cpu_inc(tun->pcpu_stats->rx_dropped);
>                                       kfree_skb(skb);
>                                       return -EINVAL;                          --> this will also be subtracted twice.
>                              }
>                     }
>
>                     skb_reset_mac_header(skb);
>                     skb->protocol = pi.proto;
>                     skb->dev = tun->dev;
>                     break;
>           case IFF_TAP:
>                     skb->protocol = eth_type_trans(skb, tun->dev);
>                     break;
>           }
> 		...
> }


^ permalink raw reply

* 转发: Re: [PATCH] phy: fix the bug when remove the phy driver
From: maowenan @ 2016-11-29 13:28 UTC (permalink / raw)
  To: netdev@vger.kernel.org, Florian Fainelli; +Cc: Dingtianhong, weiyongjun (A)

Hi
When remove marvell, the call trace as below:
rmmod marvell -> module_exit(phy_module_exit) -> phy_drivers_unregister -> phy_driver_unregister -> driver_unregister -> bus_remove_driver -> driver_detach -> __device_release_driver -> drv->remove(dev), phy_remove -> phydev->drv = NULL; 

static int phy_remove(struct device *dev)
{
	struct phy_device *phydev = to_phy_device(dev);

	mutex_lock(&phydev->lock);
	phydev->state = PHY_DOWN;
	mutex_unlock(&phydev->lock);

	if (phydev->drv->remove)
		phydev->drv->remove(phydev);
	phydev->drv = NULL;

	return 0;
}

I found there is no reference count check when call drv->remove in __device_release_driver()
		if (dev->bus && dev->bus->remove)
			dev->bus->remove(dev);
		else if (drv->remove) {
			pr_info("refcount:%d\n",dev->kobj.kref.refcount.counter);
			drv->remove(dev);
		}

so I add log to trace this, found the ref count of this dev is 5, unfortunately then call phy_remove and set phydev->drv = NULL and kernel panic happen.

@fainelli, What’s your comments about checking reference counting in driver_detach(), If there is someone remain using this driver, we could not remove the phy driver kernel module.
As bellow:
		if (dev->driver == drv && (dev->kobj.kref.refcount.counter == 1))
			__device_release_driver(dev);




-----邮件原件-----
发件人: Dingtianhong 
发送时间: 2016年11月29日 10:03
收件人: maowenan
主题: Fwd: Re: [PATCH] phy: fix the bug when remove the phy driver




-------- Forwarded Message --------
Subject: Re: [PATCH] phy: fix the bug when remove the phy driver
Date: Wed, 3 Aug 2016 09:21:52 +0800
From: Ding Tianhong <dingtianhong@huawei.com>
To: Florian Fainelli <f.fainelli@gmail.com>, Netdev <netdev@vger.kernel.org>, David S. Miller <davem@davemloft.net>, linux-kernel@vger.kernel.org <linux-kernel@vger.kernel.org>, Andrew Lunn <andrew@lunn.ch>

On 2016/8/3 0:42, Florian Fainelli wrote:
> Le 02/08/2016 à 06:00, Ding Tianhong a écrit :
>> The nic in my board use the phy dev from marvell, and the system will 
>> load the marvell phy driver automatically, but when I remove the phy 
>> drivers, the system immediately panic:
>>
>> localhost login: [ 2582.052465] Unable to handle kernel NULL pointer 
>> dereference at virtual address 000000c0 [ 2582.061429] pgd = 
>> ffff800001226000 [ 2582.065277] [000000c0] *pgd=0000003f7f893003, 
>> *pud=0000003f7f894003, *pmd=0000003f7f895003, *pte=006000006d000707 [ 
>> 2582.076681] Internal error: Oops: 96000006 [#1] SMP [ 2582.082049] Modules linked in: sr_mod(E) cdrom(E) ses(E) enclosure(E) shpchp(E) crc32_arm64(E) aes_ce_blk(E) ablk_helper(E) cryptd(E) aes_ce_cipher(E) ghash_ce(E) sha2_ce(E) sha1_ce(E) usb_storage(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last unloaded: marvell]
>> [ 2582.109226] CPU: 21 PID: 1514 Comm: kworker/21:1 Tainted: G            E   4.1.27-vhulk3.6.5.aarch64 #1
>> [ 2582.119415] Hardware name: Huawei Taishan 2180 /BC11SPCC, BIOS 
>> 1.31 06/23/2016 [ 2582.127346] Workqueue: events_power_efficient 
>> phy_state_machine [ 2582.133796] task: ffff803f6f41bac0 ti: 
>> ffff803f6eca4000 task.ti: ffff803f6eca4000 [ 2582.141910] PC is at 
>> phy_state_machine+0x3c/0x438 [ 2582.147081] LR is at 
>> phy_state_machine+0x34/0x438 [ 2582.152173] pc : [<ffff800000715384>] 
>> lr : [<ffff80000071537c>] pstate: 60000145 [ 2582.160189] sp : 
>> ffff803f6eca7d30 [ 2582.163825] x29: ffff803f6eca7d30 x28: 
>> 0000000000000000 [ 2582.169689] x27: ffff805fdfd0aa00 x26: 
>> 0000000000000008 [ 2582.175482] x25: 0000000000000000 x24: 
>> ffff80000112c57c [ 2582.181391] x23: ffff805f4fc8a800 x22: 
>> ffff805fdfd0f700 [ 2582.187241] x21: ffff805f4fc8abf8 x20: 
>> ffff805f4fc8a770 [ 2582.193035] x19: ffff805f4fc8ab70 x18: 
>> 0000000000000007 [ 2582.198914] x17: 000000000000000e x16: 
>> 0000000000000001 [ 2582.204710] x15: 0000000000000007 x14: 
>> 000000000000000e [ 2582.210584] x13: 0000000000000200 x12: 
>> 0000000055555556 [ 2582.216373] x11: 0000000000001c00 x10: 
>> 0000000000000000 [ 2582.222166] x9 : ffff800000a36880 x8 : 
>> ffff803f6f41c060 [ 2582.227994] x7 : 000000010008b39b x6 : 
>> ffff80000101e690 [ 2582.233788] x5 : 0000000000000000 x4 : 
>> 0000000000800000 [ 2582.239612] x3 : 0000000000000000 x2 : 
>> 0000000000000000 [ 2582.245404] x1 : 0000000000000000 x0 : 
>> 0000000000000000 [ 2582.265813] [ 2582.282971] Process kworker/21:1 
>> (pid: 1514, stack limit = 0xffff803f6eca4020) [ 2582.307284] Stack: 
>> (0xffff803f6eca7d30 to 0xffff803f6eca8000)
>> [ 2582.331183] 7d20:                                   ffff803f6eca7d70 ffff8000000db3b8
>> [ 2582.354788] 7d40: ffff803f6e696000 ffff805f4fc8ab70 
>> ffff805fdfd0aa00 ffff805fdfd0f700 [ 2582.378341] 7d60: 
>> 0000000000000000 ffff80000112c57c ffff803f6eca7dc0 ffff8000000db7d4 [ 
>> 2582.403700] 7d80: ffff803f6e696000 ffff803f6e696030 ffff805fdfd0aa18 
>> ffff805fdfd0aa00 [ 2582.428385] 7da0: ffff803f6eca4000 
>> ffff80000112c57c 0000000000000000 0000000000000008 [ 2582.451222] 
>> 7dc0: ffff803f6eca7e20 ffff8000000e1d0c ffff805f4fc15000 
>> ffff80000115f188 [ 2582.474577] 7de0: ffff800000d1cf88 
>> ffff803f6e696000 ffff8000000db690 0000000000000000 [ 2582.497281] 
>> 7e00: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.522992] 7e20: 0000000000000000 
>> ffff800000083dd0 ffff8000000e1c10 ffff805f4fc15000 [ 2582.546945] 
>> 7e40: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.568550] 7e60: 0000000000000000 
>> ffff8000000ee33c ffff803f6e696000 ffff805f00000000 [ 2582.589157] 
>> 7e80: 0000000000000000 ffff803f6eca7e88 ffff803f6eca7e88 
>> 0000000000000000 [ 2582.609767] 7ea0: 0000000000000000 
>> ffff803f6eca7ea8 ffff803f6eca7ea8 cb88537fdc8ba31b [ 2582.633228] 
>> 7ec0: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.655386] 7ee0: 0000000000000000 
>> 0000000000000000 0000000000000000 0000000000000000 [ 2582.674223] 
>> 7f00: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.692994] 7f20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.711765] 7f40: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.730809] 7f60: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.748601] 7f80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.769388] 7fa0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.786756] 7fc0: 0000000000000000 0000000000000005 0000000000000000 0000000000000000 [ 2582.804134] 7fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.821488] Call trace:
>> [ 2582.834493] [<ffff800000715384>] phy_state_machine+0x3c/0x438 [ 
>> 2582.851754] [<ffff8000000db3b8>] process_one_work+0x150/0x428 [ 
>> 2582.868188] [<ffff8000000db7d4>] worker_thread+0x144/0x4b0 [ 
>> 2582.883882] [<ffff8000000e1d0c>] kthread+0xfc/0x110
>>
>> ============================cut here===============================
>>
>> The phy_state_machine was still running and didn't know whether the 
>> phydev's driver was removed or not, then occur this problem, so we 
>> need to stop the phy_state_machine when removing the phy driver.
> 
> Your explanation of the problem is unclear to me, unless a network 
> driver attached to the PHY and started it, and then never stopped the 
> PHY state machine, this should not happen, also there should be proper 
> reference counting in place to avoid that. Your trace is based on 4.1 
> is this still reproducible with latest vanilla? Is this with a 
> mainline Ethernet driver?
> 

Yes,the network driver already attached to the PHY and started it, and all of them could work well if I didn't do anything, but if I suddenly remove the marvall.ko, the phy_state_machine was still running, but the phydev->drv is NULL at this time, than oops, I found this problem at 4.1 lts, and didn't see any effective improvement in the kernel 4.8, so report this bug and fix it.

Thanks.
Ding



^ permalink raw reply

* [PATCH] ehea: Remove unnecessary memset of stats in netdev private data
From: Tobias Klauser @ 2016-11-29 13:35 UTC (permalink / raw)
  To: Douglas Miller; +Cc: netdev

The memory for netdev private data is allocated using kzalloc/vzalloc in
alloc_netdev_mqs, thus there is no need to zero the stats portion of it
again in the driver's probe function.

In any case, the size for the memset is wrong as the stats member is of
type rtnl_link_stats64, not net_device_stats.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
---
 drivers/net/ethernet/ibm/ehea/ehea_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index fa66fa6f8bee..702446a93697 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -3044,7 +3044,6 @@ static struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 	init_waitqueue_head(&port->swqe_avail_wq);
 	init_waitqueue_head(&port->restart_wq);
 
-	memset(&port->stats, 0, sizeof(struct net_device_stats));
 	ret = register_netdev(dev);
 	if (ret) {
 		pr_err("register_netdev failed. ret=%d\n", ret);
-- 
2.11.0.rc3.5.g7cdf2ab.dirty

^ permalink raw reply related

* RE: [PATCH net-next v3 3/4] Documentation: net: phy: Add blurb about RGMII
From: David Laight @ 2016-11-29 13:47 UTC (permalink / raw)
  To: 'Florian Fainelli', netdev@vger.kernel.org
  Cc: davem@davemloft.net, andrew@lunn.ch, sf84@laposte.net,
	martin.blumenstingl@googlemail.com, mans@mansr.com,
	alexandre.torgue@st.com, peppe.cavallaro@st.com,
	timur@codeaurora.org, jbrunet@baylibre.com
In-Reply-To: <20161128024515.13070-4-f.fainelli@gmail.com>

From: Florian Fainelli
> Sent: 28 November 2016 02:45
> RGMII is a recurring source of pain for people with Gigabit Ethernet
> hardware since it may require PHY driver and MAC driver level
> configuration hints. Document what are the expectations from PHYLIB and
> what options exist.
...
> + * PHY_INTERFACE_MODE_RGMII_TXID: the PHY should insert an internal delay
> +   for the transmit data lines (TXD[3:0]) processed by the PHY device
> +
> + * PHY_INTERFACE_MODE_RGMII_RXID: the PHY should insert an internal delay
> +   for the receive data lines (RXD[3:0]) processed by the PHY device

These delays are subtly different, so the texts shouldn't really be the same.
I think the clock is delayed before latching the TX data and the RX data gets
delayed from the clock.

...
> + * Switching to lower speeds such as 10/100Mbits/sec makes the problem go away
> +   (since there is enough setup/hold time in that case)

The setup/hold times aren't directly affected by the lower speed.
So something else is going on.
Possibly the data is changed on one edge and sampled on the other at lower
speeds, but changes every clock edge at 1Gb.

	David

^ permalink raw reply

* Re: [PATCH v2 net 0/5] l2tp: fixes for l2tp_ip and l2tp_ip6 socket handling
From: James Chapman @ 2016-11-29 13:47 UTC (permalink / raw)
  To: Guillaume Nault, netdev; +Cc: Chris Elston
In-Reply-To: <cover.1480419658.git.g.nault@alphalink.fr>

On 29/11/16 12:09, Guillaume Nault wrote:
> This series addresses problems found while working on commit 32c231164b76
> ("l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()").
>
> The first three patches fix races in socket's connect, recv and bind
> operations. The last two ones fix scenarios where l2tp fails to
> correctly lookup its userspace sockets.
>
> Apart from the last patch, which is l2tp_ip6 specific, every patch
> fixes the same problem in the L2TP IPv4 and IPv6 code.
>
> All problems fixed by this series exist since the creation of the
> l2tp_ip and l2tp_ip6 modules.
>
> Changes since v1:
>   * Patch #3: fix possible uninitialised use of 'ret' in l2tp_ip_bind().
>
>
> Guillaume Nault (5):
>   l2tp: lock socket before checking flags in connect()
>   l2tp: hold socket before dropping lock in l2tp_ip{,6}_recv()
>   l2tp: fix racy socket lookup in l2tp_ip and l2tp_ip6 bind()
>   l2tp: fix lookup for sockets not bound to a device in l2tp_ip
>   l2tp: fix address test in __l2tp_ip6_bind_lookup()
>
>  include/net/ipv6.h  |  2 ++
>  net/ipv6/datagram.c |  4 ++-
>  net/l2tp/l2tp_ip.c  | 63 ++++++++++++++++++++++--------------------
>  net/l2tp/l2tp_ip6.c | 79 ++++++++++++++++++++++++++++-------------------------
>  4 files changed, 81 insertions(+), 67 deletions(-)
>

Looks good.

Acked-by: James Chapman <jchapman@katalix.com>

^ permalink raw reply

* Re: [PATCH net-next] cgroup, bpf: remove unnecessary #include
From: Daniel Mack @ 2016-11-29 14:05 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, David S . Miller; +Cc: Tejun Heo, netdev
In-Reply-To: <583D5D00.4070101@iogearbox.net>

On 11/29/2016 11:48 AM, Daniel Borkmann wrote:
> On 11/26/2016 08:23 AM, Alexei Starovoitov wrote:
>> this #include is unnecessary and brings whole set of
>> other headers into cgroup-defs.h. Remove it.
>>
>> Fixes: 3007098494be ("cgroup: add support for eBPF programs")
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> 
> This fixes many build errors in samples/bpf/ due to wrong helper
> redefinitions (originating from kernel includes conflicting with
> samples' helper declarations).
> 
> I don't see it pushed out to net-next yet, so:
> 
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> 

FWIW:

Acked-by: Daniel Mack <daniel@zonque.org>

^ permalink raw reply

* Re: [PATCH v3 net-next 0/4] bpf: BPF for lightweight tunnel encapsulation
From: Hannes Frederic Sowa @ 2016-11-29 14:15 UTC (permalink / raw)
  To: Thomas Graf, davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Hi Thomas,

On 29.11.2016 14:21, Thomas Graf wrote:
> This series implements BPF program invocation from dst entries via the
> lightweight tunnels infrastructure. The BPF program can be attached to
> lwtunnel_input(), lwtunnel_output() or lwtunnel_xmit() and see an L3
> skb as context. Programs attached to input and output are read-only.
> Programs attached to lwtunnel_xmit() can modify and redirect, push headers
> and redirect packets.
> 
> The facility can be used to:
>  - Collect statistics and generate sampling data for a subset of traffic
>    based on the dst utilized by the packet thus allowing to extend the
>    existing realms.
>  - Apply additional per route/dst filters to prohibit certain outgoing
>    or incoming packets based on BPF filters. In particular, this allows
>    to maintain per dst custom state across multiple packets in BPF maps
>    and apply filters based on statistics and behaviour observed over time.
>  - Attachment of L2 headers at transmit where resolving the L2 address
>    is not required.
>  - Possibly many more.

Did you look at the cgroup based hooks which were added recently in
ip_finish_output for cgroup ebpf support and in general the cgroup bpf
subsystem. Does some of this solve the problem for you already? Would be
interesting to hear your opinion on that.

Thanks,
Hannes

^ permalink raw reply

* Re: [PATCH net] bpf: fix states equal logic for varlen access
From: Josef Bacik @ 2016-11-29 14:45 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, netdev, daniel, ast, jannh
In-Reply-To: <20161129030446.GA13735@ast-mbp.thefacebook.com>

On 11/28/2016 10:04 PM, Alexei Starovoitov wrote:
> On Mon, Nov 28, 2016 at 02:44:10PM -0500, Josef Bacik wrote:
>> If we have a branch that looks something like this
>>
>> int foo = map->value;
>> if (condition) {
>>   foo += blah;
>> } else {
>>   foo = bar;
>> }
>> map->array[foo] = baz;
>>
>> We will incorrectly assume that the !condition branch is equal to the condition
>> branch as the register for foo will be UNKNOWN_VALUE in both cases.  We need to
>> adjust this logic to only do this if we didn't do a varlen access after we
>> processed the !condition branch, otherwise we have different ranges and need to
>> check the other branch as well.
>>
>> Signed-off-by: Josef Bacik <jbacik@fb.com>
>> ---
>>  kernel/bpf/verifier.c | 10 ++++++++--
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 89f787c..2c8a688 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2478,6 +2478,7 @@ static bool states_equal(struct bpf_verifier_env *env,
>>  {
>>  	struct bpf_reg_state *rold, *rcur;
>>  	int i;
>> +	bool map_access = env->varlen_map_value_access;
>
> that's a bit misleading name for the variable.
> Pls call it varlen_map_access.
>
>>  	for (i = 0; i < MAX_BPF_REG; i++) {
>>  		rold = &old->regs[i];
>> @@ -2489,12 +2490,17 @@ static bool states_equal(struct bpf_verifier_env *env,
>>  		/* If the ranges were not the same, but everything else was and
>>  		 * we didn't do a variable access into a map then we are a-ok.
>>  		 */
>> -		if (!env->varlen_map_value_access &&
>> +		if (!map_access &&
>>  		    rold->type == rcur->type && rold->imm == rcur->imm)
>
> just noticed that this one is missing comparing rold->id == rcur->id
>

Do you want me to fix that here?  I'll fix up the rest of the stuff, and Daniels 
things as well.  Thanks,

Josef

^ permalink raw reply

* [PATCH v2 net-next 00/11] qed*: Add XDP support
From: Yuval Mintz @ 2016-11-29 14:46 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz

This patch series is intended to add XDP to the qede driver, although
it contains quite a bit of cleanups, refactorings and infrastructure
changes as well.

The content of this series can be roughly divided into:

 - Datapath improvements - mostly focused on having the datapath utilize
parameters which can be more tightly contained in cachelines.
Patches #1, #2, #8, #9 belong to this group.

 - Refactoring - done mostly in favour of XDP. Patches #3, #4, #5, #9.

 - Infrastructure changes - done in favour of XDP. Paches #6 and #7 belong
to this category [#7 being by far the biggest patch in the series].

 - Actual XDP support - last two patches [#10, #11].

Hi Dave,

Please consider applying this to `net-next'.

Thanks,
Yuval

Changes from previous versions
------------------------------

 v2:
  - Allow setting of the eBPF even when interface is DOWN.
  - Fix warnings when compiling for an arch with 64Kb pages.

Yuval Mintz (11):
  qede: Optimize aggregation information size
  qed: Optimize qed_chain datapath usage
  qede: Remove 'num_tc'.
  qede: Refactor statistics gathering
  qede: Refactor data-path Rx flow
  qede: Revise state locking scheme
  qed*: Handle-based L2-queues.
  qede: Don't check netdevice for rx-hash
  qede: Better utilize the qede_[rt]x_queue
  qede: Add basic XDP support
  qede: Add support for XDP_TX

 drivers/net/ethernet/qlogic/qed/qed.h             |   12 -
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   33 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  595 +++++----
 drivers/net/ethernet/qlogic/qed/qed_l2.h          |  133 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |    4 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       |  275 ++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |   21 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c          |   90 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.h          |   40 +-
 drivers/net/ethernet/qlogic/qede/qede.h           |  162 ++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c   |  210 ++--
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 1395 ++++++++++++---------
 include/linux/qed/qed_chain.h                     |  144 ++-
 include/linux/qed/qed_eth_if.h                    |   56 +-
 14 files changed, 1879 insertions(+), 1291 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH v2 net-next 01/11] qede: Optimize aggregation information size
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver needs to maintain a structure per-each concurrent possible
open aggregation, but the structure storing that metadata is far from
being optimized - biggest waste in it is that there are 2 buffer metadata,
one for a replacement buffer when the aggregation begins and the other for
holding the first aggregation's buffer after it begins [as firmware might
still update it]. Those 2 can safely be united into a single metadata
structure.

struct qede_agg_info changes the following:

	/* size: 120, cachelines: 2, members: 9 */
	/* sum members: 114, holes: 1, sum holes: 4 */
	/* padding: 2 */
	/* paddings: 2, sum paddings: 8 */
	/* last cacheline: 56 bytes */
 -->
	/* size: 48, cachelines: 1, members: 9 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 48 bytes */

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h      | 29 +++++++++----
 drivers/net/ethernet/qlogic/qede/qede_main.c | 63 ++++++++++++----------------
 2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 0cba21b..1d4c7e0 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -225,15 +225,30 @@ enum qede_agg_state {
 };
 
 struct qede_agg_info {
-	struct sw_rx_data replace_buf;
-	dma_addr_t replace_buf_mapping;
-	struct sw_rx_data start_buf;
-	dma_addr_t start_buf_mapping;
-	struct eth_fast_path_rx_tpa_start_cqe start_cqe;
-	enum qede_agg_state agg_state;
+	/* rx_buf is a data buffer that can be placed / consumed from rx bd
+	 * chain. It has two purposes: We will preallocate the data buffer
+	 * for each aggregation when we open the interface and will place this
+	 * buffer on the rx-bd-ring when we receive TPA_START. We don't want
+	 * to be in a state where allocation fails, as we can't reuse the
+	 * consumer buffer in the rx-chain since FW may still be writing to it
+	 * (since header needs to be modified for TPA).
+	 * The second purpose is to keep a pointer to the bd buffer during
+	 * aggregation.
+	 */
+	struct sw_rx_data buffer;
+	dma_addr_t buffer_mapping;
+
 	struct sk_buff *skb;
-	int frag_id;
+
+	/* We need some structs from the start cookie until termination */
 	u16 vlan_tag;
+	u16 start_cqe_bd_len;
+	u8 start_cqe_placement_offset;
+
+	u8 state;
+	u8 frag_id;
+
+	u8 tunnel_type;
 };
 
 struct qede_rx_queue {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b84a2c4..653be22 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1058,7 +1058,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[tpa_agg_index];
 	struct sk_buff *skb = tpa_info->skb;
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto out;
 
 	/* Add one frag and update the appropriate fields in the skb */
@@ -1084,7 +1084,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	return 0;
 
 out:
-	tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+	tpa_info->state = QEDE_AGG_STATE_ERROR;
 	qede_recycle_rx_bd_ring(rxq, edev, 1);
 	return -ENOMEM;
 }
@@ -1096,8 +1096,8 @@ static void qede_tpa_start(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[cqe->tpa_agg_index];
 	struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring);
 	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
-	struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
-	dma_addr_t mapping = tpa_info->replace_buf_mapping;
+	struct sw_rx_data *replace_buf = &tpa_info->buffer;
+	dma_addr_t mapping = tpa_info->buffer_mapping;
 	struct sw_rx_data *sw_rx_data_cons;
 	struct sw_rx_data *sw_rx_data_prod;
 	enum pkt_hash_types rxhash_type;
@@ -1122,11 +1122,11 @@ static void qede_tpa_start(struct qede_dev *edev,
 	/* move partial skb from cons to pool (don't unmap yet)
 	 * save mapping, incase we drop the packet later on.
 	 */
-	tpa_info->start_buf = *sw_rx_data_cons;
+	tpa_info->buffer = *sw_rx_data_cons;
 	mapping = HILO_U64(le32_to_cpu(rx_bd_cons->addr.hi),
 			   le32_to_cpu(rx_bd_cons->addr.lo));
 
-	tpa_info->start_buf_mapping = mapping;
+	tpa_info->buffer_mapping = mapping;
 	rxq->sw_rx_cons++;
 
 	/* set tpa state to start only if we are able to allocate skb
@@ -1137,23 +1137,25 @@ static void qede_tpa_start(struct qede_dev *edev,
 					 le16_to_cpu(cqe->len_on_first_bd));
 	if (unlikely(!tpa_info->skb)) {
 		DP_NOTICE(edev, "Failed to allocate SKB for gro\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 		goto cons_buf;
 	}
 
-	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
-	memcpy(&tpa_info->start_cqe, cqe, sizeof(tpa_info->start_cqe));
-
 	/* Start filling in the aggregation info */
+	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
 	tpa_info->frag_id = 0;
-	tpa_info->agg_state = QEDE_AGG_STATE_START;
+	tpa_info->state = QEDE_AGG_STATE_START;
 
 	rxhash = qede_get_rxhash(edev, cqe->bitfields,
 				 cqe->rss_hash, &rxhash_type);
 	skb_set_hash(tpa_info->skb, rxhash, rxhash_type);
+
+	/* Store some information from first CQE */
+	tpa_info->start_cqe_placement_offset = cqe->placement_offset;
+	tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd);
 	if ((le16_to_cpu(cqe->pars_flags.flags) >>
 	     PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT) &
-		    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
+	    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
 		tpa_info->vlan_tag = le16_to_cpu(cqe->vlan_tag);
 	else
 		tpa_info->vlan_tag = 0;
@@ -1169,7 +1171,7 @@ static void qede_tpa_start(struct qede_dev *edev,
 	if (unlikely(cqe->ext_bd_len_list[1])) {
 		DP_ERR(edev,
 		       "Unlikely - got a TPA aggregation with more than one ext_bd_len_list entry in the TPA start\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 	}
 }
 
@@ -1276,7 +1278,7 @@ static void qede_tpa_end(struct qede_dev *edev,
 		DP_ERR(edev,
 		       "Strange - TPA emd with more than a single len_list entry\n");
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto err;
 
 	/* Sanity */
@@ -1290,14 +1292,9 @@ static void qede_tpa_end(struct qede_dev *edev,
 		       le16_to_cpu(cqe->total_packet_len), skb->len);
 
 	memcpy(skb->data,
-	       page_address(tpa_info->start_buf.data) +
-		tpa_info->start_cqe.placement_offset +
-		tpa_info->start_buf.page_offset,
-	       le16_to_cpu(tpa_info->start_cqe.len_on_first_bd));
-
-	/* Recycle [mapped] start buffer for the next replacement */
-	tpa_info->replace_buf = tpa_info->start_buf;
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
+	       page_address(tpa_info->buffer.data) +
+	       tpa_info->start_cqe_placement_offset +
+	       tpa_info->buffer.page_offset, tpa_info->start_cqe_bd_len);
 
 	/* Finalize the SKB */
 	skb->protocol = eth_type_trans(skb, edev->ndev);
@@ -1310,18 +1307,11 @@ static void qede_tpa_end(struct qede_dev *edev,
 
 	qede_gro_receive(edev, fp, skb, tpa_info->vlan_tag);
 
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 
 	return;
 err:
-	/* The BD starting the aggregation is still mapped; Re-use it for
-	 * future aggregations [as replacement buffer]
-	 */
-	memcpy(&tpa_info->replace_buf, &tpa_info->start_buf,
-	       sizeof(struct sw_rx_data));
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
-	tpa_info->start_buf.data = NULL;
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 	dev_kfree_skb_any(tpa_info->skb);
 	tpa_info->skb = NULL;
 }
@@ -2823,7 +2813,7 @@ static void qede_free_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		if (replace_buf->data) {
 			dma_unmap_page(&edev->pdev->dev,
@@ -2905,7 +2895,7 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		replace_buf->data = alloc_pages(GFP_ATOMIC, 0);
 		if (unlikely(!replace_buf->data)) {
@@ -2923,10 +2913,9 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 		}
 
 		replace_buf->mapping = mapping;
-		tpa_info->replace_buf.page_offset = 0;
-
-		tpa_info->replace_buf_mapping = mapping;
-		tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+		tpa_info->buffer.page_offset = 0;
+		tpa_info->buffer_mapping = mapping;
+		tpa_info->state = QEDE_AGG_STATE_NONE;
 	}
 
 	return 0;
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 02/11] qed: Optimize qed_chain datapath usage
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

The chain structure and functions are widely used by the qed* modules,
both for configuration and datapath.
E.g., qede's Tx has one such chain and its Rx has two.

Currently, the strucutre's fields which are required for datapath
related functions [produce/consume] are intertwined with fields which
are required only for configuration purposes [init/destroy/etc.].

This patch re-arranges the chain structure so that all the fields which
are required for datapath usage could reside in a single cacheline instead
of the two which are required today.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   7 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   4 +-
 include/linux/qed/qed_chain.h                     | 144 ++++++++++++----------
 3 files changed, 86 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 5be7b8a..80162ee 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2283,12 +2283,12 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 {
 	void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl;
 	u32 page_cnt = p_chain->page_cnt, i, pbl_size;
-	u8 *p_pbl_virt = p_chain->pbl.p_virt_table;
+	u8 *p_pbl_virt = p_chain->pbl_sp.p_virt_table;
 
 	if (!pp_virt_addr_tbl)
 		return;
 
-	if (!p_chain->pbl.p_virt_table)
+	if (!p_pbl_virt)
 		goto out;
 
 	for (i = 0; i < page_cnt; i++) {
@@ -2306,7 +2306,8 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 	pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE;
 	dma_free_coherent(&cdev->pdev->dev,
 			  pbl_size,
-			  p_chain->pbl.p_virt_table, p_chain->pbl.p_phys_table);
+			  p_chain->pbl_sp.p_virt_table,
+			  p_chain->pbl_sp.p_phys_table);
 out:
 	vfree(p_chain->pbl.pp_virt_addr_tbl);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 2888eb0..d0a5828 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -347,11 +347,11 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
-		       p_hwfn->p_eq->chain.pbl.p_phys_table);
+		       p_hwfn->p_eq->chain.pbl_sp.p_phys_table);
 	page_cnt = (u8)qed_chain_get_page_cnt(&p_hwfn->p_eq->chain);
 	p_ramrod->event_ring_num_pages = page_cnt;
 	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
-		       p_hwfn->p_consq->chain.pbl.p_phys_table);
+		       p_hwfn->p_consq->chain.pbl_sp.p_phys_table);
 
 	qed_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config);
 
diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 72d88cf..615f34c 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -56,23 +56,6 @@ struct qed_chain_pbl_u32 {
 	u32 cons_page_idx;
 };
 
-struct qed_chain_pbl {
-	/* Base address of a pre-allocated buffer for pbl */
-	dma_addr_t	p_phys_table;
-	void		*p_virt_table;
-
-	/* Table for keeping the virtual addresses of the chain pages,
-	 * respectively to the physical addresses in the pbl table.
-	 */
-	void **pp_virt_addr_tbl;
-
-	/* Index to current used page by producer/consumer */
-	union {
-		struct qed_chain_pbl_u16 pbl16;
-		struct qed_chain_pbl_u32 pbl32;
-	} u;
-};
-
 struct qed_chain_u16 {
 	/* Cyclic index of next element to produce/consme */
 	u16 prod_idx;
@@ -86,46 +69,78 @@ struct qed_chain_u32 {
 };
 
 struct qed_chain {
-	void			*p_virt_addr;
-	dma_addr_t		p_phys_addr;
-	void			*p_prod_elem;
-	void			*p_cons_elem;
+	/* fastpath portion of the chain - required for commands such
+	 * as produce / consume.
+	 */
+	/* Point to next element to produce/consume */
+	void *p_prod_elem;
+	void *p_cons_elem;
+
+	/* Fastpath portions of the PBL [if exists] */
+	struct {
+		/* Table for keeping the virtual addresses of the chain pages,
+		 * respectively to the physical addresses in the pbl table.
+		 */
+		void **pp_virt_addr_tbl;
 
-	enum qed_chain_mode	mode;
-	enum qed_chain_use_mode intended_use; /* used to produce/consume */
-	enum qed_chain_cnt_type cnt_type;
+		union {
+			struct qed_chain_pbl_u16 u16;
+			struct qed_chain_pbl_u32 u32;
+		} c;
+	} pbl;
 
 	union {
 		struct qed_chain_u16 chain16;
 		struct qed_chain_u32 chain32;
 	} u;
 
+	/* Capacity counts only usable elements */
+	u32 capacity;
 	u32 page_cnt;
 
-	/* Number of elements - capacity is for usable elements only,
-	 * while size will contain total number of elements [for entire chain].
+	enum qed_chain_mode mode;
+
+	/* Elements information for fast calculations */
+	u16 elem_per_page;
+	u16 elem_per_page_mask;
+	u16 elem_size;
+	u16 next_page_mask;
+	u16 usable_per_page;
+	u8 elem_unusable;
+
+	u8 cnt_type;
+
+	/* Slowpath of the chain - required for initialization and destruction,
+	 * but isn't involved in regular functionality.
 	 */
-	u32 capacity;
+
+	/* Base address of a pre-allocated buffer for pbl */
+	struct {
+		dma_addr_t p_phys_table;
+		void *p_virt_table;
+	} pbl_sp;
+
+	/* Address of first page of the chain - the address is required
+	 * for fastpath operation [consume/produce] but only for the the SINGLE
+	 * flavour which isn't considered fastpath [== SPQ].
+	 */
+	void *p_virt_addr;
+	dma_addr_t p_phys_addr;
+
+	/* Total number of elements [for entire chain] */
 	u32 size;
 
-	/* Elements information for fast calculations */
-	u16			elem_per_page;
-	u16			elem_per_page_mask;
-	u16			elem_unusable;
-	u16			usable_per_page;
-	u16			elem_size;
-	u16			next_page_mask;
-	struct qed_chain_pbl	pbl;
+	u8 intended_use;
 };
 
 #define QED_CHAIN_PBL_ENTRY_SIZE        (8)
 #define QED_CHAIN_PAGE_SIZE             (0x1000)
 #define ELEMS_PER_PAGE(elem_size)       (QED_CHAIN_PAGE_SIZE / (elem_size))
 
-#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)     \
-	((mode == QED_CHAIN_MODE_NEXT_PTR) ?	     \
-	 (1 + ((sizeof(struct qed_chain_next) - 1) / \
-	       (elem_size))) : 0)
+#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)	 \
+	(((mode) == QED_CHAIN_MODE_NEXT_PTR) ?		 \
+	 (u8)(1 + ((sizeof(struct qed_chain_next) - 1) / \
+		   (elem_size))) : 0)
 
 #define USABLE_ELEMS_PER_PAGE(elem_size, mode) \
 	((u32)(ELEMS_PER_PAGE(elem_size) -     \
@@ -186,7 +201,7 @@ static inline u16 qed_chain_get_usable_per_page(struct qed_chain *p_chain)
 	return p_chain->usable_per_page;
 }
 
-static inline u16 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
+static inline u8 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
 {
 	return p_chain->elem_unusable;
 }
@@ -198,7 +213,7 @@ static inline u32 qed_chain_get_page_cnt(struct qed_chain *p_chain)
 
 static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 {
-	return p_chain->pbl.p_phys_table;
+	return p_chain->pbl_sp.p_phys_table;
 }
 
 /**
@@ -214,10 +229,10 @@ static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 static inline void
 qed_chain_advance_page(struct qed_chain *p_chain,
 		       void **p_next_elem, void *idx_to_inc, void *page_to_inc)
-
 {
 	struct qed_chain_next *p_next = NULL;
 	u32 page_index = 0;
+
 	switch (p_chain->mode) {
 	case QED_CHAIN_MODE_NEXT_PTR:
 		p_next = *p_next_elem;
@@ -305,7 +320,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain16.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl16.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -314,7 +329,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain32.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl32.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -378,7 +393,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain16.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl16.cons_page_idx;
+			p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
@@ -387,8 +402,8 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain32.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl32.cons_page_idx;
-		qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
+			p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx;
+			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
 		p_chain->u.chain32.cons_idx++;
@@ -429,25 +444,26 @@ static inline void qed_chain_reset(struct qed_chain *p_chain)
 		u32 reset_val = p_chain->page_cnt - 1;
 
 		if (is_chain_u16(p_chain)) {
-			p_chain->pbl.u.pbl16.prod_page_idx = (u16)reset_val;
-			p_chain->pbl.u.pbl16.cons_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val;
 		} else {
-			p_chain->pbl.u.pbl32.prod_page_idx = reset_val;
-			p_chain->pbl.u.pbl32.cons_page_idx = reset_val;
+			p_chain->pbl.c.u32.prod_page_idx = reset_val;
+			p_chain->pbl.c.u32.cons_page_idx = reset_val;
 		}
 	}
 
 	switch (p_chain->intended_use) {
-	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
-	case QED_CHAIN_USE_TO_PRODUCE:
-		/* Do nothing */
-		break;
-
 	case QED_CHAIN_USE_TO_CONSUME:
 		/* produce empty elements */
 		for (i = 0; i < p_chain->capacity; i++)
 			qed_chain_recycle_consumed(p_chain);
 		break;
+
+	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
+	case QED_CHAIN_USE_TO_PRODUCE:
+	default:
+		/* Do nothing */
+		break;
 	}
 }
 
@@ -473,13 +489,13 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->p_virt_addr = NULL;
 	p_chain->p_phys_addr = 0;
 	p_chain->elem_size	= elem_size;
-	p_chain->intended_use = intended_use;
+	p_chain->intended_use = (u8)intended_use;
 	p_chain->mode		= mode;
-	p_chain->cnt_type = cnt_type;
+	p_chain->cnt_type = (u8)cnt_type;
 
-	p_chain->elem_per_page		= ELEMS_PER_PAGE(elem_size);
+	p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size);
 	p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode);
-	p_chain->elem_per_page_mask	= p_chain->elem_per_page - 1;
+	p_chain->elem_per_page_mask = p_chain->elem_per_page - 1;
 	p_chain->elem_unusable = UNUSABLE_ELEMS_PER_PAGE(elem_size, mode);
 	p_chain->next_page_mask = (p_chain->usable_per_page &
 				   p_chain->elem_per_page_mask);
@@ -488,8 +504,8 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->capacity = p_chain->usable_per_page * page_cnt;
 	p_chain->size = p_chain->elem_per_page * page_cnt;
 
-	p_chain->pbl.p_phys_table = 0;
-	p_chain->pbl.p_virt_table = NULL;
+	p_chain->pbl_sp.p_phys_table = 0;
+	p_chain->pbl_sp.p_virt_table = NULL;
 	p_chain->pbl.pp_virt_addr_tbl = NULL;
 }
 
@@ -530,8 +546,8 @@ static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain,
 					  dma_addr_t p_phys_pbl,
 					  void **pp_virt_addr_tbl)
 {
-	p_chain->pbl.p_phys_table = p_phys_pbl;
-	p_chain->pbl.p_virt_table = p_virt_pbl;
+	p_chain->pbl_sp.p_phys_table = p_phys_pbl;
+	p_chain->pbl_sp.p_virt_table = p_virt_pbl;
 	p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl;
 }
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 03/11] qede: Remove 'num_tc'.
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver currently doesn't support multi-CoS, but it contains logic
where multiple transmission queues could be theoretically manipulated.
No point in maintaining the infrastructure at the moment.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |  17 +--
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  40 +++----
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 132 ++++++++----------------
 3 files changed, 62 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 1d4c7e0..b9584b2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -127,10 +127,9 @@ struct qede_dev {
 
 	const struct qed_eth_ops	*ops;
 
-	struct qed_dev_eth_info	dev_info;
+	struct qed_dev_eth_info dev_info;
 #define QEDE_MAX_RSS_CNT(edev)	((edev)->dev_info.num_queues)
-#define QEDE_MAX_TSS_CNT(edev)	((edev)->dev_info.num_queues * \
-				 (edev)->dev_info.num_tc)
+#define QEDE_MAX_TSS_CNT(edev)	((edev)->dev_info.num_queues)
 
 	struct qede_fastpath		*fp_array;
 	u8				req_num_tx;
@@ -139,17 +138,9 @@ struct qede_dev {
 	u8				fp_num_rx;
 	u16				req_queues;
 	u16				num_queues;
-	u8				num_tc;
 #define QEDE_QUEUE_CNT(edev)	((edev)->num_queues)
 #define QEDE_RSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_tx)
-#define QEDE_TSS_COUNT(edev)	(((edev)->num_queues - (edev)->fp_num_rx) * \
-				 (edev)->num_tc)
-#define QEDE_TX_IDX(edev, txqidx)	((edev)->fp_num_rx + (txqidx) % \
-					 QEDE_TSS_COUNT(edev))
-#define QEDE_TC_IDX(edev, txqidx)	((txqidx) / QEDE_TSS_COUNT(edev))
-#define QEDE_TX_QUEUE(edev, txqidx)	\
-	(&(edev)->fp_array[QEDE_TX_IDX((edev), (txqidx))].txqs[QEDE_TC_IDX(\
-							(edev), (txqidx))])
+#define QEDE_TSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_rx)
 
 	struct qed_int_info		int_info;
 	unsigned char			primary_mac[ETH_ALEN];
@@ -324,7 +315,7 @@ struct qede_fastpath {
 	struct napi_struct	napi;
 	struct qed_sb_info	*sb_info;
 	struct qede_rx_queue	*rxq;
-	struct qede_tx_queue	*txqs;
+	struct qede_tx_queue	*txq;
 
 #define VEC_NAME_SIZE	(sizeof(((struct net_device *)0)->name) + 8)
 	char	name[VEC_NAME_SIZE];
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 8a3debe..a892843 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -59,8 +59,8 @@
 	QEDE_TQSTAT(stopped_cnt),
 };
 
-#define QEDE_TQSTATS_DATA(dev, sindex, tssid, tcid) \
-	(*((u64 *)(((void *)(&dev->fp_array[tssid].txqs[tcid])) +\
+#define QEDE_TQSTATS_DATA(dev, sindex, tssid) \
+	(*((u64 *)(((void *)((dev)->fp_array[tssid].txq)) + \
 		   qede_tqstats_arr[(sindex)].offset)))
 
 static const struct {
@@ -175,7 +175,6 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 	int i, j, k;
 
 	for (i = 0, k = 0; i < QEDE_QUEUE_CNT(edev); i++) {
-		int tc;
 
 		if (edev->fp_array[i].type & QEDE_FASTPATH_RX) {
 			for (j = 0; j < QEDE_NUM_RQSTATS; j++)
@@ -186,14 +185,12 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 		}
 
 		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				for (j = 0; j < QEDE_NUM_TQSTATS; j++)
-					sprintf(buf + (k + j) *
-						ETH_GSTRING_LEN,
-						"%d.%d: %s", i, tc,
-						qede_tqstats_arr[j].string);
-				k += QEDE_NUM_TQSTATS;
-			}
+			for (j = 0; j < QEDE_NUM_TQSTATS; j++)
+				sprintf(buf + (k + j) *
+					ETH_GSTRING_LEN,
+					"%d: %s", i,
+					qede_tqstats_arr[j].string);
+			k += QEDE_NUM_TQSTATS;
 		}
 	}
 
@@ -240,21 +237,16 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 	mutex_lock(&edev->qede_lock);
 
 	for (qid = 0; qid < QEDE_QUEUE_CNT(edev); qid++) {
-		int tc;
 
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_RX) {
+		if (edev->fp_array[qid].type & QEDE_FASTPATH_RX)
 			for (sidx = 0; sidx < QEDE_NUM_RQSTATS; sidx++)
 				buf[cnt++] = QEDE_RQSTATS_DATA(edev, sidx, qid);
-		}
 
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				for (sidx = 0; sidx < QEDE_NUM_TQSTATS; sidx++)
-					buf[cnt++] = QEDE_TQSTATS_DATA(edev,
-								       sidx,
-								       qid, tc);
-			}
-		}
+		if (edev->fp_array[qid].type & QEDE_FASTPATH_TX)
+			for (sidx = 0; sidx < QEDE_NUM_TQSTATS; sidx++)
+				buf[cnt++] = QEDE_TQSTATS_DATA(edev,
+							       sidx,
+							       qid);
 	}
 
 	for (sidx = 0; sidx < QEDE_NUM_STATS; sidx++) {
@@ -281,7 +273,7 @@ static int qede_get_sset_count(struct net_device *dev, int stringset)
 					num_stats--;
 		}
 		return num_stats + QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS +
-		       QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS * edev->num_tc;
+		       QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
 	case ETH_SS_PRIV_FLAGS:
 		return QEDE_PRI_FLAG_LEN;
 	case ETH_SS_TEST:
@@ -1178,7 +1170,7 @@ static int qede_selftest_transmit_traffic(struct qede_dev *edev,
 
 	for_each_queue(i) {
 		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			txq = edev->fp_array[i].txqs;
+			txq = edev->fp_array[i].txq;
 			break;
 		}
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 653be22..2006dd4 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -547,7 +547,7 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 	/* Get tx-queue context and netdev index */
 	txq_index = skb_get_queue_mapping(skb);
 	WARN_ON(txq_index >= QEDE_TSS_COUNT(edev));
-	txq = QEDE_TX_QUEUE(edev, txq_index);
+	txq = edev->fp_array[edev->fp_num_rx + txq_index].txq;
 	netdev_txq = netdev_get_tx_queue(ndev, txq_index);
 
 	WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1));
@@ -881,16 +881,6 @@ bool qede_has_rx_work(struct qede_rx_queue *rxq)
 	return hw_comp_cons != sw_comp_cons;
 }
 
-static bool qede_has_tx_work(struct qede_fastpath *fp)
-{
-	u8 tc;
-
-	for (tc = 0; tc < fp->edev->num_tc; tc++)
-		if (qede_txq_has_work(&fp->txqs[tc]))
-			return true;
-	return false;
-}
-
 static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq)
 {
 	qed_chain_consume(&rxq->rx_bd_ring);
@@ -1633,12 +1623,9 @@ static int qede_poll(struct napi_struct *napi, int budget)
 						napi);
 	struct qede_dev *edev = fp->edev;
 	int rx_work_done = 0;
-	u8 tc;
 
-	for (tc = 0; tc < edev->num_tc; tc++)
-		if (likely(fp->type & QEDE_FASTPATH_TX) &&
-		    qede_txq_has_work(&fp->txqs[tc]))
-			qede_tx_int(edev, &fp->txqs[tc]);
+	if (likely(fp->type & QEDE_FASTPATH_TX) && qede_txq_has_work(fp->txq))
+		qede_tx_int(edev, fp->txq);
 
 	rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) &&
 			qede_has_rx_work(fp->rxq)) ?
@@ -1664,7 +1651,7 @@ static int qede_poll(struct napi_struct *napi, int budget)
 		if (!((likely(fp->type & QEDE_FASTPATH_RX) &&
 		       qede_has_rx_work(fp->rxq)) ||
 		      (likely(fp->type & QEDE_FASTPATH_TX) &&
-		       qede_has_tx_work(fp)))) {
+		       qede_txq_has_work(fp->txq)))) {
 			napi_complete(napi);
 
 			/* Update and reenable interrupts */
@@ -2330,8 +2317,6 @@ static struct qede_dev *qede_alloc_etherdev(struct qed_dev *cdev,
 	memset(&edev->stats, 0, sizeof(edev->stats));
 	memcpy(&edev->dev_info, info, sizeof(*info));
 
-	edev->num_tc = edev->dev_info.num_tc;
-
 	INIT_LIST_HEAD(&edev->vlan_list);
 
 	return edev;
@@ -2429,7 +2414,7 @@ static void qede_free_fp_array(struct qede_dev *edev)
 
 			kfree(fp->sb_info);
 			kfree(fp->rxq);
-			kfree(fp->txqs);
+			kfree(fp->txq);
 		}
 		kfree(edev->fp_array);
 	}
@@ -2462,7 +2447,7 @@ static int qede_alloc_fp_array(struct qede_dev *edev)
 	for_each_queue(i) {
 		fp = &edev->fp_array[i];
 
-		fp->sb_info = kcalloc(1, sizeof(*fp->sb_info), GFP_KERNEL);
+		fp->sb_info = kzalloc(sizeof(*fp->sb_info), GFP_KERNEL);
 		if (!fp->sb_info) {
 			DP_NOTICE(edev, "sb info struct allocation failed\n");
 			goto err;
@@ -2479,22 +2464,15 @@ static int qede_alloc_fp_array(struct qede_dev *edev)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			fp->txqs = kcalloc(edev->num_tc, sizeof(*fp->txqs),
-					   GFP_KERNEL);
-			if (!fp->txqs) {
-				DP_NOTICE(edev,
-					  "TXQ array allocation failed\n");
+			fp->txq = kzalloc(sizeof(*fp->txq), GFP_KERNEL);
+			if (!fp->txq)
 				goto err;
-			}
 		}
 
 		if (fp->type & QEDE_FASTPATH_RX) {
-			fp->rxq = kcalloc(1, sizeof(*fp->rxq), GFP_KERNEL);
-			if (!fp->rxq) {
-				DP_NOTICE(edev,
-					  "RXQ struct allocation failed\n");
+			fp->rxq = kzalloc(sizeof(*fp->rxq), GFP_KERNEL);
+			if (!fp->rxq)
 				goto err;
-			}
 		}
 	}
 
@@ -3031,16 +3009,13 @@ static int qede_alloc_mem_txq(struct qede_dev *edev, struct qede_tx_queue *txq)
 /* This function frees all memory of a single fp */
 static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 {
-	int tc;
-
 	qede_free_mem_sb(edev, fp->sb_info);
 
 	if (fp->type & QEDE_FASTPATH_RX)
 		qede_free_mem_rxq(edev, fp->rxq);
 
 	if (fp->type & QEDE_FASTPATH_TX)
-		for (tc = 0; tc < edev->num_tc; tc++)
-			qede_free_mem_txq(edev, &fp->txqs[tc]);
+		qede_free_mem_txq(edev, fp->txq);
 }
 
 /* This function allocates all memory needed for a single fp (i.e. an entity
@@ -3048,7 +3023,7 @@ static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
  */
 static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 {
-	int rc, tc;
+	int rc;
 
 	rc = qede_alloc_mem_sb(edev, fp->sb_info, fp->id);
 	if (rc)
@@ -3061,11 +3036,9 @@ static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 	}
 
 	if (fp->type & QEDE_FASTPATH_TX) {
-		for (tc = 0; tc < edev->num_tc; tc++) {
-			rc = qede_alloc_mem_txq(edev, &fp->txqs[tc]);
-			if (rc)
-				goto err;
-		}
+		rc = qede_alloc_mem_txq(edev, fp->txq);
+		if (rc)
+			goto err;
 	}
 
 	return 0;
@@ -3108,7 +3081,7 @@ static int qede_alloc_mem_load(struct qede_dev *edev)
 /* This function inits fp content and resets the SB, RXQ and TXQ structures */
 static void qede_init_fp(struct qede_dev *edev)
 {
-	int queue_id, rxq_index = 0, txq_index = 0, tc;
+	int queue_id, rxq_index = 0, txq_index = 0;
 	struct qede_fastpath *fp;
 
 	for_each_queue(queue_id) {
@@ -3117,25 +3090,15 @@ static void qede_init_fp(struct qede_dev *edev)
 		fp->edev = edev;
 		fp->id = queue_id;
 
-		memset((void *)&fp->napi, 0, sizeof(fp->napi));
-
-		memset((void *)fp->sb_info, 0, sizeof(*fp->sb_info));
 
 		if (fp->type & QEDE_FASTPATH_RX) {
-			memset((void *)fp->rxq, 0, sizeof(*fp->rxq));
 			fp->rxq->rxq_id = rxq_index++;
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			memset((void *)fp->txqs, 0,
-			       (edev->num_tc * sizeof(*fp->txqs)));
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				fp->txqs[tc].index = txq_index +
-				    tc * QEDE_TSS_COUNT(edev);
-				if (edev->dev_info.is_legacy)
-					fp->txqs[tc].is_legacy = true;
-			}
-			txq_index++;
+			fp->txq->index = txq_index++;
+			if (edev->dev_info.is_legacy)
+				fp->txq->is_legacy = 1;
 		}
 
 		snprintf(fp->name, sizeof(fp->name), "%s-fp-%d",
@@ -3307,7 +3270,8 @@ static int qede_stop_queues(struct qede_dev *edev)
 {
 	struct qed_update_vport_params vport_update_params;
 	struct qed_dev *cdev = edev->cdev;
-	int rc, tc, i;
+	struct qede_fastpath *fp;
+	int rc, i;
 
 	/* Disable the vport */
 	memset(&vport_update_params, 0, sizeof(vport_update_params));
@@ -3324,16 +3288,12 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 	/* Flush Tx queues. If needed, request drain from MCP */
 	for_each_queue(i) {
-		struct qede_fastpath *fp = &edev->fp_array[i];
+		fp = &edev->fp_array[i];
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				struct qede_tx_queue *txq = &fp->txqs[tc];
-
-				rc = qede_drain_txq(edev, txq, true);
-				if (rc)
-					return rc;
-			}
+			rc = qede_drain_txq(edev, fp->txq, true);
+			if (rc)
+				return rc;
 		}
 	}
 
@@ -3341,29 +3301,24 @@ static int qede_stop_queues(struct qede_dev *edev)
 	for (i = QEDE_QUEUE_CNT(edev) - 1; i >= 0; i--) {
 		struct qed_stop_rxq_params rx_params;
 
+		fp = &edev->fp_array[i];
+
 		/* Stop the Tx Queue(s) */
-		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				struct qed_stop_txq_params tx_params;
-				u8 val;
-
-				tx_params.rss_id = i;
-				val = edev->fp_array[i].txqs[tc].index;
-				tx_params.tx_queue_id = val;
+		if (fp->type & QEDE_FASTPATH_TX) {
+			struct qed_stop_txq_params tx_params;
+
+			tx_params.rss_id = i;
+			tx_params.tx_queue_id = fp->txq->index;
 				rc = edev->ops->q_tx_stop(cdev, &tx_params);
-				if (rc) {
-					DP_ERR(edev, "Failed to stop TXQ #%d\n",
-					       tx_params.tx_queue_id);
+				if (rc)
 					return rc;
-				}
-			}
 		}
 
 		/* Stop the Rx Queue */
-		if (edev->fp_array[i].type & QEDE_FASTPATH_RX) {
+		if (fp->type & QEDE_FASTPATH_RX) {
 			memset(&rx_params, 0, sizeof(rx_params));
 			rx_params.rss_id = i;
-			rx_params.rx_queue_id = edev->fp_array[i].rxq->rxq_id;
+			rx_params.rx_queue_id = fp->rxq->rxq_id;
 
 			rc = edev->ops->q_rx_stop(cdev, &rx_params);
 			if (rc) {
@@ -3383,7 +3338,6 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 {
-	int rc, tc, i;
 	int vlan_removal_en = 1;
 	struct qed_dev *cdev = edev->cdev;
 	struct qed_update_vport_params vport_update_params;
@@ -3391,6 +3345,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 	struct qed_dev_info *qed_info = &edev->dev_info.common;
 	struct qed_start_vport_params start = {0};
 	bool reset_rss_indir = false;
+	int rc, i;
 
 	if (!edev->num_queues) {
 		DP_ERR(edev,
@@ -3454,11 +3409,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			qede_update_rx_prod(edev, rxq);
 		}
 
-		if (!(fp->type & QEDE_FASTPATH_TX))
-			continue;
-
-		for (tc = 0; tc < edev->num_tc; tc++) {
-			struct qede_tx_queue *txq = &fp->txqs[tc];
+		if (fp->type & QEDE_FASTPATH_TX) {
+			struct qede_tx_queue *txq = fp->txq;
 
 			p_phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
 			page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
@@ -3468,7 +3420,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			q_params.queue_id = txq->index;
 			q_params.vport_id = 0;
 			q_params.sb = fp->sb_info->igu_sb_id;
-			q_params.sb_idx = TX_PI(tc);
+			q_params.sb_idx = TX_PI(0);
 
 			rc = edev->ops->q_tx_start(cdev, &q_params,
 						   p_phys_table, page_cnt,
@@ -3480,7 +3432,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			}
 
 			txq->hw_cons_ptr =
-				&fp->sb_info->sb_virt->pi_array[TX_PI(tc)];
+				&fp->sb_info->sb_virt->pi_array[TX_PI(0)];
 			SET_FIELD(txq->tx_db.data.params,
 				  ETH_DB_DATA_DEST, DB_DEST_XCM);
 			SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD,
@@ -3654,8 +3606,8 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	rc = qede_alloc_mem_load(edev);
 	if (rc)
 		goto err1;
-	DP_INFO(edev, "Allocated %d RSS queues on %d TC/s\n",
-		QEDE_QUEUE_CNT(edev), edev->num_tc);
+	DP_INFO(edev, "Allocated %d Rx, %d Tx queues\n",
+		QEDE_RSS_COUNT(edev), QEDE_TSS_COUNT(edev));
 
 	rc = qede_set_real_num_queues(edev);
 	if (rc)
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 05/11] qede: Refactor data-path Rx flow
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver's NAPI poll is using a long sequence for processing ingress
packets, and it's going to get even longer once we do XDP.
Break down the main loop into a series of sub-functions to allow
better readability of the function.

While we're at it, correct the accounting of the NAPI budget -
currently we're counting only packets passed to the stack against
the budget, even in case those are actually aggregations.
After refactoring every CQE processed would be counted against the budget.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 488 +++++++++++++++------------
 1 file changed, 264 insertions(+), 224 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 2006dd4..acc350b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1021,6 +1021,7 @@ static inline void qede_skb_receive(struct qede_dev *edev,
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
 
 	napi_gro_receive(&fp->napi, skb);
+	fp->rxq->rcv_pkts++;
 }
 
 static void qede_set_gro_params(struct qede_dev *edev,
@@ -1383,238 +1384,298 @@ static bool qede_pkt_is_ip_fragmented(struct eth_fast_path_rx_reg_cqe *cqe,
 	return false;
 }
 
-static int qede_rx_int(struct qede_fastpath *fp, int budget)
+static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev,
+					    struct qede_rx_queue *rxq,
+					    struct sw_rx_data *bd, u16 len,
+					    u16 pad)
 {
-	struct qede_dev *edev = fp->edev;
-	struct qede_rx_queue *rxq = fp->rxq;
-
-	u16 hw_comp_cons, sw_comp_cons, sw_rx_index, parse_flag;
-	int rx_pkt = 0;
-	u8 csum_flag;
+	unsigned int offset = bd->page_offset;
+	struct skb_frag_struct *frag;
+	struct page *page = bd->data;
+	unsigned int pull_len;
+	struct sk_buff *skb;
+	unsigned char *va;
 
-	hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr);
-	sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
+	/* Allocate a new SKB with a sufficient large header len */
+	skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
+	if (unlikely(!skb))
+		return NULL;
 
-	/* Memory barrier to prevent the CPU from doing speculative reads of CQE
-	 * / BD in the while-loop before reading hw_comp_cons. If the CQE is
-	 * read before it is written by FW, then FW writes CQE and SB, and then
-	 * the CPU reads the hw_comp_cons, it will use an old CQE.
+	/* Copy data into SKB - if it's small, we can simply copy it and
+	 * re-use the already allcoated & mapped memory.
 	 */
-	rmb();
+	if (len + pad <= edev->rx_copybreak) {
+		memcpy(skb_put(skb, len),
+		       page_address(page) + pad + offset, len);
+		qede_reuse_page(edev, rxq, bd);
+		goto out;
+	}
 
-	/* Loop to complete all indicated BDs */
-	while (sw_comp_cons != hw_comp_cons) {
-		struct eth_fast_path_rx_reg_cqe *fp_cqe;
-		enum pkt_hash_types rxhash_type;
-		enum eth_rx_cqe_type cqe_type;
-		struct sw_rx_data *sw_rx_data;
-		union eth_rx_cqe *cqe;
-		struct sk_buff *skb;
-		struct page *data;
-		__le16 flags;
-		u16 len, pad;
-		u32 rx_hash;
-
-		/* Get the CQE from the completion ring */
-		cqe = (union eth_rx_cqe *)
-			qed_chain_consume(&rxq->rx_comp_ring);
-		cqe_type = cqe->fast_path_regular.type;
-
-		if (unlikely(cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH)) {
-			edev->ops->eth_cqe_completion(
-					edev->cdev, fp->id,
-					(struct eth_slow_path_rx_cqe *)cqe);
-			goto next_cqe;
-		}
+	frag = &skb_shinfo(skb)->frags[0];
 
-		if (cqe_type != ETH_RX_CQE_TYPE_REGULAR) {
-			switch (cqe_type) {
-			case ETH_RX_CQE_TYPE_TPA_START:
-				qede_tpa_start(edev, rxq,
-					       &cqe->fast_path_tpa_start);
-				goto next_cqe;
-			case ETH_RX_CQE_TYPE_TPA_CONT:
-				qede_tpa_cont(edev, rxq,
-					      &cqe->fast_path_tpa_cont);
-				goto next_cqe;
-			case ETH_RX_CQE_TYPE_TPA_END:
-				qede_tpa_end(edev, fp,
-					     &cqe->fast_path_tpa_end);
-				goto next_rx_only;
-			default:
-				break;
-			}
-		}
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+			page, pad + offset, len, rxq->rx_buf_seg_size);
 
-		/* Get the data from the SW ring */
-		sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
-		sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
-		data = sw_rx_data->data;
-
-		fp_cqe = &cqe->fast_path_regular;
-		len =  le16_to_cpu(fp_cqe->len_on_first_bd);
-		pad = fp_cqe->placement_offset;
-		flags = cqe->fast_path_regular.pars_flags.flags;
-
-		/* If this is an error packet then drop it */
-		parse_flag = le16_to_cpu(flags);
-
-		csum_flag = qede_check_csum(parse_flag);
-		if (unlikely(csum_flag == QEDE_CSUM_ERROR)) {
-			if (qede_pkt_is_ip_fragmented(&cqe->fast_path_regular,
-						      parse_flag)) {
-				rxq->rx_ip_frags++;
-				goto alloc_skb;
-			}
+	va = skb_frag_address(frag);
+	pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
 
-			DP_NOTICE(edev,
-				  "CQE in CONS = %u has error, flags = %x, dropping incoming packet\n",
-				  sw_comp_cons, parse_flag);
-			rxq->rx_hw_errors++;
-			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
-			goto next_cqe;
-		}
+	/* Align the pull_len to optimize memcpy */
+	memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
 
-alloc_skb:
-		skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
-		if (unlikely(!skb)) {
-			DP_NOTICE(edev,
-				  "skb allocation failed, dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
-			rxq->rx_alloc_errors++;
-			goto next_cqe;
+	/* Correct the skb & frag sizes offset after the pull */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+
+	if (unlikely(qede_realloc_rx_buffer(edev, rxq, bd))) {
+		/* Incr page ref count to reuse on allocation failure so
+		 * that it doesn't get freed while freeing SKB [as its
+		 * already mapped there].
+		 */
+		page_ref_inc(page);
+		dev_kfree_skb_any(skb);
+		return NULL;
+	}
+
+out:
+	/* We've consumed the first BD and prepared an SKB */
+	qede_rx_bd_ring_consume(rxq);
+	return skb;
+}
+
+static int qede_rx_build_jumbo(struct qede_dev *edev,
+			       struct qede_rx_queue *rxq,
+			       struct sk_buff *skb,
+			       struct eth_fast_path_rx_reg_cqe *cqe,
+			       u16 first_bd_len)
+{
+	u16 pkt_len = le16_to_cpu(cqe->pkt_len);
+	struct sw_rx_data *bd;
+	u16 bd_cons_idx;
+	u8 num_frags;
+
+	pkt_len -= first_bd_len;
+
+	/* We've already used one BD for the SKB. Now take care of the rest */
+	for (num_frags = cqe->bd_num - 1; num_frags > 0; num_frags--) {
+		u16 cur_size = pkt_len > rxq->rx_buf_size ? rxq->rx_buf_size :
+		    pkt_len;
+
+		if (unlikely(!cur_size)) {
+			DP_ERR(edev,
+			       "Still got %d BDs for mapping jumbo, but length became 0\n",
+			       num_frags);
+			goto out;
 		}
 
-		/* Copy data into SKB */
-		if (len + pad <= edev->rx_copybreak) {
-			memcpy(skb_put(skb, len),
-			       page_address(data) + pad +
-				sw_rx_data->page_offset, len);
-			qede_reuse_page(edev, rxq, sw_rx_data);
+		/* We need a replacement buffer for each BD */
+		if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+			goto out;
+
+		/* Now that we've allocated the replacement buffer,
+		 * we can safely consume the next BD and map it to the SKB.
+		 */
+		bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
+		bd = &rxq->sw_rx_ring[bd_cons_idx];
+		qede_rx_bd_ring_consume(rxq);
+
+		dma_unmap_page(&edev->pdev->dev, bd->mapping,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++,
+				   bd->data, 0, cur_size);
+
+		skb->truesize += PAGE_SIZE;
+		skb->data_len += cur_size;
+		skb->len += cur_size;
+		pkt_len -= cur_size;
+	}
+
+	if (unlikely(pkt_len))
+		DP_ERR(edev,
+		       "Mapped all BDs of jumbo, but still have %d bytes\n",
+		       pkt_len);
+
+out:
+	return num_frags;
+}
+
+static int qede_rx_process_tpa_cqe(struct qede_dev *edev,
+				   struct qede_fastpath *fp,
+				   struct qede_rx_queue *rxq,
+				   union eth_rx_cqe *cqe,
+				   enum eth_rx_cqe_type type)
+{
+	switch (type) {
+	case ETH_RX_CQE_TYPE_TPA_START:
+		qede_tpa_start(edev, rxq, &cqe->fast_path_tpa_start);
+		return 0;
+	case ETH_RX_CQE_TYPE_TPA_CONT:
+		qede_tpa_cont(edev, rxq, &cqe->fast_path_tpa_cont);
+		return 0;
+	case ETH_RX_CQE_TYPE_TPA_END:
+		qede_tpa_end(edev, fp, &cqe->fast_path_tpa_end);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int qede_rx_process_cqe(struct qede_dev *edev,
+			       struct qede_fastpath *fp,
+			       struct qede_rx_queue *rxq)
+{
+	struct eth_fast_path_rx_reg_cqe *fp_cqe;
+	u16 len, pad, bd_cons_idx, parse_flag;
+	enum pkt_hash_types rxhash_type;
+	enum eth_rx_cqe_type cqe_type;
+	union eth_rx_cqe *cqe;
+	struct sw_rx_data *bd;
+	struct sk_buff *skb;
+	__le16 flags;
+	u8 csum_flag;
+	u32 rx_hash;
+
+	/* Get the CQE from the completion ring */
+	cqe = (union eth_rx_cqe *)qed_chain_consume(&rxq->rx_comp_ring);
+	cqe_type = cqe->fast_path_regular.type;
+
+	/* Process an unlikely slowpath event */
+	if (unlikely(cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH)) {
+		struct eth_slow_path_rx_cqe *sp_cqe;
+
+		sp_cqe = (struct eth_slow_path_rx_cqe *)cqe;
+		edev->ops->eth_cqe_completion(edev->cdev, fp->id, sp_cqe);
+		return 0;
+	}
+
+	/* Handle TPA cqes */
+	if (cqe_type != ETH_RX_CQE_TYPE_REGULAR)
+		return qede_rx_process_tpa_cqe(edev, fp, rxq, cqe, cqe_type);
+
+	/* Get the data from the SW ring; Consume it only after it's evident
+	 * we wouldn't recycle it.
+	 */
+	bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
+	bd = &rxq->sw_rx_ring[bd_cons_idx];
+
+	fp_cqe = &cqe->fast_path_regular;
+	len = le16_to_cpu(fp_cqe->len_on_first_bd);
+	pad = fp_cqe->placement_offset;
+
+	/* If this is an error packet then drop it */
+	flags = cqe->fast_path_regular.pars_flags.flags;
+	parse_flag = le16_to_cpu(flags);
+
+	csum_flag = qede_check_csum(parse_flag);
+	if (unlikely(csum_flag == QEDE_CSUM_ERROR)) {
+		if (qede_pkt_is_ip_fragmented(fp_cqe, parse_flag)) {
+			rxq->rx_ip_frags++;
 		} else {
-			struct skb_frag_struct *frag;
-			unsigned int pull_len;
-			unsigned char *va;
-
-			frag = &skb_shinfo(skb)->frags[0];
-
-			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, data,
-					pad + sw_rx_data->page_offset,
-					len, rxq->rx_buf_seg_size);
-
-			va = skb_frag_address(frag);
-			pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
-
-			/* Align the pull_len to optimize memcpy */
-			memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
-
-			skb_frag_size_sub(frag, pull_len);
-			frag->page_offset += pull_len;
-			skb->data_len -= pull_len;
-			skb->tail += pull_len;
-
-			if (unlikely(qede_realloc_rx_buffer(edev, rxq,
-							    sw_rx_data))) {
-				DP_ERR(edev, "Failed to allocate rx buffer\n");
-				/* Incr page ref count to reuse on allocation
-				 * failure so that it doesn't get freed while
-				 * freeing SKB.
-				 */
-
-				page_ref_inc(sw_rx_data->data);
-				rxq->rx_alloc_errors++;
-				qede_recycle_rx_bd_ring(rxq, edev,
-							fp_cqe->bd_num);
-				dev_kfree_skb_any(skb);
-				goto next_cqe;
-			}
+			DP_NOTICE(edev,
+				  "CQE has error, flags = %x, dropping incoming packet\n",
+				  parse_flag);
+			rxq->rx_hw_errors++;
+			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+			return 0;
 		}
+	}
 
-		qede_rx_bd_ring_consume(rxq);
+	/* Basic validation passed; Need to prepare an SKB. This would also
+	 * guarantee to finally consume the first BD upon success.
+	 */
+	skb = qede_rx_allocate_skb(edev, rxq, bd, len, pad);
+	if (!skb) {
+		rxq->rx_alloc_errors++;
+		qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+		return 0;
+	}
 
-		if (fp_cqe->bd_num != 1) {
-			u16 pkt_len = le16_to_cpu(fp_cqe->pkt_len);
-			u8 num_frags;
-
-			pkt_len -= len;
-
-			for (num_frags = fp_cqe->bd_num - 1; num_frags > 0;
-			     num_frags--) {
-				u16 cur_size = pkt_len > rxq->rx_buf_size ?
-						rxq->rx_buf_size : pkt_len;
-				if (unlikely(!cur_size)) {
-					DP_ERR(edev,
-					       "Still got %d BDs for mapping jumbo, but length became 0\n",
-					       num_frags);
-					qede_recycle_rx_bd_ring(rxq, edev,
-								num_frags);
-					dev_kfree_skb_any(skb);
-					goto next_cqe;
-				}
-
-				if (unlikely(qede_alloc_rx_buffer(edev, rxq))) {
-					qede_recycle_rx_bd_ring(rxq, edev,
-								num_frags);
-					dev_kfree_skb_any(skb);
-					goto next_cqe;
-				}
-
-				sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
-				sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
-				qede_rx_bd_ring_consume(rxq);
-
-				dma_unmap_page(&edev->pdev->dev,
-					       sw_rx_data->mapping,
-					       PAGE_SIZE, DMA_FROM_DEVICE);
-
-				skb_fill_page_desc(skb,
-						   skb_shinfo(skb)->nr_frags++,
-						   sw_rx_data->data, 0,
-						   cur_size);
-
-				skb->truesize += PAGE_SIZE;
-				skb->data_len += cur_size;
-				skb->len += cur_size;
-				pkt_len -= cur_size;
-			}
+	/* In case of Jumbo packet, several PAGE_SIZEd buffers will be pointed
+	 * by a single cqe.
+	 */
+	if (fp_cqe->bd_num > 1) {
+		u16 unmapped_frags = qede_rx_build_jumbo(edev, rxq, skb,
+							 fp_cqe, len);
 
-			if (unlikely(pkt_len))
-				DP_ERR(edev,
-				       "Mapped all BDs of jumbo, but still have %d bytes\n",
-				       pkt_len);
+		if (unlikely(unmapped_frags > 0)) {
+			qede_recycle_rx_bd_ring(rxq, edev, unmapped_frags);
+			dev_kfree_skb_any(skb);
+			return 0;
 		}
+	}
 
-		skb->protocol = eth_type_trans(skb, edev->ndev);
+	/* The SKB contains all the data. Now prepare meta-magic */
+	skb->protocol = eth_type_trans(skb, edev->ndev);
+	rx_hash = qede_get_rxhash(edev, fp_cqe->bitfields,
+				  fp_cqe->rss_hash, &rxhash_type);
+	skb_set_hash(skb, rx_hash, rxhash_type);
+	qede_set_skb_csum(skb, csum_flag);
+	skb_record_rx_queue(skb, rxq->rxq_id);
 
-		rx_hash = qede_get_rxhash(edev, fp_cqe->bitfields,
-					  fp_cqe->rss_hash, &rxhash_type);
+	/* SKB is prepared - pass it to stack */
+	qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
 
-		skb_set_hash(skb, rx_hash, rxhash_type);
+	return 1;
+}
 
-		qede_set_skb_csum(skb, csum_flag);
+static int qede_rx_int(struct qede_fastpath *fp, int budget)
+{
+	struct qede_rx_queue *rxq = fp->rxq;
+	struct qede_dev *edev = fp->edev;
+	u16 hw_comp_cons, sw_comp_cons;
+	int work_done = 0;
 
-		skb_record_rx_queue(skb, fp->rxq->rxq_id);
+	hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr);
+	sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
 
-		qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
-next_rx_only:
-		rx_pkt++;
+	/* Memory barrier to prevent the CPU from doing speculative reads of CQE
+	 * / BD in the while-loop before reading hw_comp_cons. If the CQE is
+	 * read before it is written by FW, then FW writes CQE and SB, and then
+	 * the CPU reads the hw_comp_cons, it will use an old CQE.
+	 */
+	rmb();
 
-next_cqe: /* don't consume bd rx buffer */
+	/* Loop to complete all indicated BDs */
+	while ((sw_comp_cons != hw_comp_cons) && (work_done < budget)) {
+		qede_rx_process_cqe(edev, fp, rxq);
 		qed_chain_recycle_consumed(&rxq->rx_comp_ring);
 		sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
-		/* CR TPA - revisit how to handle budget in TPA perhaps
-		 * increase on "end"
-		 */
-		if (rx_pkt == budget)
-			break;
-	} /* repeat while sw_comp_cons != hw_comp_cons... */
+		work_done++;
+	}
 
 	/* Update producers */
 	qede_update_rx_prod(edev, rxq);
 
-	rxq->rcv_pkts += rx_pkt;
+	return work_done;
+}
+
+static bool qede_poll_is_more_work(struct qede_fastpath *fp)
+{
+	qed_sb_update_sb_idx(fp->sb_info);
 
-	return rx_pkt;
+	/* *_has_*_work() reads the status block, thus we need to ensure that
+	 * status block indices have been actually read (qed_sb_update_sb_idx)
+	 * prior to this check (*_has_*_work) so that we won't write the
+	 * "newer" value of the status block to HW (if there was a DMA right
+	 * after qede_has_rx_work and if there is no rmb, the memory reading
+	 * (qed_sb_update_sb_idx) may be postponed to right before *_ack_sb).
+	 * In this case there will never be another interrupt until there is
+	 * another update of the status block, while there is still unhandled
+	 * work.
+	 */
+	rmb();
+
+	if (likely(fp->type & QEDE_FASTPATH_RX))
+		if (qede_has_rx_work(fp->rxq))
+			return true;
+
+	if (likely(fp->type & QEDE_FASTPATH_TX))
+		if (qede_txq_has_work(fp->txq))
+			return true;
+
+	return false;
 }
 
 static int qede_poll(struct napi_struct *napi, int budget)
@@ -1631,32 +1692,11 @@ static int qede_poll(struct napi_struct *napi, int budget)
 			qede_has_rx_work(fp->rxq)) ?
 			qede_rx_int(fp, budget) : 0;
 	if (rx_work_done < budget) {
-		qed_sb_update_sb_idx(fp->sb_info);
-		/* *_has_*_work() reads the status block,
-		 * thus we need to ensure that status block indices
-		 * have been actually read (qed_sb_update_sb_idx)
-		 * prior to this check (*_has_*_work) so that
-		 * we won't write the "newer" value of the status block
-		 * to HW (if there was a DMA right after
-		 * qede_has_rx_work and if there is no rmb, the memory
-		 * reading (qed_sb_update_sb_idx) may be postponed
-		 * to right before *_ack_sb). In this case there
-		 * will never be another interrupt until there is
-		 * another update of the status block, while there
-		 * is still unhandled work.
-		 */
-		rmb();
-
-		/* Fall out from the NAPI loop if needed */
-		if (!((likely(fp->type & QEDE_FASTPATH_RX) &&
-		       qede_has_rx_work(fp->rxq)) ||
-		      (likely(fp->type & QEDE_FASTPATH_TX) &&
-		       qede_txq_has_work(fp->txq)))) {
+		if (!qede_poll_is_more_work(fp)) {
 			napi_complete(napi);
 
 			/* Update and reenable interrupts */
-			qed_sb_ack(fp->sb_info, IGU_INT_ENABLE,
-				   1 /*update*/);
+			qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1);
 		} else {
 			rx_work_done = budget;
 		}
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 06/11] qede: Revise state locking scheme
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

As qede utilizes an internal-reload sequence as result of various
configuration changes, the netif state wouldn't always accurately describe
the status of the configuration.
To compensate, we're storing an internal state of the device, which should
only be accessed under the qede_lock.

This patch fixes and improves several state/lock interactions:
  - The internal state should only be checked while locked.
  - While holding lock, it's preferable to check state rather than
    the netdevice's state.
  - The reload sequence is not 'atomic' - unload and subsequent load
    are not in the same critical section.

This also add the 'locked' variant for the reload, which would later be
used by XDP - useful in the case where the correct sequence is 'lock,
check state and re-configure if good', instead of allowing the reload
itself to make the decision regarding the configurability of the device.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |  14 ++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  37 +++---
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 146 +++++++++++++++---------
 3 files changed, 122 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index b9584b2..e0a94bc 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -338,8 +338,12 @@ struct qede_fastpath {
 #define QEDE_SP_VXLAN_PORT_CONFIG	2
 #define QEDE_SP_GENEVE_PORT_CONFIG	3
 
-union qede_reload_args {
-	u16 mtu;
+struct qede_reload_args {
+	void (*func)(struct qede_dev *edev, struct qede_reload_args *args);
+	union {
+		netdev_features_t features;
+		u16 mtu;
+	} u;
 };
 
 #ifdef CONFIG_DCB
@@ -348,11 +352,11 @@ struct qede_fastpath {
 void qede_config_debug(uint debug, u32 *p_dp_module, u8 *p_dp_level);
 void qede_set_ethtool_ops(struct net_device *netdev);
 void qede_reload(struct qede_dev *edev,
-		 void (*func)(struct qede_dev *edev,
-			      union qede_reload_args *args),
-		 union qede_reload_args *args);
+		 struct qede_reload_args *args, bool is_locked);
 int qede_change_mtu(struct net_device *dev, int new_mtu);
 void qede_fill_by_demand_stats(struct qede_dev *edev);
+void __qede_lock(struct qede_dev *edev);
+void __qede_unlock(struct qede_dev *edev);
 bool qede_has_rx_work(struct qede_rx_queue *rxq);
 int qede_txq_has_work(struct qede_tx_queue *txq);
 void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, struct qede_dev *edev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index d2c23d1..ef8c327 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -258,7 +258,9 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 
 	qede_fill_by_demand_stats(edev);
 
-	mutex_lock(&edev->qede_lock);
+	/* Need to protect the access to the fastpath array */
+	__qede_lock(edev);
+
 	for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) {
 		fp = &edev->fp_array[i];
 
@@ -278,7 +280,7 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 		buf++;
 	}
 
-	mutex_unlock(&edev->qede_lock);
+	__qede_unlock(edev);
 }
 
 static int qede_get_sset_count(struct net_device *dev, int stringset)
@@ -374,6 +376,8 @@ static int qede_get_link_ksettings(struct net_device *dev,
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
 
+	__qede_lock(edev);
+
 	memset(&current_link, 0, sizeof(current_link));
 	edev->ops->common->get_link(edev->cdev, &current_link);
 
@@ -393,6 +397,9 @@ static int qede_get_link_ksettings(struct net_device *dev,
 		base->speed = SPEED_UNKNOWN;
 		base->duplex = DUPLEX_UNKNOWN;
 	}
+
+	__qede_unlock(edev);
+
 	base->port = current_link.port;
 	base->autoneg = (current_link.autoneg) ? AUTONEG_ENABLE :
 			AUTONEG_DISABLE;
@@ -701,8 +708,7 @@ static int qede_set_ringparam(struct net_device *dev,
 	edev->q_num_rx_buffers = ering->rx_pending;
 	edev->q_num_tx_buffers = ering->tx_pending;
 
-	if (netif_running(edev->ndev))
-		qede_reload(edev, NULL, NULL);
+	qede_reload(edev, NULL, false);
 
 	return 0;
 }
@@ -787,29 +793,27 @@ static int qede_get_regs_len(struct net_device *ndev)
 		return -EINVAL;
 }
 
-static void qede_update_mtu(struct qede_dev *edev, union qede_reload_args *args)
+static void qede_update_mtu(struct qede_dev *edev,
+			    struct qede_reload_args *args)
 {
-	edev->ndev->mtu = args->mtu;
+	edev->ndev->mtu = args->u.mtu;
 }
 
 /* Netdevice NDOs */
 int qede_change_mtu(struct net_device *ndev, int new_mtu)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
-	union qede_reload_args args;
+	struct qede_reload_args args;
 
 	DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN),
 		   "Configuring MTU size of %d\n", new_mtu);
 
-	/* Set the mtu field and re-start the interface if needed*/
-	args.mtu = new_mtu;
-
-	if (netif_running(edev->ndev))
-		qede_reload(edev, &qede_update_mtu, &args);
-
-	qede_update_mtu(edev, &args);
+	/* Set the mtu field and re-start the interface if needed */
+	args.u.mtu = new_mtu;
+	args.func = &qede_update_mtu;
+	qede_reload(edev, &args, false);
 
-	edev->ops->common->update_mtu(edev->cdev, args.mtu);
+	edev->ops->common->update_mtu(edev->cdev, new_mtu);
 
 	return 0;
 }
@@ -893,8 +897,7 @@ static int qede_set_channels(struct net_device *dev,
 		       sizeof(edev->rss_params.rss_ind_table));
 	}
 
-	if (netif_running(dev))
-		qede_reload(edev, NULL, NULL);
+	qede_reload(edev, NULL, false);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index acc350b..8774f8e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -100,6 +100,19 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 				struct qede_rx_queue *rxq);
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
+/* The qede lock is used to protect driver state change and driver flows that
+ * are not reentrant.
+ */
+void __qede_lock(struct qede_dev *edev)
+{
+	mutex_lock(&edev->qede_lock);
+}
+
+void __qede_unlock(struct qede_dev *edev)
+{
+	mutex_unlock(&edev->qede_lock);
+}
+
 #ifdef CONFIG_QED_SRIOV
 static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos,
 			    __be16 vlan_proto)
@@ -1952,7 +1965,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qede_vlan *vlan, *tmp;
-	int rc;
+	int rc = 0;
 
 	DP_VERBOSE(edev, NETIF_MSG_IFUP, "Adding vlan 0x%04x\n", vid);
 
@@ -1976,6 +1989,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	}
 
 	/* If interface is down, cache this VLAN ID and return */
+	__qede_lock(edev);
 	if (edev->state != QEDE_STATE_OPEN) {
 		DP_VERBOSE(edev, NETIF_MSG_IFDOWN,
 			   "Interface is down, VLAN %d will be configured when interface is up\n",
@@ -1983,8 +1997,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 		if (vid != 0)
 			edev->non_configured_vlans++;
 		list_add(&vlan->list, &edev->vlan_list);
-
-		return 0;
+		goto out;
 	}
 
 	/* Check for the filter limit.
@@ -2000,7 +2013,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 			DP_ERR(edev, "Failed to configure VLAN %d\n",
 			       vlan->vid);
 			kfree(vlan);
-			return -EINVAL;
+			goto out;
 		}
 		vlan->configured = true;
 
@@ -2017,7 +2030,9 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 
 	list_add(&vlan->list, &edev->vlan_list);
 
-	return 0;
+out:
+	__qede_unlock(edev);
+	return rc;
 }
 
 static void qede_del_vlan_from_list(struct qede_dev *edev,
@@ -2094,11 +2109,12 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qede_vlan *vlan = NULL;
-	int rc;
+	int rc = 0;
 
 	DP_VERBOSE(edev, NETIF_MSG_IFDOWN, "Removing vlan 0x%04x\n", vid);
 
 	/* Find whether entry exists */
+	__qede_lock(edev);
 	list_for_each_entry(vlan, &edev->vlan_list, list)
 		if (vlan->vid == vid)
 			break;
@@ -2106,7 +2122,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	if (!vlan || (vlan->vid != vid)) {
 		DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN),
 			   "Vlan isn't configured\n");
-		return 0;
+		goto out;
 	}
 
 	if (edev->state != QEDE_STATE_OPEN) {
@@ -2116,7 +2132,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 		DP_VERBOSE(edev, NETIF_MSG_IFDOWN,
 			   "Interface is down, removing VLAN from list only\n");
 		qede_del_vlan_from_list(edev, vlan);
-		return 0;
+		goto out;
 	}
 
 	/* Remove vlan */
@@ -2125,7 +2141,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 					    vid);
 		if (rc) {
 			DP_ERR(edev, "Failed to remove VLAN %d\n", vid);
-			return -EINVAL;
+			goto out;
 		}
 	}
 
@@ -2136,6 +2152,8 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	 */
 	rc = qede_configure_vlan_filters(edev);
 
+out:
+	__qede_unlock(edev);
 	return rc;
 }
 
@@ -2165,7 +2183,13 @@ static void qede_vlan_mark_nonconfigured(struct qede_dev *edev)
 	edev->accept_any_vlan = false;
 }
 
-static int qede_set_features(struct net_device *dev, netdev_features_t features)
+static void qede_set_features_reload(struct qede_dev *edev,
+				     struct qede_reload_args *args)
+{
+	edev->ndev->features = args->u.features;
+}
+
+int qede_set_features(struct net_device *dev, netdev_features_t features)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	netdev_features_t changes = features ^ dev->features;
@@ -2179,9 +2203,14 @@ static int qede_set_features(struct net_device *dev, netdev_features_t features)
 			need_reload = edev->gro_disable;
 	}
 
-	if (need_reload && netif_running(edev->ndev)) {
-		dev->features = features;
-		qede_reload(edev, NULL, NULL);
+	if (need_reload) {
+		struct qede_reload_args args;
+
+		args.u.features = features;
+		args.func = &qede_set_features_reload;
+
+		qede_reload(edev, &args, false);
+
 		return 1;
 	}
 
@@ -2528,12 +2557,11 @@ static void qede_sp_task(struct work_struct *work)
 					     sp_task.work);
 	struct qed_dev *cdev = edev->cdev;
 
-	mutex_lock(&edev->qede_lock);
+	__qede_lock(edev);
 
-	if (edev->state == QEDE_STATE_OPEN) {
-		if (test_and_clear_bit(QEDE_SP_RX_MODE, &edev->sp_flags))
+	if (test_and_clear_bit(QEDE_SP_RX_MODE, &edev->sp_flags))
+		if (edev->state == QEDE_STATE_OPEN)
 			qede_config_rx_mode(edev->ndev);
-	}
 
 	if (test_and_clear_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags)) {
 		struct qed_tunn_params tunn_params;
@@ -2553,7 +2581,7 @@ static void qede_sp_task(struct work_struct *work)
 		qed_ops->tunn_config(cdev, &tunn_params);
 	}
 
-	mutex_unlock(&edev->qede_lock);
+	__qede_unlock(edev);
 }
 
 static void qede_update_pf_params(struct qed_dev *cdev)
@@ -3576,15 +3604,18 @@ enum qede_unload_mode {
 	QEDE_UNLOAD_NORMAL,
 };
 
-static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode)
+static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode,
+			bool is_locked)
 {
 	struct qed_link_params link_params;
 	int rc;
 
 	DP_INFO(edev, "Starting qede unload\n");
 
+	if (!is_locked)
+		__qede_lock(edev);
+
 	qede_roce_dev_event_close(edev);
-	mutex_lock(&edev->qede_lock);
 	edev->state = QEDE_STATE_CLOSED;
 
 	/* Close OS Tx */
@@ -3616,7 +3647,8 @@ static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode)
 	qede_free_fp_array(edev);
 
 out:
-	mutex_unlock(&edev->qede_lock);
+	if (!is_locked)
+		__qede_unlock(edev);
 	DP_INFO(edev, "Ending qede unload\n");
 }
 
@@ -3625,7 +3657,8 @@ enum qede_load_mode {
 	QEDE_LOAD_RELOAD,
 };
 
-static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
+static int qede_load(struct qede_dev *edev, enum qede_load_mode mode,
+		     bool is_locked)
 {
 	struct qed_link_params link_params;
 	struct qed_link_output link_output;
@@ -3633,13 +3666,16 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 
 	DP_INFO(edev, "Starting qede load\n");
 
+	if (!is_locked)
+		__qede_lock(edev);
+
 	rc = qede_set_num_queues(edev);
 	if (rc)
-		goto err0;
+		goto out;
 
 	rc = qede_alloc_fp_array(edev);
 	if (rc)
-		goto err0;
+		goto out;
 
 	qede_init_fp(edev);
 
@@ -3669,10 +3705,6 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	/* Add primary mac and set Rx filters */
 	ether_addr_copy(edev->primary_mac, edev->ndev->dev_addr);
 
-	mutex_lock(&edev->qede_lock);
-	edev->state = QEDE_STATE_OPEN;
-	mutex_unlock(&edev->qede_lock);
-
 	/* Program un-configured VLANs */
 	qede_configure_vlan_filters(edev);
 
@@ -3687,10 +3719,12 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	qede_roce_dev_event_open(edev);
 	qede_link_update(edev, &link_output);
 
+	edev->state = QEDE_STATE_OPEN;
+
 	DP_INFO(edev, "Ending successfully qede load\n");
 
-	return 0;
 
+	goto out;
 err4:
 	qede_sync_free_irqs(edev);
 	memset(&edev->int_info.msix_cnt, 0, sizeof(struct qed_int_info));
@@ -3704,26 +3738,40 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	edev->num_queues = 0;
 	edev->fp_num_tx = 0;
 	edev->fp_num_rx = 0;
-err0:
+out:
+	if (!is_locked)
+		__qede_unlock(edev);
+
 	return rc;
 }
 
+/* 'func' should be able to run between unload and reload assuming interface
+ * is actually running, or afterwards in case it's currently DOWN.
+ */
 void qede_reload(struct qede_dev *edev,
-		 void (*func)(struct qede_dev *, union qede_reload_args *),
-		 union qede_reload_args *args)
+		 struct qede_reload_args *args, bool is_locked)
 {
-	qede_unload(edev, QEDE_UNLOAD_NORMAL);
-	/* Call function handler to update parameters
-	 * needed for function load.
+	if (!is_locked)
+		__qede_lock(edev);
+
+	/* Since qede_lock is held, internal state wouldn't change even
+	 * if netdev state would start transitioning. Check whether current
+	 * internal configuration indicates device is up, then reload.
 	 */
-	if (func)
-		func(edev, args);
+	if (edev->state == QEDE_STATE_OPEN) {
+		qede_unload(edev, QEDE_UNLOAD_NORMAL, true);
+		if (args)
+			args->func(edev, args);
+		qede_load(edev, QEDE_LOAD_RELOAD, true);
 
-	qede_load(edev, QEDE_LOAD_RELOAD);
+		/* Since no one is going to do it for us, re-configure */
+		qede_config_rx_mode(edev->ndev);
+	} else if (args) {
+		args->func(edev, args);
+	}
 
-	mutex_lock(&edev->qede_lock);
-	qede_config_rx_mode(edev->ndev);
-	mutex_unlock(&edev->qede_lock);
+	if (!is_locked)
+		__qede_unlock(edev);
 }
 
 /* called with rtnl_lock */
@@ -3736,8 +3784,7 @@ static int qede_open(struct net_device *ndev)
 
 	edev->ops->common->set_power_state(edev->cdev, PCI_D0);
 
-	rc = qede_load(edev, QEDE_LOAD_NORMAL);
-
+	rc = qede_load(edev, QEDE_LOAD_NORMAL, false);
 	if (rc)
 		return rc;
 
@@ -3752,7 +3799,7 @@ static int qede_close(struct net_device *ndev)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
 
-	qede_unload(edev, QEDE_UNLOAD_NORMAL);
+	qede_unload(edev, QEDE_UNLOAD_NORMAL, false);
 
 	edev->ops->common->update_drv_state(edev->cdev, false);
 
@@ -3884,15 +3931,8 @@ static void qede_set_rx_mode(struct net_device *ndev)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
 
-	DP_INFO(edev, "qede_set_rx_mode called\n");
-
-	if (edev->state != QEDE_STATE_OPEN) {
-		DP_INFO(edev,
-			"qede_set_rx_mode called while interface is down\n");
-	} else {
-		set_bit(QEDE_SP_RX_MODE, &edev->sp_flags);
-		schedule_delayed_work(&edev->sp_task, 0);
-	}
+	set_bit(QEDE_SP_RX_MODE, &edev->sp_flags);
+	schedule_delayed_work(&edev->sp_task, 0);
 }
 
 /* Must be called with qede_lock held */
-- 
1.9.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox