Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 net-next 02/11] ibmvnic: Replace is_closed with state field
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Replace the is_closed flag in the ibmvnic adapter strcut with a
more comprehensive state field that tracks the current state of
the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   20 +++++++++++++-------
 drivers/net/ethernet/ibm/ibmvnic.h |   12 ++++++++++--
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index c67f1d6..40a8ba0 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -669,7 +669,9 @@ static int ibmvnic_open(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int i, rc;
 
-	if (adapter->is_closed) {
+	adapter->state = VNIC_OPENING;
+
+	if (adapter->state == VNIC_CLOSED) {
 		rc = ibmvnic_init(adapter);
 		if (rc)
 			return rc;
@@ -704,7 +706,7 @@ static int ibmvnic_open(struct net_device *netdev)
 		release_resources(adapter);
 	} else {
 		netif_tx_start_all_queues(netdev);
-		adapter->is_closed = false;
+		adapter->state = VNIC_OPEN;
 	}
 
 	return rc;
@@ -733,7 +735,7 @@ static int ibmvnic_close(struct net_device *netdev)
 	int rc = 0;
 	int i;
 
-	adapter->closing = true;
+	adapter->state = VNIC_CLOSING;
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -748,8 +750,7 @@ static int ibmvnic_close(struct net_device *netdev)
 
 	release_resources(adapter);
 
-	adapter->is_closed = true;
-	adapter->closing = false;
+	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
@@ -1860,7 +1861,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter,
 {
 	union sub_crq *entry = &scrq->msgs[scrq->cur];
 
-	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing)
+	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP ||
+	    adapter->state == VNIC_CLOSING)
 		return 1;
 	else
 		return 0;
@@ -3353,6 +3355,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		return -ENOMEM;
 
 	adapter = netdev_priv(netdev);
+	adapter->state = VNIC_PROBING;
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
@@ -3380,7 +3383,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 
 	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-	adapter->is_closed = false;
 
 	rc = register_netdev(netdev);
 	if (rc) {
@@ -3390,6 +3392,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 	dev_info(&dev->dev, "ibmvnic registered\n");
 
+	adapter->state = VNIC_PROBED;
 	return 0;
 }
 
@@ -3398,12 +3401,15 @@ static int ibmvnic_remove(struct vio_dev *dev)
 	struct net_device *netdev = dev_get_drvdata(&dev->dev);
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 
+	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
 	release_crq_queue(adapter);
 
+	adapter->state = VNIC_REMOVED;
+
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index a69979f..03a866f 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -913,6 +913,15 @@ struct ibmvnic_error_buff {
 	__be32 error_id;
 };
 
+enum vnic_state {VNIC_PROBING = 1,
+		 VNIC_PROBED,
+		 VNIC_OPENING,
+		 VNIC_OPEN,
+		 VNIC_CLOSING,
+		 VNIC_CLOSED,
+		 VNIC_REMOVING,
+		 VNIC_REMOVED};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -962,7 +971,6 @@ struct ibmvnic_adapter {
 	u64 promisc;
 
 	struct ibmvnic_tx_pool *tx_pool;
-	bool closing;
 	struct completion init_done;
 	int init_done_rc;
 
@@ -1011,5 +1019,5 @@ struct ibmvnic_adapter {
 	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
 	bool failover;
-	bool is_closed;
+	enum vnic_state state;
 };

^ permalink raw reply related

* [PATCH v3 net-next 01/11] ibmvnic: Move resource initialization to its own routine
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon
In-Reply-To: <20170502185006.28214.89072.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>

Move all of the calls to initialize resources for the driver to
a separate routine.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   71 ++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4fcd2f0..c67f1d6 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -624,22 +624,10 @@ static int set_real_num_queues(struct net_device *netdev)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int init_resources(struct ibmvnic_adapter *adapter)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-	struct device *dev = &adapter->vdev->dev;
-	int rc = 0;
-	int i;
-
-	if (adapter->is_closed) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
 
 	rc = set_real_num_queues(netdev);
 	if (rc)
@@ -647,7 +635,7 @@ static int ibmvnic_open(struct net_device *netdev)
 
 	rc = init_sub_crq_irqs(adapter);
 	if (rc) {
-		dev_err(dev, "failed to initialize sub crq irqs\n");
+		netdev_err(netdev, "failed to initialize sub crq irqs\n");
 		return -1;
 	}
 
@@ -659,25 +647,47 @@ static int ibmvnic_open(struct net_device *netdev)
 	adapter->napi = kcalloc(adapter->req_rx_queues,
 				sizeof(struct napi_struct), GFP_KERNEL);
 	if (!adapter->napi)
-		goto ibmvnic_open_fail;
+		return -ENOMEM;
+
 	for (i = 0; i < adapter->req_rx_queues; i++) {
 		netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll,
 			       NAPI_POLL_WEIGHT);
-		napi_enable(&adapter->napi[i]);
 	}
 
 	send_map_query(adapter);
 
 	rc = init_rx_pools(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
 
 	rc = init_tx_pools(netdev);
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int i, rc;
+
+	if (adapter->is_closed) {
+		rc = ibmvnic_init(adapter);
+		if (rc)
+			return rc;
+	}
+
+	rc = ibmvnic_login(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
+
+	rc = init_resources(adapter);
+	if (rc)
+		return rc;
 
 	replenish_pools(adapter);
 
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_enable(&adapter->napi[i]);
+
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
@@ -688,19 +698,16 @@ static int ibmvnic_open(struct net_device *netdev)
 		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
-	if (rc)
-		goto ibmvnic_open_fail;
-
-	netif_tx_start_all_queues(netdev);
-	adapter->is_closed = false;
-
-	return 0;
+	if (rc) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_disable(&adapter->napi[i]);
+		release_resources(adapter);
+	} else {
+		netif_tx_start_all_queues(netdev);
+		adapter->is_closed = false;
+	}
 
-ibmvnic_open_fail:
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		napi_disable(&adapter->napi[i]);
-	release_resources(adapter);
-	return -ENOMEM;
+	return rc;
 }
 
 static void disable_sub_crqs(struct ibmvnic_adapter *adapter)

^ permalink raw reply related

* [PATCH v3 net-next 00/11] ibmvnic: Updated reset handler and code fixes
From: Nathan Fontenot @ 2017-05-02 18:52 UTC (permalink / raw)
  To: netdev; +Cc: brking, jallen, muvic, tlfalcon

This set of patches multiple code fixes and a new rest handler
for the ibmvnic driver. In order to implement the new reset handler
for the ibmvnic driver resource initialization needed to be moved to
its own routine, a state variable is introduced to replace the
various is_* flags in the driver, and a new routine to handle the
assorted reasons the driver can be reset.

v3 updates:

Patch 10/11: Correct patch subject line to be a description of the patch.

v2 updates:

Patch 11/11: Use __netif_subqueue_stopped() instead of
netif_subqueue_stopped() to avoid possible use of an un-initialized
skb variable.

---

Nathan Fontenot (10):
      ibmvnic: Move resource initialization to its own routine
      ibmvnic: Replace is_closed with state field
      ibmvnic: Updated reset handling
      ibmvnic: Delete napi's when releasing driver resources
      ibmvnic: Whitespace correction in release_rx_pools
      ibmvnic: Clean up tx pools when closing
      ibmvnic: Wait for any pending scrqs entries at driver close
      ibmvnic: Check for driver reset first in ibmvnic_xmit
      ibmvnic: Continue skb processing after skb completion error
      ibmvnic: Move queue restarting in ibmvnic_tx_complete

Thomas Falcon (1):
      ibmvnic: Record SKB RX queue during poll


 drivers/net/ethernet/ibm/ibmvnic.c |  563 +++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   31 ++
 2 files changed, 389 insertions(+), 205 deletions(-)

^ permalink raw reply

* [PATCH] selftests: bpf: Use bpf_endian.h in test_xdp.c
From: David Miller @ 2017-05-02 14:54 UTC (permalink / raw)
  To: netdev; +Cc: ast, daniel


This fixes the testcase on big-endian.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

Committed to net-next, I almost have test_progs working fully
on sparc with a small verifier hack...

 tools/testing/selftests/bpf/test_xdp.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_xdp.c b/tools/testing/selftests/bpf/test_xdp.c
index 9a33b03..5e7df8b 100644
--- a/tools/testing/selftests/bpf/test_xdp.c
+++ b/tools/testing/selftests/bpf/test_xdp.c
@@ -17,10 +17,9 @@
 #include <linux/pkt_cls.h>
 #include <sys/socket.h>
 #include "bpf_helpers.h"
+#include "bpf_endian.h"
 #include "test_iptunnel_common.h"
 
-#define htons __builtin_bswap16
-#define ntohs __builtin_bswap16
 int _version SEC("version") = 1;
 
 struct bpf_map_def SEC("maps") rxcnt = {
@@ -104,7 +103,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	vip.family = AF_INET;
 	vip.daddr.v4 = iph->daddr;
 	vip.dport = dport;
-	payload_len = ntohs(iph->tot_len);
+	payload_len = bpf_ntohs(iph->tot_len);
 
 	tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
 	/* It only does v4-in-v4 */
@@ -126,7 +125,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	    iph + 1 > data_end)
 		return XDP_DROP;
 
-	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
+	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
 
 	iph->version = 4;
 	iph->ihl = sizeof(*iph) >> 2;
@@ -134,7 +133,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	iph->protocol = IPPROTO_IPIP;
 	iph->check = 0;
 	iph->tos = 0;
-	iph->tot_len = htons(payload_len + sizeof(*iph));
+	iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
 	iph->daddr = tnl->daddr.v4;
 	iph->saddr = tnl->saddr.v4;
 	iph->ttl = 8;
@@ -195,12 +194,12 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
 	    ip6h + 1 > data_end)
 		return XDP_DROP;
 
-	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
+	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
 
 	ip6h->version = 6;
 	ip6h->priority = 0;
 	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
-	ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
+	ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
 	ip6h->nexthdr = IPPROTO_IPV6;
 	ip6h->hop_limit = 8;
 	memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
@@ -224,9 +223,9 @@ int _xdp_tx_iptunnel(struct xdp_md *xdp)
 
 	h_proto = eth->h_proto;
 
-	if (h_proto == htons(ETH_P_IP))
+	if (h_proto == bpf_htons(ETH_P_IP))
 		return handle_ipv4(xdp);
-	else if (h_proto == htons(ETH_P_IPV6))
+	else if (h_proto == bpf_htons(ETH_P_IPV6))
 
 		return handle_ipv6(xdp);
 	else
-- 
2.1.2.532.g19b5d50

^ permalink raw reply related

* Re: [PATCH v1 1/3] bnx2x: Replace custom scnprintf()
From: David Miller @ 2017-05-02 14:42 UTC (permalink / raw)
  To: andy.shevchenko; +Cc: Yuval.Mintz, andriy.shevchenko, netdev, Ariel.Elior
In-Reply-To: <CAHp75VfX71nmjzr23WWUO6ycGcvyw1sNkdTLhX-L=Pbk2jDPLg@mail.gmail.com>

From: Andy Shevchenko <andy.shevchenko@gmail.com>
Date: Tue, 2 May 2017 11:45:50 +0300

> On Sun, Apr 30, 2017 at 6:32 PM, David Miller <davem@davemloft.net> wrote:
>> It is an essential part of all patch series.
> 
> I hope it doesn't apply for the single patch(es).

No it does not.

^ permalink raw reply

* Re: net/smc and the RDMA core
From: Doug Ledford @ 2017-05-02 14:34 UTC (permalink / raw)
  To: Ursula Braun, Bart Van Assche, hch-jcswGhMUV9g@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <dce14470-06f4-8da3-6894-cd724eac3447-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 5819 bytes --]

On 5/2/2017 8:34 AM, Ursula Braun wrote:
> On 05/01/2017 07:29 PM, Bart Van Assche wrote:
>> On Mon, 2017-05-01 at 18:33 +0200, Christoph Hellwig wrote:
>>> Hi Ursual, hi netdev reviewers,
>>>
>>> how did the smc protocol manage to get merged without any review 
>>> on linux-rdma at all?  As the results it seems it's very substandard
>>> in terms of RDMA API usage, e.g. it neither uses the proper CQ API
>>> nor the RDMA R/W API, and other will probably find additional issues
>>> as well.
>>
>> Hello Dave and Ursula,
>>
>> It seems very rude to me to have merged the SMC protocol driver without
>> having involved the linux-rdma community. Anyway, I have the following
>> questions for Dave and Ursula:
>> * Since the Linux kernel is standards based: where can we find the standard
>>   that defines the SMC wire protocol? If this protocol has not been
>>   standardized yet: in what file (other than *.[ch]) in the Linux kernel
>>   tree has this protocol been documented?
> 
> Hello Bart,
> 
> The protocol is standardized, see: http://www.rfc-editor.org/info/rfc7609.

No, SMC-R is *not* standardized.  It is informationally publicized.  In
other words, you guys dumped what you considered your standard out and
rfc-editor published it under the "information" status category.  To be
an Internet Standard requires more work, and additionally requires a
period of time for people to comment on this proposal.  Given that this
is for a linux RDMA communications layer, advertising this RFC on the
linux-rdma@ mailing list seems the absolute *minimum* advertising that
should be done during the comment phase (at least as relates to the RDMA
portion of this protocol, which this protocol is first and foremost an
RDMA protocol with only TCP as a control/fallback).  It's been 18 months
since you published this, and this is the first that it's been brought
to the linux-rdma@ mailing list to my knowledge.

Anyway, the RFC is informational, not a standard, and as it stands, I'm
pretty sure the RDMA community would have a number of comments on it
before it were allowed to become a standard, including the fact that it
is currently exclusive to RoCEv1 which is, as far as the RDMA community
is concerned, a deprecated RoCE mode.  Adding a new protocol that only
supports this is just simply short sighted.  Had we been consulted
before this was merged, I have no doubt that we would have had a
considerable list of review requirements.  But, we are where we are, so
the issue is how to move forward.  Moving it to staging doesn't seem so
inappropriate in this particular case...

> I described this and some more protocol details in my patch series
> overview mail, see for instance:
>      http://marc.info/?l=linux-s390&m=148397751211964&w=2
> 
> This description explains the reasons to come up with SMC-R.
> 
>> * What are the differences between the SMC protocol, the SDP protocol and
>>   the rsockets protocol? How do existing implementations for these protocols
>>   compare to each other from a performance point of view? If no performance
>>   comparison between these protocols is available, shouldn't the performance
>>   of these protocols have been compared with each other before a review of
>>   the SMC driver even started?
>> * What are the reasons why the SDP driver was never accepted upstream? Do
>>   the arguments why SDP was not accepted upstream also apply to the SMC
>>   driver (SDP = Sockets Direct Protocol)?
>> * Since SMC has to be selected by specifying AF_SMC, how are users expected
>>   to specify whether AF_INET, AF_INET6 or yet another address family should
>>   be used to set up a connection between SMC
>> endpoints?
> 
> The IPv6 support in SMC-R is on our todo-list.
> 
>> * Is the SMC driver limited to RoCE? Are you aware that the rsockets library
>>   supports multiple transport layers (RoCE, IB and iWARP)?
> 
> For now, only RoCE is supported. Other transports might be added in the future.

We generally don't allow this type of partial support in RDMA code if we
can avoid it.  There are means in place in the RDMA subsystem to hide
the differences between the different RDMA types and we highly frown on
any code that doesn't deal with all of the fabrics.  Old code might get
grandfathered in if it had good reason for being specific to a fabric,
but not new code.  Fixing this would have been high on our list of
review items.  At a minimum supporting iWARP and all flavors of RoCE
would have required as these are all Ethernet devices at their heart and
should all support the required TCP control channel and such.  We
*might* have been more lenient on IB and OPA since they don't have a
native TCP network device, but since IPoIB works on both, even that
isn't really a reason not to support them.  The real issue is the much
more difficult issue of namespaces and SELinux, which is different
between TCP sockets and IB/OPA connections.  That would have been the
difficult part to get right, but as we haven't investigated it, we
really don't know if it would have been solvable in a reasonable fashion.

>> * Since functionality that is similar what the SMC driver provides already
>>   exists in user space (rsockets), why has this functionality been
>>   reimplemented as a kernel driver (SMC)?
>>
>> Bart.
>>
> 
> Regards, Ursula
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
    GPG Key ID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 884 bytes --]

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: fix mdio bus name when using devicetree
From: Sylvain Lemieux @ 2017-05-02 14:26 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: vivien.didelot, netdev
In-Reply-To: <20170501152054.GD1285@lunn.ch>

Hi Andrew,

On Mon, 2017-05-01 at 17:20 +0200, Andrew Lunn wrote:
> On Mon, May 01, 2017 at 10:54:04AM -0400, Sylvain Lemieux wrote:
> > From: Liam Beguin <lbeguin@tycoint.com>
> > 
> > mv88e6xxx_mdio_register automatically generates mdio buses for each switch
> > discovered in the devicetree. When switch nodes are embedded in other nodes,
> > this can cause sysfs naming collisions since full_name may be truncated.
> > 
> > Only use devicetree node name instead of the full devicetree path
> > as the mdio bus name.
> 
> Hi Sylvain
> 
> I'm not sure this is a good idea. It probably breaks my boards:
> 
> :/sys/class/mdio_bus# ls
> !mdio-mux!mdio@1!switch@0!mdio	0.1  0.4  400d0000.ethernet-1  fixed-0
> !mdio-mux!mdio@2!switch@0!mdio	0.2  0.8  400d1000.ethernet-2  mv88e6xxx-0
> 
> np->name is not unique, where as np->full_name is unique.
> 
> However, i can understand your problem with truncation. Maybe a better
> solution is to detect if truncation is going to happen. If so, use a
> concatenation of a hash of np->full_name, and the right hand part of
> np->full_name?
> 
Thanks for the feedback.

I am currently busy on something else; I should be able to look at this
next week or the following.

Regards,
Sylvain

> 	Andrew

^ permalink raw reply

* [PATCH net v4 12/12] net: batman-adv: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 14:17 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The batman-adv allocates some resources in its ndo_init func, and free
them in its destructor func. Then there is one memleak that some errors
happen after register_netdevice invokes the ndo_init callback. Because
the destructor would not be invoked to free the resources.

Now create one new func batadv_destructor_free to free the mem in
the destructor, and add ndo_uninit func also invokes it when fail to
register the batman-adv device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/batman-adv/soft-interface.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d042c99..626cbf6 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -878,6 +878,19 @@ static int batadv_softif_init_late(struct net_device *dev)
 	return ret;
 }
 
+static void batadv_destructor_free(struct net_device *dev)
+{
+	batadv_debugfs_del_meshif(dev);
+	batadv_mesh_free(dev);
+}
+
+static void batadv_softif_uninit(struct net_device *dev)
+{
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		batadv_destructor_free(dev);
+}
+
 /**
  * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface
  * @dev: batadv_soft_interface used as master interface
@@ -933,6 +946,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
 
 static const struct net_device_ops batadv_netdev_ops = {
 	.ndo_init = batadv_softif_init_late,
+	.ndo_uninit = batadv_softif_uninit,
 	.ndo_open = batadv_interface_open,
 	.ndo_stop = batadv_interface_release,
 	.ndo_get_stats = batadv_interface_stats,
@@ -953,9 +967,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
  */
 static void batadv_softif_free(struct net_device *dev)
 {
-	batadv_debugfs_del_meshif(dev);
-	batadv_mesh_free(dev);
-
+	batadv_destructor_free(dev);
 	/* some scheduled RCU callbacks need the bat_priv struct to accomplish
 	 * their tasks. Wait for them all to be finished before freeing the
 	 * netdev and its private data (bat_priv)
-- 
1.9.1

^ permalink raw reply related

* Re: sparc64 and ARM64 JIT bug (was Re: LLVM 4.0 code generation bug)
From: Daniel Borkmann @ 2017-05-02 14:11 UTC (permalink / raw)
  To: David Miller, ast; +Cc: netdev, xi.wang, catalin.marinas
In-Reply-To: <20170501.230234.787989809925411599.davem@davemloft.net>

On 05/02/2017 05:02 AM, David Miller wrote:
> From: Alexei Starovoitov <ast@fb.com>
> Date: Mon, 1 May 2017 19:39:33 -0700
>
>> On 5/1/17 7:31 PM, David Miller wrote:
>>>
>>> If the last BPF instruction before exit is a ldimm64, branches to the
>>> exit point at the wrong location.
>>>
>>> Here is what I get from test_pkt_access.c on sparc:
>>>
>>> 0000000000000000 <process>:
>>>     0:	b7 00 00 00 00 00 00 02 	mov	r0, 2
>>>     8:	61 21 00 50 00 00 00 00 	ldw	r2, [r1+80]
>>>    10:	61 11 00 4c 00 00 00 00 	ldw	r1, [r1+76]
>>>    18:	bf 41 00 00 00 00 00 00 	mov	r4, r1
>>>    20:	07 40 00 00 00 00 00 0e 	add	r4, 14
>>>    28:	2d 42 00 25 00 00 00 00 	jgt	r4, r2, 148 <LBB0_11>
>>>   ...
>>> 0000000000000148 <LBB0_11>:
>>>   148:	18 00 00 00 ff ff ff ff 	ldimm64	r0, 4294967295
>>>   150:	00 00 00 00 00 00 00 00
>>>
>>> 0000000000000158 <LBB0_12>:
>>>   158:	95 00 00 00 00 00 00 00 	exit	
>   ...
>> looks fine to me. it jumps to 0x158,
>> since offset 0 is the next insn after jump which is 0x30
>> That's how classic bpf defined jumps.
>
> Ok, it seems that both arm64 and sparc64's JIT handle the above
> situation improperly.
>
> They both work by recording the instruction offsets in an array which
> is indexed off by one.  It it built like this:
>
> 	for (i = 0; i < prog->len; i++) {
> 		const struct bpf_insn *insn = &prog->insnsi[i];
> 		int ret;
>
> 		ret = build_insn(insn, ctx);
> 		ctx->offset[i] = ctx->idx;
>
> 		if (ret > 0) {
> 			i++;
> 			continue;
> 		}
> 		if (ret)
> 			return ret;
> 	}
>
> That is, we record the JIT'd instruction offset for BPF instruction
> 'idx' in array entry 'idx - 1'.
>
> Then when we emit a relative branch, we lookup the destination offset
> using "ctx->offset[this_insn_idx + insn->off]"
>
> And this works most of the time.  It doesn't work for the scenerio
> above, because 'idx - 1' is not necessarily the index of the previous
> BPF instruction.  Instead, that might point to the second half of an
> lddimm64 instruction.
>
> This bug was introduced by commit
> 8eee539ddea09bccae2426f09b0ba6a18b72b691 ("arm64: bpf: fix
> out-of-bounds read in bpf2a64_offset()") and I copied the logic into
> sparc64 :-)

I'll take a look at the arm64 one since I have a node at hand, and add
tests into lib/test_bpf.c as well later today.

^ permalink raw reply

* Re: net/ipv6: warning in inet6_ifa_finish_destroy
From: Mike Manning @ 2017-05-02 14:04 UTC (permalink / raw)
  To: Cong Wang, Andrey Konovalov
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML, Eric Dumazet,
	David Ahern, Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <CAM_iQpUTx_nDtZ63TRVGnqu5L33NzTE8XFMjaxSD_i4ghGFQ7g@mail.gmail.com>

On 28/04/17 21:39, Cong Wang wrote:
> On Fri, Apr 28, 2017 at 6:08 AM, Andrey Konovalov <andreyknvl@google.com> wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit 5a7ad1146caa895ad718a534399e38bd2ba721b7 (4.11-rc8).
>>
>> C reproducer and .config are attached.
>> It takes 1-2 minutes of running the reproducer to trigger the issue.
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 21 at net/ipv6/addrconf.c:894
>> inet6_ifa_finish_destroy+0x12e/0x190
>> Modules linked in:
>> CPU: 0 PID: 21 Comm: kworker/0:1 Not tainted 4.11.0-rc8+ #296
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Workqueue: ipv6_addrconf addrconf_dad_work
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16
>>  dump_stack+0x292/0x398 lib/dump_stack.c:52
>>  __warn+0x19f/0x1e0 kernel/panic.c:549
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>>  inet6_ifa_finish_destroy+0x12e/0x190 c:894
>>  in6_ifa_put ./include/net/addrconf.h:330
>>  addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
> 
> 
> I don't look too much, but a quick glance shows in the following
> path:
> 
>         } else if (action == DAD_ABORT) {
>                 in6_ifa_hold(ifp);
>                 addrconf_dad_stop(ifp, 1);
>                 if (disable_ipv6)
>                         addrconf_ifdown(idev->dev, 0);
>                 goto out;
>         }
> 
> the inet6_addr could be removed from hash table in
> addrconf_ifdown() before calling in6_ifa_put(). which causes
> this warning.
> 

git describe 85b51b12115c79cce7ea1ced6c0bd0339a165d3f --contains
v4.8-rc5~34^2~32

This fix was introduced in 4.8, so it is interesting that this problem is only showing up now for 4.11. Also, it is not reproducible manually, i.e. DAD failure with disable_ipv6 works just fine without triggering this warning, with or without keeping IPv6 addresses on admin down.

I will go ahead with putting out a fix so that in6_ifa_put() precedes the call to addrconf_ifdown() in this case.

Thanks for the heads up on this,
Mike

^ permalink raw reply

* Re: IPsec PFP support on linux
From: Sowmini Varadhan @ 2017-05-02 14:05 UTC (permalink / raw)
  To: Paul Wouters
  Cc: steffen.klassert, borisp, swan, netdev, ilant, linux-crypto,
	herbert
In-Reply-To: <B4EE88ED-B403-42C3-A6D8-1A2526F1DA67@nohats.ca>

On (05/02/17 09:58), Paul Wouters wrote:
>    I think you want to use Opportunistic IPsec, eg see 
>    https://libreswan.org/wiki/HOWTO:_Opportunistic_IPsec

I dont think that what I want is opportunistic ipsec..

taking an extreme example, I can set up the ipsec tunnels with 
esp-null for *.5001 and 5001.* (and that's how I'm able to
trigger quick mode in my experiment).

But I want each 5-tuple to/from 5001 to get its own SPI. That
is not happening in my case. 

If I used OE, I would *never* get a per-5-tuple SPI, right?
(I can give this a try later today but the rfc definition of OE is
not what I have in mind- I really want PFP, not OE).

>    Note that IKEv2 also allows you to define one connection and instantiate
>     a connection based on the trigger packet whose src/dst proto/port are
>    included in the IKEv2 packet as traffic selectors. See RFC7296 and
>    "narrowing".

yes, the rfc's permit this, I'm not sure how to set up my 
SPD for this though.

^ permalink raw reply

* Re: IPsec PFP support on linux
From: Paul Wouters @ 2017-05-02 13:58 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: steffen.klassert, borisp, swan, netdev, ilant, linux-crypto,
	herbert
In-Reply-To: <20170502123238.GE5843@oracle.com>


[-- Attachment #1.1: Type: text/plain, Size: 2979 bytes --]

I think you want to use Opportunistic IPsec, eg see 

https://libreswan.org/wiki/HOWTO:_Opportunistic_IPsec

Note that IKEv2 also allows you to define one connection and instantiate  a connection based on the trigger packet whose src/dst proto/port are included in the IKEv2 packet as traffic selectors. See RFC7296 and "narrowing".

Paul


Sent from my iPhone

> On May 2, 2017, at 08:32, Sowmini Varadhan <sowmini.varadhan@oracle.com> wrote:
> 
> I have a question about linux support for IPsec PFP (as defined in
> rfc 4301). I am assuming this exists, and is accessible from uspace,
> in which case I need some hints on how to set it up.
> 
> Assuming I have a server listening at port 5001 that I want to
> secure via ipsec. Suppose I want to make sure that each TCP/UDP 5-tuple
> sending packets to port 5001 gets its own SA.
> 
> RFC4301 has this:
> 
>       - SPD-S: For traffic that is to be protected using IPsec, the
>         entry consists of the values of the selectors that apply to the
>         traffic to be protected via AH or ESP, controls on how to
>         create SAs based on these selectors, ...
> 
> and further down
>      If IPsec processing is specified for
>      an entry, a "populate from packet" (PFP) flag may be asserted for
>      one or more of the selectors in the SPD entry (Local IP address;
>      Remote IP address; Next Layer Protocol; and, depending on Next
>      Layer Protocol, Local port and Remote port, or ICMP type/code, or
>      Mobility Header type).  If asserted for a given selector X, the
>      flag indicates that the SA to be created should take its value for
>      X from the value in the packet.  Otherwise, the SA should take its
>      value(s) for X from the value(s) in the SPD entry.
> 
> A google search produces a discarded patch
>  http://marc.info/?l=linux-netdev&m=119746758904140
> but its not clear to me how to set this up (if PFP works fine,
> as suggested by Herbert's response above)
> 
> I tried experimenting with IP_XFRM_POLICY from a simple udp client but
> (a) that seems to require a SPI and reqid to set up the SPD 
> (b) I see the SADB_ACQUIRE upcall being triggered after the local port
>    is bound (and SADB entry is set up for the lport).  But ike phase2
>    does not converge for the lport specific sadb added
>    by the bind (even in quick mode)
> 
> My understanding is that pluto shoud be generating spi's to make sure
> they are sufficiently unique/random etc. so (a) makes me think I'm
> either not setting this up or not using this correctly.
> 
> Any hints/sample code/RTFMs would be helpful (documentation for
> IP_XFRM_POLICY seems scant, afaict). I'd be happy to share my 
> udp client program, if it can provide more context to my question.
> 
> --Sowmini
> 
> _______________________________________________
> Swan mailing list
> Swan@lists.libreswan.org
> https://lists.libreswan.org/mailman/listinfo/swan

[-- Attachment #1.2: Type: text/html, Size: 4716 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply

* RE: [PATCH 3/4] net: macb: Add hardware PTP support
From: Rafal Ozieblo @ 2017-05-02 13:57 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David Miller, nicolas.ferre@atmel.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, devicetree@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	harinikatakamlinux@gmail.com, harini.katakam@xilinx.com,
	Andrei.Pistirica@microchip.com
In-Reply-To: <20170414182850.GB7751@localhost.localdomain>

> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: 14 kwietnia 2017 20:29
> To: Rafal Ozieblo <rafalo@cadence.com>
> Cc: David Miller <davem@davemloft.net>; nicolas.ferre@atmel.com;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> devicetree@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> harinikatakamlinux@gmail.com; harini.katakam@xilinx.com;
> Andrei.Pistirica@microchip.com
> Subject: Re: [PATCH 3/4] net: macb: Add hardware PTP support
> 
> On Thu, Apr 13, 2017 at 02:39:23PM +0100, Rafal Ozieblo wrote:
(...)
> > +static int gem_tsu_get_time(struct macb *bp, struct timespec64 *ts)
> > +{
> > +	long first, second;
> > +	u32 secl, sech;
> > +	unsigned long flags;
> > +
> > +	if (!bp || !ts)
> > +		return -EINVAL;
> 
> Useless test.
Sorry for me being too carefulness, I'll remove all tests.
> 
(...)
> > +static int gem_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64
> *ts)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !ts)
> > +		return -EINVAL;
> 
> Useles test.
> 
> What is the point of this wrapper function anyhow?  Please remove it.
gem_ptp_gettime() is assigned in ptp_clock_info and it has to have 
ptp_clock_info pointer as first parameter. gem_tsu_get_time() is used in
the source code but with macb pointer.
Do you want me to do something like:
gem_ptp_gettime(macb->ptp, ts);
and first would be getting macb pointer from ptp ?
struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> 
> > +
> > +	gem_tsu_get_time(bp, ts);
> > +	return 0;
> > +}
> > +
> > +static int gem_ptp_settime(struct ptp_clock_info *ptp,
> > +		const struct timespec64 *ts)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !ts)
> > +		return -EINVAL;
> 
> Another useless function.
ditto
> 
> > +	gem_tsu_set_time(bp, ts);
> > +	return 0;
> > +}
> > +
> > +static int gem_ptp_enable(struct ptp_clock_info *ptp,
> > +			  struct ptp_clock_request *rq, int on)
> > +{
> > +	struct macb *bp = container_of(ptp, struct macb, ptp_clock_info);
> > +
> > +	if (!ptp || !rq)
> > +		return -EINVAL;
> 
> Sigh.
> 
> > +
> > +	switch (rq->type) {
> > +	case PTP_CLK_REQ_EXTTS:	/* Toggle TSU match interrupt */
> > +		if (on)
> > +			macb_writel(bp, IER, MACB_BIT(TCI));
> 
> No locking to protect IER and IDE?
There is no need.
> 
> > +		else
> > +			macb_writel(bp, IDR, MACB_BIT(TCI));
> > +		break;
> > +	case PTP_CLK_REQ_PEROUT: /* Toggle Periodic output */
> > +		return -EOPNOTSUPP;
> > +		/* break; */
> > +	case PTP_CLK_REQ_PPS:	/* Toggle TSU periodic (second)
> interrupt */
> > +		if (on)
> > +			macb_writel(bp, IER, MACB_BIT(SRI));
> > +		else
> > +			macb_writel(bp, IDR, MACB_BIT(SRI));
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +	return 0;
> > +}
> > +
(...)
> > --
> > 2.4.5
> >
> 
> Thanks,
> Richard

^ permalink raw reply

* [PATCH net v4 11/12] net: vlan: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:41 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The vlan driver allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func vlan_destructor_free to free the mem in the
destructor, and ndo_uninit func also invokes it when fail to register
the vlan device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/8021q/vlan_dev.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e97ab82..adcaa39 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -608,6 +608,14 @@ static int vlan_dev_init(struct net_device *dev)
 	return 0;
 }
 
+static void vlan_destructor_free(struct net_device *dev)
+{
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+	free_percpu(vlan->vlan_pcpu_stats);
+	vlan->vlan_pcpu_stats = NULL;
+}
+
 static void vlan_dev_uninit(struct net_device *dev)
 {
 	struct vlan_priority_tci_mapping *pm;
@@ -620,6 +628,10 @@ static void vlan_dev_uninit(struct net_device *dev)
 			kfree(pm);
 		}
 	}
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		vlan_destructor_free(dev);
 }
 
 static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
@@ -803,10 +815,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
 
 static void vlan_dev_free(struct net_device *dev)
 {
-	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-
-	free_percpu(vlan->vlan_pcpu_stats);
-	vlan->vlan_pcpu_stats = NULL;
+	vlan_destructor_free(dev);
 	free_netdev(dev);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net v4 10/12] net: sit: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:39 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The ipip6 allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func ipip6_destructor_free to free the mem in
the destructor, and ndo_uninit func also invokes it when fail to
register the ipip6 device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/ipv6/sit.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 99853c6..28c1649 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -464,6 +464,14 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
 	return ok;
 }
 
+static void ipip6_destructor_free(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	dst_cache_destroy(&tunnel->dst_cache);
+	free_percpu(dev->tstats);
+}
+
 static void ipip6_tunnel_uninit(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -477,6 +485,10 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 	}
 	dst_cache_reset(&tunnel->dst_cache);
 	dev_put(dev);
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		ipip6_destructor_free(dev);
 }
 
 static int ipip6_err(struct sk_buff *skb, u32 info)
@@ -1329,10 +1341,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto)
 
 static void ipip6_dev_free(struct net_device *dev)
 {
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-
-	dst_cache_destroy(&tunnel->dst_cache);
-	free_percpu(dev->tstats);
+	ipip6_destructor_free(dev);
 	free_netdev(dev);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net v4 09/12] net: ip_tunnel: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:37 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The ip_tunnel allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func ip_tunnel_destructor_free to free the mem in
the destructor, and ndo_uninit func also invokes it when fail to
register the ip_tunnel device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/ipv4/ip_tunnel.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 823abae..96a3005 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -954,13 +954,18 @@ int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 
-static void ip_tunnel_dev_free(struct net_device *dev)
+static void ip_tunnel_destructor_free(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	gro_cells_destroy(&tunnel->gro_cells);
 	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
+}
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+	ip_tunnel_destructor_free(dev);
 	free_netdev(dev);
 }
 
@@ -1192,6 +1197,10 @@ void ip_tunnel_uninit(struct net_device *dev)
 		ip_tunnel_del(itn, netdev_priv(dev));
 
 	dst_cache_reset(&tunnel->dst_cache);
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		ip_tunnel_destructor_free(dev);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net v4 08/12] net: ip6_vti: Fix one possbile memleak when fail to register_netdevice
From: gfree.wind @ 2017-05-02 13:34 UTC (permalink / raw)
  To: davem, jiri, mareklindner, sw, a, kuznet, jmorris, yoshfuji,
	kaber, steffen.klassert, herbert, netdev
  Cc: Gao Feng
In-Reply-To: <cover.1493699451.git.gfree.wind@foxmail.com>

From: Gao Feng <gfree.wind@foxmail.com>

The ip6_vti allocates some resources in its ndo_init func, and
free some of them in its destructor func. Then there is one memleak
that some errors happen after register_netdevice invokes the ndo_init
callback. Because only the ndo_uninit callback is invoked in the error
handler of register_netdevice, but destructor not.

Now create one new func vti6_destructor_free to free the mem in
the destructor, and ndo_uninit func also invokes it when fail to
register the vti6 device.

It's not only free all resources, but also follow the original desgin
that the resources are freed in the destructor normally after
register the device successfully.

Signed-off-by: Gao Feng <gfree.wind@foxmail.com>
---
 net/ipv6/ip6_vti.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 3d8a3b6..3b3f49a 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -177,9 +177,14 @@ struct vti6_net {
 	}
 }
 
-static void vti6_dev_free(struct net_device *dev)
+static void vti6_destructor_free(struct net_device *dev)
 {
 	free_percpu(dev->tstats);
+}
+
+static void vti6_dev_free(struct net_device *dev)
+{
+	vti6_destructor_free(dev);
 	free_netdev(dev);
 }
 
@@ -296,6 +301,10 @@ static void vti6_dev_uninit(struct net_device *dev)
 	else
 		vti6_tnl_unlink(ip6n, t);
 	dev_put(dev);
+
+	/* dev is not registered, perform the free instead of destructor */
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		vti6_destructor_free(dev);
 }
 
 static int vti6_rcv(struct sk_buff *skb)
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 0/3] net-SCTP: Fine-tuning for six function implementations
From: Neil Horman @ 2017-05-02 13:15 UTC (permalink / raw)
  To: SF Markus Elfring
  Cc: linux-sctp, netdev, David S. Miller, Vlad Yasevich, LKML,
	kernel-janitors
In-Reply-To: <ea9206ca-2a2f-095e-d764-3f819610d5e8@users.sourceforge.net>

On Mon, May 01, 2017 at 03:30:25PM +0200, SF Markus Elfring wrote:
> From: Markus Elfring <elfring@users.sourceforge.net>
> Date: Mon, 1 May 2017 15:25:05 +0200
> 
> A few update suggestions were taken into account
> from static source code analysis.
> 
> Markus Elfring (3):
>   Replace six seq_printf() calls by seq_putc()
>   Combine two seq_printf() calls into one call in sctp_remaddr_seq_show()
>   Replace four seq_printf() calls by seq_puts()
> 
>  net/sctp/proc.c | 39 ++++++++++++++++-----------------------
>  1 file changed, 16 insertions(+), 23 deletions(-)
> 
> -- 
> 2.12.2
> 
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>

^ permalink raw reply

* [PATCH 6/9] net: thunderx: Add support for XDP_DROP
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Adds support for XDP_DROP.
Also since in XDP mode there is just a single buffer per page,
made changes to recycle DMA mapping info as well along with pages.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 23 ++++++-
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 77 ++++++++++++++++------
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |  4 +-
 3 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 9c48873..a58cc1e 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -18,6 +18,7 @@
 #include <linux/irq.h>
 #include <linux/iommu.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/filter.h>
 
 #include "nic_reg.h"
@@ -505,6 +506,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic,
 				struct cqe_rx_t *cqe_rx)
 {
 	struct xdp_buff xdp;
+	struct page *page;
 	u32 action;
 	u16 len;
 	u64 dma_addr, cpu_addr;
@@ -527,12 +529,27 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic,
 	switch (action) {
 	case XDP_PASS:
 	case XDP_TX:
-	case XDP_ABORTED:
-	case XDP_DROP:
 		/* Pass on all packets to network stack */
 		return false;
 	default:
 		bpf_warn_invalid_xdp_action(action);
+	case XDP_ABORTED:
+		trace_xdp_exception(nic->netdev, prog, action);
+	case XDP_DROP:
+		page = virt_to_page(xdp.data);
+		/* Check if it's a recycled page, if not
+		 * unmap the DMA mapping.
+		 *
+		 * Recycled page holds an extra reference.
+		 */
+		if (page_ref_count(page) == 1) {
+			dma_addr &= PAGE_MASK;
+			dma_unmap_page_attrs(&nic->pdev->dev, dma_addr,
+					     RCV_FRAG_LEN, DMA_FROM_DEVICE,
+					     DMA_ATTR_SKIP_CPU_SYNC);
+		}
+		put_page(page);
+		return true;
 	}
 	return false;
 }
@@ -645,7 +662,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx))
 			return;
 
-	skb = nicvf_get_rcv_skb(snic, cqe_rx);
+	skb = nicvf_get_rcv_skb(snic, cqe_rx, nic->xdp_prog ? true : false);
 	if (!skb) {
 		netdev_dbg(nic->netdev, "Packet not received\n");
 		return;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 8c3c571..5009f49 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -117,6 +117,7 @@ static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
 
 		/* Save the page in page cache */
 		pgcache->page = page;
+		pgcache->dma_addr = 0;
 		rbdr->pgalloc++;
 	}
 
@@ -144,7 +145,7 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 	/* Check if request can be accomodated in previous allocated page.
 	 * But in XDP mode only one buffer per page is permitted.
 	 */
-	if (!nic->pnicvf->xdp_prog && nic->rb_page &&
+	if (!rbdr->is_xdp && nic->rb_page &&
 	    ((nic->rb_page_offset + buf_len) <= PAGE_SIZE)) {
 		nic->rb_pageref++;
 		goto ret;
@@ -165,18 +166,24 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 	if (pgcache)
 		nic->rb_page = pgcache->page;
 ret:
-	/* HW will ensure data coherency, CPU sync not required */
-	*rbuf = (u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
-					nic->rb_page_offset, buf_len,
-					DMA_FROM_DEVICE,
-					DMA_ATTR_SKIP_CPU_SYNC);
-	if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) {
-		if (!nic->rb_page_offset)
-			__free_pages(nic->rb_page, 0);
-		nic->rb_page = NULL;
-		return -ENOMEM;
+	if (rbdr->is_xdp && pgcache && pgcache->dma_addr) {
+		*rbuf = pgcache->dma_addr;
+	} else {
+		/* HW will ensure data coherency, CPU sync not required */
+		*rbuf = (u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
+						nic->rb_page_offset, buf_len,
+						DMA_FROM_DEVICE,
+						DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) {
+			if (!nic->rb_page_offset)
+				__free_pages(nic->rb_page, 0);
+			nic->rb_page = NULL;
+			return -ENOMEM;
+		}
+		if (pgcache)
+			pgcache->dma_addr = *rbuf;
+		nic->rb_page_offset += buf_len;
 	}
-	nic->rb_page_offset += buf_len;
 
 	return 0;
 }
@@ -230,8 +237,16 @@ static int  nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
 	 * On embedded platforms i.e 81xx/83xx available memory itself
 	 * is low and minimum ring size of RBDR is 8K, that takes away
 	 * lots of memory.
+	 *
+	 * But for XDP it has to be a single buffer per page.
 	 */
-	rbdr->pgcnt = ring_len / (PAGE_SIZE / buf_size);
+	if (!nic->pnicvf->xdp_prog) {
+		rbdr->pgcnt = ring_len / (PAGE_SIZE / buf_size);
+		rbdr->is_xdp = false;
+	} else {
+		rbdr->pgcnt = ring_len;
+		rbdr->is_xdp = true;
+	}
 	rbdr->pgcnt = roundup_pow_of_two(rbdr->pgcnt);
 	rbdr->pgcache = kzalloc(sizeof(*rbdr->pgcache) *
 				rbdr->pgcnt, GFP_KERNEL);
@@ -1454,8 +1469,31 @@ static inline unsigned frag_num(unsigned i)
 #endif
 }
 
+static void nicvf_unmap_rcv_buffer(struct nicvf *nic, u64 dma_addr,
+				   u64 buf_addr, bool xdp)
+{
+	struct page *page = NULL;
+	int len = RCV_FRAG_LEN;
+
+	if (xdp) {
+		page = virt_to_page(phys_to_virt(buf_addr));
+		/* Check if it's a recycled page, if not
+		 * unmap the DMA mapping.
+		 *
+		 * Recycled page holds an extra reference.
+		 */
+		if (page_ref_count(page) != 1)
+			return;
+		/* Receive buffers in XDP mode are mapped from page start */
+		dma_addr &= PAGE_MASK;
+	}
+	dma_unmap_page_attrs(&nic->pdev->dev, dma_addr, len,
+			     DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+}
+
 /* Returns SKB for a received packet */
-struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
+struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic,
+				  struct cqe_rx_t *cqe_rx, bool xdp)
 {
 	int frag;
 	int payload_len = 0;
@@ -1490,10 +1528,9 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
 
 		if (!frag) {
 			/* First fragment */
-			dma_unmap_page_attrs(&nic->pdev->dev,
-					     *rb_ptrs - cqe_rx->align_pad,
-					     RCV_FRAG_LEN, DMA_FROM_DEVICE,
-					     DMA_ATTR_SKIP_CPU_SYNC);
+			nicvf_unmap_rcv_buffer(nic,
+					       *rb_ptrs - cqe_rx->align_pad,
+					       phys_addr, xdp);
 			skb = nicvf_rb_ptr_to_skb(nic,
 						  phys_addr - cqe_rx->align_pad,
 						  payload_len);
@@ -1503,9 +1540,7 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
 			skb_put(skb, payload_len);
 		} else {
 			/* Add fragments */
-			dma_unmap_page_attrs(&nic->pdev->dev, *rb_ptrs,
-					     RCV_FRAG_LEN, DMA_FROM_DEVICE,
-					     DMA_ATTR_SKIP_CPU_SYNC);
+			nicvf_unmap_rcv_buffer(nic, *rb_ptrs, phys_addr, xdp);
 			page = virt_to_page(phys_to_virt(phys_addr));
 			offset = phys_to_virt(phys_addr) - page_address(page);
 			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 07136a2..db04c0e 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -228,6 +228,7 @@ struct rbdr {
 	u32		head;
 	u32		tail;
 	struct q_desc_mem   dmem;
+	bool		is_xdp;
 
 	/* For page recycling */
 	int		pgidx;
@@ -339,7 +340,8 @@ void nicvf_sq_free_used_descs(struct net_device *netdev,
 int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq,
 			struct sk_buff *skb, u8 sq_num);
 
-struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx);
+struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic,
+				  struct cqe_rx_t *cqe_rx, bool xdp);
 void nicvf_rbdr_task(unsigned long data);
 void nicvf_rbdr_work(struct work_struct *work);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 3/9] net: thunderx: Optimize CQE_TX handling
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Optimized CQE handling with below changes
- Feeing descriptors back to SQ in bulk i.e once per NAPI
  instance instead for every CQE_TX, this will reduce number
  of atomic updates to 'sq->free_cnt'.
- Checking errors in CQE_TX and CQE_RX before calling appropriate
  fn()s to update error stats i.e reduce branching.

Also removed debug messages in packet handling path which otherwise
causes issues if DEBUG is enabled.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 44 +++++++++++-----------
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  5 ---
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 81a2fcb..0d79894 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -498,7 +498,7 @@ static int nicvf_init_resources(struct nicvf *nic)
 
 static void nicvf_snd_pkt_handler(struct net_device *netdev,
 				  struct cqe_send_t *cqe_tx,
-				  int cqe_type, int budget,
+				  int budget, int *subdesc_cnt,
 				  unsigned int *tx_pkts, unsigned int *tx_bytes)
 {
 	struct sk_buff *skb = NULL;
@@ -513,12 +513,10 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
 	if (hdr->subdesc_type != SQ_DESC_TYPE_HEADER)
 		return;
 
-	netdev_dbg(nic->netdev,
-		   "%s Qset #%d SQ #%d SQ ptr #%d subdesc count %d\n",
-		   __func__, cqe_tx->sq_qs, cqe_tx->sq_idx,
-		   cqe_tx->sqe_ptr, hdr->subdesc_cnt);
+	/* Check for errors */
+	if (cqe_tx->send_status)
+		nicvf_check_cqe_tx_errs(nic->pnicvf, cqe_tx);
 
-	nicvf_check_cqe_tx_errs(nic, cqe_tx);
 	skb = (struct sk_buff *)sq->skbuff[cqe_tx->sqe_ptr];
 	if (skb) {
 		/* Check for dummy descriptor used for HW TSO offload on 88xx */
@@ -528,12 +526,12 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
 			 (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, hdr->rsvd2);
 			nicvf_unmap_sndq_buffers(nic, sq, hdr->rsvd2,
 						 tso_sqe->subdesc_cnt);
-			nicvf_put_sq_desc(sq, tso_sqe->subdesc_cnt + 1);
+			*subdesc_cnt += tso_sqe->subdesc_cnt + 1;
 		} else {
 			nicvf_unmap_sndq_buffers(nic, sq, cqe_tx->sqe_ptr,
 						 hdr->subdesc_cnt);
 		}
-		nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
+		*subdesc_cnt += hdr->subdesc_cnt + 1;
 		prefetch(skb);
 		(*tx_pkts)++;
 		*tx_bytes += skb->len;
@@ -544,7 +542,7 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
 		 * a SKB attached, so just free SQEs here.
 		 */
 		if (!nic->hw_tso)
-			nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
+			*subdesc_cnt += hdr->subdesc_cnt + 1;
 	}
 }
 
@@ -595,9 +593,11 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 	}
 
 	/* Check for errors */
-	err = nicvf_check_cqe_rx_errs(nic, cqe_rx);
-	if (err && !cqe_rx->rb_cnt)
-		return;
+	if (cqe_rx->err_level || cqe_rx->err_opcode) {
+		err = nicvf_check_cqe_rx_errs(nic, cqe_rx);
+		if (err && !cqe_rx->rb_cnt)
+			return;
+	}
 
 	skb = nicvf_get_rcv_skb(snic, cqe_rx);
 	if (!skb) {
@@ -646,6 +646,7 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 {
 	int processed_cqe, work_done = 0, tx_done = 0;
 	int cqe_count, cqe_head;
+	int subdesc_cnt = 0;
 	struct nicvf *nic = netdev_priv(netdev);
 	struct queue_set *qs = nic->qs;
 	struct cmp_queue *cq = &qs->cq[cq_idx];
@@ -667,8 +668,6 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 	cqe_head = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_HEAD, cq_idx) >> 9;
 	cqe_head &= 0xFFFF;
 
-	netdev_dbg(nic->netdev, "%s CQ%d cqe_count %d cqe_head %d\n",
-		   __func__, cq_idx, cqe_count, cqe_head);
 	while (processed_cqe < cqe_count) {
 		/* Get the CQ descriptor */
 		cq_desc = (struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head);
@@ -682,17 +681,15 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 			break;
 		}
 
-		netdev_dbg(nic->netdev, "CQ%d cq_desc->cqe_type %d\n",
-			   cq_idx, cq_desc->cqe_type);
 		switch (cq_desc->cqe_type) {
 		case CQE_TYPE_RX:
 			nicvf_rcv_pkt_handler(netdev, napi, cq_desc);
 			work_done++;
 		break;
 		case CQE_TYPE_SEND:
-			nicvf_snd_pkt_handler(netdev,
-					      (void *)cq_desc, CQE_TYPE_SEND,
-					      budget, &tx_pkts, &tx_bytes);
+			nicvf_snd_pkt_handler(netdev, (void *)cq_desc,
+					      budget, &subdesc_cnt,
+					      &tx_pkts, &tx_bytes);
 			tx_done++;
 		break;
 		case CQE_TYPE_INVALID:
@@ -704,9 +701,6 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 		}
 		processed_cqe++;
 	}
-	netdev_dbg(nic->netdev,
-		   "%s CQ%d processed_cqe %d work_done %d budget %d\n",
-		   __func__, cq_idx, processed_cqe, work_done, budget);
 
 	/* Ring doorbell to inform H/W to reuse processed CQEs */
 	nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_DOOR,
@@ -716,8 +710,12 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 		goto loop;
 
 done:
-	/* Wakeup TXQ if its stopped earlier due to SQ full */
 	sq = &nic->qs->sq[cq_idx];
+	/* Update SQ's descriptor free count */
+	if (subdesc_cnt)
+		nicvf_put_sq_desc(sq, subdesc_cnt);
+
+	/* Wakeup TXQ if its stopped earlier due to SQ full */
 	if (tx_done ||
 	    (atomic_read(&sq->free_cnt) >= MIN_SQ_DESC_PER_PKT_XMIT)) {
 		netdev = nic->pnicvf->netdev;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index dfc85a1..90c5bc7d 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -1640,9 +1640,6 @@ void nicvf_update_sq_stats(struct nicvf *nic, int sq_idx)
 /* Check for errors in the receive cmp.queue entry */
 int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
 {
-	if (!cqe_rx->err_level && !cqe_rx->err_opcode)
-		return 0;
-
 	if (netif_msg_rx_err(nic))
 		netdev_err(nic->netdev,
 			   "%s: RX error CQE err_level 0x%x err_opcode 0x%x\n",
@@ -1731,8 +1728,6 @@ int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
 int nicvf_check_cqe_tx_errs(struct nicvf *nic, struct cqe_send_t *cqe_tx)
 {
 	switch (cqe_tx->send_status) {
-	case CQ_TX_ERROP_GOOD:
-		return 0;
 	case CQ_TX_ERROP_DESC_FAULT:
 		this_cpu_inc(nic->drv_stats->tx_desc_fault);
 		break;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 9/9] net: thunderx: Optimize page recycling for XDP
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Driver follows a method of taking one extra reference on the
page for recycling which is fine in usual packet path where
each 64KB page is segmented into multiple receive buffers.

But in XDP mode since there is just one receive buffer per
page taking extra page reference itself becomes big bottleneck
consuming ~50% of CPU cycles due to atomic operations.

This patch adds a internal ref count in pgcache for each
page and additional page references are taken in a batch
instead of just one at a time. Internal i.e 'pgcache->ref_count'
and page's i.e 'page->_refcount' counters are compared to check
page's recyclability.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 57 +++++++++++++++++++---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |  1 +
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 43428ce..2b18176 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -82,6 +82,8 @@ static void nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem)
 	dmem->base = NULL;
 }
 
+#define XDP_PAGE_REFCNT_REFILL 256
+
 /* Allocate a new page or recycle one if possible
  *
  * We cannot optimize dma mapping here, since
@@ -90,9 +92,10 @@ static void nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem)
  *    and not idx into RBDR ring, so can't refer to saved info.
  * 3. There are multiple receive buffers per page
  */
-static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
-					struct rbdr *rbdr, gfp_t gfp)
+static inline struct pgcache *nicvf_alloc_page(struct nicvf *nic,
+					       struct rbdr *rbdr, gfp_t gfp)
 {
+	int ref_count;
 	struct page *page = NULL;
 	struct pgcache *pgcache, *next;
 
@@ -100,8 +103,23 @@ static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
 	pgcache = &rbdr->pgcache[rbdr->pgidx];
 	page = pgcache->page;
 	/* Check if page can be recycled */
-	if (page && (page_ref_count(page) != 1))
-		page = NULL;
+	if (page) {
+		ref_count = page_ref_count(page);
+		/* Check if this page has been used once i.e 'put_page'
+		 * called after packet transmission i.e internal ref_count
+		 * and page's ref_count are equal i.e page can be recycled.
+		 */
+		if (rbdr->is_xdp && (ref_count == pgcache->ref_count))
+			pgcache->ref_count--;
+		else
+			page = NULL;
+
+		/* In non-XDP mode, page's ref_count needs to be '1' for it
+		 * to be recycled.
+		 */
+		if (!rbdr->is_xdp && (ref_count != 1))
+			page = NULL;
+	}
 
 	if (!page) {
 		page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN, 0);
@@ -120,11 +138,30 @@ static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
 		/* Save the page in page cache */
 		pgcache->page = page;
 		pgcache->dma_addr = 0;
+		pgcache->ref_count = 0;
 		rbdr->pgalloc++;
 	}
 
-	/* Take extra page reference for recycling */
-	page_ref_add(page, 1);
+	/* Take additional page references for recycling */
+	if (rbdr->is_xdp) {
+		/* Since there is single RBDR (i.e single core doing
+		 * page recycling) per 8 Rx queues, in XDP mode adjusting
+		 * page references atomically is the biggest bottleneck, so
+		 * take bunch of references at a time.
+		 *
+		 * So here, below reference counts defer by '1'.
+		 */
+		if (!pgcache->ref_count) {
+			pgcache->ref_count = XDP_PAGE_REFCNT_REFILL;
+			page_ref_add(page, XDP_PAGE_REFCNT_REFILL);
+		}
+	} else {
+		/* In non-XDP case, single 64K page is divided across multiple
+		 * receive buffers, so cost of recycling is less anyway.
+		 * So we can do with just one extra reference.
+		 */
+		page_ref_add(page, 1);
+	}
 
 	rbdr->pgidx++;
 	rbdr->pgidx &= (rbdr->pgcnt - 1);
@@ -327,8 +364,14 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
 	head = 0;
 	while (head < rbdr->pgcnt) {
 		pgcache = &rbdr->pgcache[head];
-		if (pgcache->page && page_ref_count(pgcache->page) != 0)
+		if (pgcache->page && page_ref_count(pgcache->page) != 0) {
+			if (!rbdr->is_xdp) {
+				put_page(pgcache->page);
+				continue;
+			}
+			page_ref_sub(pgcache->page, pgcache->ref_count - 1);
 			put_page(pgcache->page);
+		}
 		head++;
 	}
 
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index a07d5b4..5785852 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -216,6 +216,7 @@ struct q_desc_mem {
 
 struct pgcache {
 	struct page	*page;
+	int		ref_count;
 	u64		dma_addr;
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 8/9] net: thunderx: Support for XDP header adjustment
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

When in XDP mode reserve XDP_PACKET_HEADROOM bytes at the start
of receive buffer for XDP program to modify headers and adjust
packet start. Additional code changes done to handle such packets.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 63 ++++++++++++++++------
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  9 +++-
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index bb13dee..d6477af 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -502,13 +502,15 @@ static int nicvf_init_resources(struct nicvf *nic)
 }
 
 static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
-				struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
+				struct cqe_rx_t *cqe_rx, struct snd_queue *sq,
+				struct sk_buff **skb)
 {
 	struct xdp_buff xdp;
 	struct page *page;
 	u32 action;
-	u16 len;
+	u16 len, offset = 0;
 	u64 dma_addr, cpu_addr;
+	void *orig_data;
 
 	/* Retrieve packet buffer's DMA address and length */
 	len = *((u16 *)((void *)cqe_rx + (3 * sizeof(u64))));
@@ -517,17 +519,47 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	cpu_addr = nicvf_iova_to_phys(nic, dma_addr);
 	if (!cpu_addr)
 		return false;
+	cpu_addr = (u64)phys_to_virt(cpu_addr);
+	page = virt_to_page((void *)cpu_addr);
 
-	xdp.data = phys_to_virt(cpu_addr);
+	xdp.data_hard_start = page_address(page);
+	xdp.data = (void *)cpu_addr;
 	xdp.data_end = xdp.data + len;
+	orig_data = xdp.data;
 
 	rcu_read_lock();
 	action = bpf_prog_run_xdp(prog, &xdp);
 	rcu_read_unlock();
 
+	/* Check if XDP program has changed headers */
+	if (orig_data != xdp.data) {
+		len = xdp.data_end - xdp.data;
+		offset = orig_data - xdp.data;
+		dma_addr -= offset;
+	}
+
 	switch (action) {
 	case XDP_PASS:
-		/* Pass on packet to network stack */
+		/* Check if it's a recycled page, if not
+		 * unmap the DMA mapping.
+		 *
+		 * Recycled page holds an extra reference.
+		 */
+		if (page_ref_count(page) == 1) {
+			dma_addr &= PAGE_MASK;
+			dma_unmap_page_attrs(&nic->pdev->dev, dma_addr,
+					     RCV_FRAG_LEN + XDP_PACKET_HEADROOM,
+					     DMA_FROM_DEVICE,
+					     DMA_ATTR_SKIP_CPU_SYNC);
+		}
+
+		/* Build SKB and pass on packet to network stack */
+		*skb = build_skb(xdp.data,
+				 RCV_FRAG_LEN - cqe_rx->align_pad + offset);
+		if (!*skb)
+			put_page(page);
+		else
+			skb_put(*skb, len);
 		return false;
 	case XDP_TX:
 		nicvf_xdp_sq_append_pkt(nic, sq, (u64)xdp.data, dma_addr, len);
@@ -537,7 +569,6 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	case XDP_ABORTED:
 		trace_xdp_exception(nic->netdev, prog, action);
 	case XDP_DROP:
-		page = virt_to_page(xdp.data);
 		/* Check if it's a recycled page, if not
 		 * unmap the DMA mapping.
 		 *
@@ -546,7 +577,8 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 		if (page_ref_count(page) == 1) {
 			dma_addr &= PAGE_MASK;
 			dma_unmap_page_attrs(&nic->pdev->dev, dma_addr,
-					     RCV_FRAG_LEN, DMA_FROM_DEVICE,
+					     RCV_FRAG_LEN + XDP_PACKET_HEADROOM,
+					     DMA_FROM_DEVICE,
 					     DMA_ATTR_SKIP_CPU_SYNC);
 		}
 		put_page(page);
@@ -654,7 +686,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 				  struct napi_struct *napi,
 				  struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct nicvf *nic = netdev_priv(netdev);
 	struct nicvf *snic = nic;
 	int err = 0;
@@ -676,15 +708,17 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 	}
 
 	/* For XDP, ignore pkts spanning multiple pages */
-	if (nic->xdp_prog && (cqe_rx->rb_cnt == 1))
-		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq))
+	if (nic->xdp_prog && (cqe_rx->rb_cnt == 1)) {
+		/* Packet consumed by XDP */
+		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq, &skb))
 			return;
+	} else {
+		skb = nicvf_get_rcv_skb(snic, cqe_rx,
+					nic->xdp_prog ? true : false);
+	}
 
-	skb = nicvf_get_rcv_skb(snic, cqe_rx, nic->xdp_prog ? true : false);
-	if (!skb) {
-		netdev_dbg(nic->netdev, "Packet not received\n");
+	if (!skb)
 		return;
-	}
 
 	if (netif_msg_pktdata(nic)) {
 		netdev_info(nic->netdev, "%s: skb 0x%p, len=%d\n", netdev->name,
@@ -1672,9 +1706,6 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog)
 		return -EOPNOTSUPP;
 	}
 
-	if (prog && prog->xdp_adjust_head)
-		return -EOPNOTSUPP;
-
 	/* ALL SQs attached to CQs i.e same as RQs, are treated as
 	 * XDP Tx queues and more Tx queues are allocated for
 	 * network stack to send pkts out.
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index ec234b6..43428ce 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -164,6 +164,11 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 	}
 
 	nic->rb_page_offset = 0;
+
+	/* Reserve space for header modifications by BPF program */
+	if (rbdr->is_xdp)
+		buf_len += XDP_PACKET_HEADROOM;
+
 	/* Check if it's recycled */
 	if (pgcache)
 		nic->rb_page = pgcache->page;
@@ -183,7 +188,7 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 			return -ENOMEM;
 		}
 		if (pgcache)
-			pgcache->dma_addr = *rbuf;
+			pgcache->dma_addr = *rbuf + XDP_PACKET_HEADROOM;
 		nic->rb_page_offset += buf_len;
 	}
 
@@ -1575,6 +1580,8 @@ static void nicvf_unmap_rcv_buffer(struct nicvf *nic, u64 dma_addr,
 		 */
 		if (page_ref_count(page) != 1)
 			return;
+
+		len += XDP_PACKET_HEADROOM;
 		/* Receive buffers in XDP mode are mapped from page start */
 		dma_addr &= PAGE_MASK;
 	}
-- 
2.7.4

^ permalink raw reply related

* [PATCH 7/9] net: thunderx: Add support for XDP_TX
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Adds support for XDP_TX i.e transmits packet out of
the XDP TX queue mapped to the corresponding Rx queue
on which packet is received.

Since SQ for XDP TX will be used only on a single cpu i.e
SQ description creation and freeing, using atomic free count
is not necessary and will become a bottleneck. Hence added
a separate 'xdp_free_cnt' used for SQs designated for XDP
to track descriptor free count.

Changes also include
- A new entry 'xdp_page' is added to save transmitted packet's
  page pointer for later cleanup.
- XDP Tx SQ's doorbell is ringed once per NAPI instance.
- Retrieving designated SQ for packets being sent out by stack
  via 'nicvf_xmit'.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |  63 ++++++++---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 117 ++++++++++++++++++---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |   7 ++
 3 files changed, 160 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index a58cc1e..bb13dee 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -501,9 +501,8 @@ static int nicvf_init_resources(struct nicvf *nic)
 	return 0;
 }
 
-static inline bool nicvf_xdp_rx(struct nicvf *nic,
-				struct bpf_prog *prog,
-				struct cqe_rx_t *cqe_rx)
+static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
+				struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
 {
 	struct xdp_buff xdp;
 	struct page *page;
@@ -528,9 +527,11 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic,
 
 	switch (action) {
 	case XDP_PASS:
-	case XDP_TX:
-		/* Pass on all packets to network stack */
+		/* Pass on packet to network stack */
 		return false;
+	case XDP_TX:
+		nicvf_xdp_sq_append_pkt(nic, sq, (u64)xdp.data, dma_addr, len);
+		return true;
 	default:
 		bpf_warn_invalid_xdp_action(action);
 	case XDP_ABORTED:
@@ -560,6 +561,7 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
 				  unsigned int *tx_pkts, unsigned int *tx_bytes)
 {
 	struct sk_buff *skb = NULL;
+	struct page *page;
 	struct nicvf *nic = netdev_priv(netdev);
 	struct snd_queue *sq;
 	struct sq_hdr_subdesc *hdr;
@@ -575,6 +577,22 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
 	if (cqe_tx->send_status)
 		nicvf_check_cqe_tx_errs(nic->pnicvf, cqe_tx);
 
+	/* Is this a XDP designated Tx queue */
+	if (sq->is_xdp) {
+		page = (struct page *)sq->xdp_page[cqe_tx->sqe_ptr];
+		/* Check if it's recycled page or else unmap DMA mapping */
+		if (page && (page_ref_count(page) == 1))
+			nicvf_unmap_sndq_buffers(nic, sq, cqe_tx->sqe_ptr,
+						 hdr->subdesc_cnt);
+
+		/* Release page reference for recycling */
+		if (page)
+			put_page(page);
+		sq->xdp_page[cqe_tx->sqe_ptr] = (u64)NULL;
+		*subdesc_cnt += hdr->subdesc_cnt + 1;
+		return;
+	}
+
 	skb = (struct sk_buff *)sq->skbuff[cqe_tx->sqe_ptr];
 	if (skb) {
 		/* Check for dummy descriptor used for HW TSO offload on 88xx */
@@ -634,7 +652,7 @@ static inline void nicvf_set_rxhash(struct net_device *netdev,
 
 static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 				  struct napi_struct *napi,
-				  struct cqe_rx_t *cqe_rx)
+				  struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
 {
 	struct sk_buff *skb;
 	struct nicvf *nic = netdev_priv(netdev);
@@ -659,7 +677,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 
 	/* For XDP, ignore pkts spanning multiple pages */
 	if (nic->xdp_prog && (cqe_rx->rb_cnt == 1))
-		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx))
+		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq))
 			return;
 
 	skb = nicvf_get_rcv_skb(snic, cqe_rx, nic->xdp_prog ? true : false);
@@ -715,8 +733,8 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 	struct cmp_queue *cq = &qs->cq[cq_idx];
 	struct cqe_rx_t *cq_desc;
 	struct netdev_queue *txq;
-	struct snd_queue *sq;
-	unsigned int tx_pkts = 0, tx_bytes = 0;
+	struct snd_queue *sq = &qs->sq[cq_idx];
+	unsigned int tx_pkts = 0, tx_bytes = 0, txq_idx;
 
 	spin_lock_bh(&cq->lock);
 loop:
@@ -746,7 +764,7 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 
 		switch (cq_desc->cqe_type) {
 		case CQE_TYPE_RX:
-			nicvf_rcv_pkt_handler(netdev, napi, cq_desc);
+			nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq);
 			work_done++;
 		break;
 		case CQE_TYPE_SEND:
@@ -773,17 +791,26 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 		goto loop;
 
 done:
-	sq = &nic->qs->sq[cq_idx];
 	/* Update SQ's descriptor free count */
 	if (subdesc_cnt)
 		nicvf_put_sq_desc(sq, subdesc_cnt);
 
+	txq_idx = nicvf_netdev_qidx(nic, cq_idx);
+	/* Handle XDP TX queues */
+	if (nic->pnicvf->xdp_prog) {
+		if (txq_idx < nic->pnicvf->xdp_tx_queues) {
+			nicvf_xdp_sq_doorbell(nic, sq, cq_idx);
+			goto out;
+		}
+		nic = nic->pnicvf;
+		txq_idx -= nic->pnicvf->xdp_tx_queues;
+	}
+
 	/* Wakeup TXQ if its stopped earlier due to SQ full */
 	if (tx_done ||
 	    (atomic_read(&sq->free_cnt) >= MIN_SQ_DESC_PER_PKT_XMIT)) {
 		netdev = nic->pnicvf->netdev;
-		txq = netdev_get_tx_queue(netdev,
-					  nicvf_netdev_qidx(nic, cq_idx));
+		txq = netdev_get_tx_queue(netdev, txq_idx);
 		if (tx_pkts)
 			netdev_tx_completed_queue(txq, tx_pkts, tx_bytes);
 
@@ -796,10 +823,11 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 			if (netif_msg_tx_err(nic))
 				netdev_warn(netdev,
 					    "%s: Transmit queue wakeup SQ%d\n",
-					    netdev->name, cq_idx);
+					    netdev->name, txq_idx);
 		}
 	}
 
+out:
 	spin_unlock_bh(&cq->lock);
 	return work_done;
 }
@@ -1115,6 +1143,13 @@ static netdev_tx_t nicvf_xmit(struct sk_buff *skb, struct net_device *netdev)
 		return NETDEV_TX_OK;
 	}
 
+	/* In XDP case, initial HW tx queues are used for XDP,
+	 * but stack's queue mapping starts at '0', so skip the
+	 * Tx queues attached to Rx queues for XDP.
+	 */
+	if (nic->xdp_prog)
+		qid += nic->xdp_tx_queues;
+
 	snic = nic;
 	/* Get secondary Qset's SQ structure */
 	if (qid >= MAX_SND_QUEUES_PER_QS) {
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 5009f49..ec234b6 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -19,6 +19,8 @@
 #include "q_struct.h"
 #include "nicvf_queues.h"
 
+static inline void nicvf_sq_add_gather_subdesc(struct snd_queue *sq, int qentry,
+					       int size, u64 data);
 static void nicvf_get_page(struct nicvf *nic)
 {
 	if (!nic->rb_pageref || !nic->rb_page)
@@ -456,7 +458,7 @@ static void nicvf_free_cmp_queue(struct nicvf *nic, struct cmp_queue *cq)
 
 /* Initialize transmit queue */
 static int nicvf_init_snd_queue(struct nicvf *nic,
-				struct snd_queue *sq, int q_len)
+				struct snd_queue *sq, int q_len, int qidx)
 {
 	int err;
 
@@ -469,17 +471,38 @@ static int nicvf_init_snd_queue(struct nicvf *nic,
 	sq->skbuff = kcalloc(q_len, sizeof(u64), GFP_KERNEL);
 	if (!sq->skbuff)
 		return -ENOMEM;
+
 	sq->head = 0;
 	sq->tail = 0;
-	atomic_set(&sq->free_cnt, q_len - 1);
 	sq->thresh = SND_QUEUE_THRESH;
 
-	/* Preallocate memory for TSO segment's header */
-	sq->tso_hdrs = dma_alloc_coherent(&nic->pdev->dev,
-					  q_len * TSO_HEADER_SIZE,
-					  &sq->tso_hdrs_phys, GFP_KERNEL);
-	if (!sq->tso_hdrs)
-		return -ENOMEM;
+	/* Check if this SQ is a XDP TX queue */
+	if (nic->sqs_mode)
+		qidx += ((nic->sqs_id + 1) * MAX_SND_QUEUES_PER_QS);
+	if (qidx < nic->pnicvf->xdp_tx_queues) {
+		/* Alloc memory to save page pointers for XDP_TX */
+		sq->xdp_page = kcalloc(q_len, sizeof(u64), GFP_KERNEL);
+		if (!sq->xdp_page)
+			return -ENOMEM;
+		sq->xdp_desc_cnt = 0;
+		sq->xdp_free_cnt = q_len - 1;
+		sq->is_xdp = true;
+	} else {
+		sq->xdp_page = NULL;
+		sq->xdp_desc_cnt = 0;
+		sq->xdp_free_cnt = 0;
+		sq->is_xdp = false;
+
+		atomic_set(&sq->free_cnt, q_len - 1);
+
+		/* Preallocate memory for TSO segment's header */
+		sq->tso_hdrs = dma_alloc_coherent(&nic->pdev->dev,
+						  q_len * TSO_HEADER_SIZE,
+						  &sq->tso_hdrs_phys,
+						  GFP_KERNEL);
+		if (!sq->tso_hdrs)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -505,6 +528,7 @@ void nicvf_unmap_sndq_buffers(struct nicvf *nic, struct snd_queue *sq,
 static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq)
 {
 	struct sk_buff *skb;
+	struct page *page;
 	struct sq_hdr_subdesc *hdr;
 	struct sq_hdr_subdesc *tso_sqe;
 
@@ -522,8 +546,15 @@ static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq)
 	smp_rmb();
 	while (sq->head != sq->tail) {
 		skb = (struct sk_buff *)sq->skbuff[sq->head];
-		if (!skb)
+		if (!skb || !sq->xdp_page)
+			goto next;
+
+		page = (struct page *)sq->xdp_page[sq->head];
+		if (!page)
 			goto next;
+		else
+			put_page(page);
+
 		hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, sq->head);
 		/* Check for dummy descriptor used for HW TSO offload on 88xx */
 		if (hdr->dont_send) {
@@ -536,12 +567,14 @@ static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq)
 			nicvf_unmap_sndq_buffers(nic, sq, sq->head,
 						 hdr->subdesc_cnt);
 		}
-		dev_kfree_skb_any(skb);
+		if (skb)
+			dev_kfree_skb_any(skb);
 next:
 		sq->head++;
 		sq->head &= (sq->dmem.q_len - 1);
 	}
 	kfree(sq->skbuff);
+	kfree(sq->xdp_page);
 	nicvf_free_q_desc_mem(nic, &sq->dmem);
 }
 
@@ -932,7 +965,7 @@ static int nicvf_alloc_resources(struct nicvf *nic)
 
 	/* Alloc send queue */
 	for (qidx = 0; qidx < qs->sq_cnt; qidx++) {
-		if (nicvf_init_snd_queue(nic, &qs->sq[qidx], qs->sq_len))
+		if (nicvf_init_snd_queue(nic, &qs->sq[qidx], qs->sq_len, qidx))
 			goto alloc_fail;
 	}
 
@@ -1035,7 +1068,10 @@ static inline int nicvf_get_sq_desc(struct snd_queue *sq, int desc_cnt)
 	int qentry;
 
 	qentry = sq->tail;
-	atomic_sub(desc_cnt, &sq->free_cnt);
+	if (!sq->is_xdp)
+		atomic_sub(desc_cnt, &sq->free_cnt);
+	else
+		sq->xdp_free_cnt -= desc_cnt;
 	sq->tail += desc_cnt;
 	sq->tail &= (sq->dmem.q_len - 1);
 
@@ -1053,7 +1089,10 @@ static inline void nicvf_rollback_sq_desc(struct snd_queue *sq,
 /* Free descriptor back to SQ for future use */
 void nicvf_put_sq_desc(struct snd_queue *sq, int desc_cnt)
 {
-	atomic_add(desc_cnt, &sq->free_cnt);
+	if (!sq->is_xdp)
+		atomic_add(desc_cnt, &sq->free_cnt);
+	else
+		sq->xdp_free_cnt += desc_cnt;
 	sq->head += desc_cnt;
 	sq->head &= (sq->dmem.q_len - 1);
 }
@@ -1111,6 +1150,58 @@ void nicvf_sq_free_used_descs(struct net_device *netdev, struct snd_queue *sq,
 	}
 }
 
+/* XDP Transmit APIs */
+void nicvf_xdp_sq_doorbell(struct nicvf *nic,
+			   struct snd_queue *sq, int sq_num)
+{
+	if (!sq->xdp_desc_cnt)
+		return;
+
+	/* make sure all memory stores are done before ringing doorbell */
+	wmb();
+
+	/* Inform HW to xmit all TSO segments */
+	nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_DOOR,
+			      sq_num, sq->xdp_desc_cnt);
+	sq->xdp_desc_cnt = 0;
+}
+
+static inline void
+nicvf_xdp_sq_add_hdr_subdesc(struct snd_queue *sq, int qentry,
+			     int subdesc_cnt, u64 data, int len)
+{
+	struct sq_hdr_subdesc *hdr;
+
+	hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, qentry);
+	memset(hdr, 0, SND_QUEUE_DESC_SIZE);
+	hdr->subdesc_type = SQ_DESC_TYPE_HEADER;
+	hdr->subdesc_cnt = subdesc_cnt;
+	hdr->tot_len = len;
+	hdr->post_cqe = 1;
+	sq->xdp_page[qentry] = (u64)virt_to_page((void *)data);
+}
+
+int nicvf_xdp_sq_append_pkt(struct nicvf *nic, struct snd_queue *sq,
+			    u64 bufaddr, u64 dma_addr, u16 len)
+{
+	int subdesc_cnt = MIN_SQ_DESC_PER_PKT_XMIT;
+	int qentry;
+
+	if (subdesc_cnt > sq->xdp_free_cnt)
+		return 0;
+
+	qentry = nicvf_get_sq_desc(sq, subdesc_cnt);
+
+	nicvf_xdp_sq_add_hdr_subdesc(sq, qentry, subdesc_cnt - 1, bufaddr, len);
+
+	qentry = nicvf_get_nxt_sqentry(sq, qentry);
+	nicvf_sq_add_gather_subdesc(sq, qentry, len, dma_addr);
+
+	sq->xdp_desc_cnt += subdesc_cnt;
+
+	return 1;
+}
+
 /* Calculate no of SQ subdescriptors needed to transmit all
  * segments of this TSO packet.
  * Taken from 'Tilera network driver' with a minor modification.
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index db04c0e..a07d5b4 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -271,6 +271,10 @@ struct snd_queue {
 	u32		tail;
 	u64		*skbuff;
 	void		*desc;
+	u64		*xdp_page;
+	u16		xdp_desc_cnt;
+	u16		xdp_free_cnt;
+	bool		is_xdp;
 
 #define	TSO_HEADER_SIZE	128
 	/* For TSO segment's header */
@@ -339,6 +343,9 @@ void nicvf_sq_free_used_descs(struct net_device *netdev,
 			      struct snd_queue *sq, int qidx);
 int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq,
 			struct sk_buff *skb, u8 sq_num);
+int nicvf_xdp_sq_append_pkt(struct nicvf *nic, struct snd_queue *sq,
+			    u64 bufaddr, u64 dma_addr, u16 len);
+void nicvf_xdp_sq_doorbell(struct nicvf *nic, struct snd_queue *sq, int sq_num);
 
 struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic,
 				  struct cqe_rx_t *cqe_rx, bool xdp);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 5/9] net: thunderx: Add basic XDP support
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: Sunil Goutham, linux-kernel, linux-arm-kernel
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Adds basic XDP support i.e attaching a BPF program to an
interface. Also takes care of allocating separate Tx queues
for XDP path and for network stack packet transmission.

This patch doesn't support handling of any of the XDP actions,
all are treated as XDP_PASS i.e packets will be handed over to
the network stack.

Changes also involve allocating one receive buffer per page in XDP
mode and multiple in normal mode i.e when no BPF program is attached.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nic.h          |   6 +-
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c    |  26 +++-
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 162 ++++++++++++++++++++-
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  15 +-
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |   9 ++
 5 files changed, 199 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index dca6aed..4a02e61 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -268,9 +268,9 @@ struct nicvf {
 	struct net_device	*netdev;
 	struct pci_dev		*pdev;
 	void __iomem		*reg_base;
+	struct bpf_prog         *xdp_prog;
 #define	MAX_QUEUES_PER_QSET			8
 	struct queue_set	*qs;
-	struct nicvf_cq_poll	*napi[8];
 	void			*iommu_domain;
 	u8			vf_id;
 	u8			sqs_id;
@@ -296,6 +296,7 @@ struct nicvf {
 	/* Queue count */
 	u8			rx_queues;
 	u8			tx_queues;
+	u8			xdp_tx_queues;
 	u8			max_queues;
 
 	u8			node;
@@ -320,6 +321,9 @@ struct nicvf {
 	struct nicvf_drv_stats  __percpu *drv_stats;
 	struct bgx_stats	bgx_stats;
 
+	/* Napi */
+	struct nicvf_cq_poll	*napi[8];
+
 	/* MSI-X  */
 	u8			num_vec;
 	char			irq_name[NIC_VF_MSIX_VECTORS][IFNAMSIZ + 15];
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index a89db5f..b9ece9c 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -721,7 +721,7 @@ static int nicvf_set_channels(struct net_device *dev,
 	struct nicvf *nic = netdev_priv(dev);
 	int err = 0;
 	bool if_up = netif_running(dev);
-	int cqcount;
+	u8 cqcount, txq_count;
 
 	if (!channel->rx_count || !channel->tx_count)
 		return -EINVAL;
@@ -730,10 +730,26 @@ static int nicvf_set_channels(struct net_device *dev,
 	if (channel->tx_count > nic->max_queues)
 		return -EINVAL;
 
+	if (nic->xdp_prog &&
+	    ((channel->tx_count + channel->rx_count) > nic->max_queues)) {
+		netdev_err(nic->netdev,
+			   "XDP mode, RXQs + TXQs > Max %d\n",
+			   nic->max_queues);
+		return -EINVAL;
+	}
+
 	if (if_up)
 		nicvf_stop(dev);
 
-	cqcount = max(channel->rx_count, channel->tx_count);
+	nic->rx_queues = channel->rx_count;
+	nic->tx_queues = channel->tx_count;
+	if (!nic->xdp_prog)
+		nic->xdp_tx_queues = 0;
+	else
+		nic->xdp_tx_queues = channel->rx_count;
+
+	txq_count = nic->xdp_tx_queues + nic->tx_queues;
+	cqcount = max(nic->rx_queues, txq_count);
 
 	if (cqcount > MAX_CMP_QUEUES_PER_QS) {
 		nic->sqs_count = roundup(cqcount, MAX_CMP_QUEUES_PER_QS);
@@ -742,12 +758,10 @@ static int nicvf_set_channels(struct net_device *dev,
 		nic->sqs_count = 0;
 	}
 
-	nic->qs->rq_cnt = min_t(u32, channel->rx_count, MAX_RCV_QUEUES_PER_QS);
-	nic->qs->sq_cnt = min_t(u32, channel->tx_count, MAX_SND_QUEUES_PER_QS);
+	nic->qs->rq_cnt = min_t(u8, nic->rx_queues, MAX_RCV_QUEUES_PER_QS);
+	nic->qs->sq_cnt = min_t(u8, txq_count, MAX_SND_QUEUES_PER_QS);
 	nic->qs->cq_cnt = max(nic->qs->rq_cnt, nic->qs->sq_cnt);
 
-	nic->rx_queues = channel->rx_count;
-	nic->tx_queues = channel->tx_count;
 	err = nicvf_set_real_num_queues(dev, nic->tx_queues, nic->rx_queues);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 0d79894..9c48873 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -17,6 +17,8 @@
 #include <linux/prefetch.h>
 #include <linux/irq.h>
 #include <linux/iommu.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "nic_reg.h"
 #include "nic.h"
@@ -397,8 +399,10 @@ static void nicvf_request_sqs(struct nicvf *nic)
 
 	if (nic->rx_queues > MAX_RCV_QUEUES_PER_QS)
 		rx_queues = nic->rx_queues - MAX_RCV_QUEUES_PER_QS;
-	if (nic->tx_queues > MAX_SND_QUEUES_PER_QS)
-		tx_queues = nic->tx_queues - MAX_SND_QUEUES_PER_QS;
+
+	tx_queues = nic->tx_queues + nic->xdp_tx_queues;
+	if (tx_queues > MAX_SND_QUEUES_PER_QS)
+		tx_queues = tx_queues - MAX_SND_QUEUES_PER_QS;
 
 	/* Set no of Rx/Tx queues in each of the SQsets */
 	for (sqs = 0; sqs < nic->sqs_count; sqs++) {
@@ -496,6 +500,43 @@ static int nicvf_init_resources(struct nicvf *nic)
 	return 0;
 }
 
+static inline bool nicvf_xdp_rx(struct nicvf *nic,
+				struct bpf_prog *prog,
+				struct cqe_rx_t *cqe_rx)
+{
+	struct xdp_buff xdp;
+	u32 action;
+	u16 len;
+	u64 dma_addr, cpu_addr;
+
+	/* Retrieve packet buffer's DMA address and length */
+	len = *((u16 *)((void *)cqe_rx + (3 * sizeof(u64))));
+	dma_addr = *((u64 *)((void *)cqe_rx + (7 * sizeof(u64))));
+
+	cpu_addr = nicvf_iova_to_phys(nic, dma_addr);
+	if (!cpu_addr)
+		return false;
+
+	xdp.data = phys_to_virt(cpu_addr);
+	xdp.data_end = xdp.data + len;
+
+	rcu_read_lock();
+	action = bpf_prog_run_xdp(prog, &xdp);
+	rcu_read_unlock();
+
+	switch (action) {
+	case XDP_PASS:
+	case XDP_TX:
+	case XDP_ABORTED:
+	case XDP_DROP:
+		/* Pass on all packets to network stack */
+		return false;
+	default:
+		bpf_warn_invalid_xdp_action(action);
+	}
+	return false;
+}
+
 static void nicvf_snd_pkt_handler(struct net_device *netdev,
 				  struct cqe_send_t *cqe_tx,
 				  int budget, int *subdesc_cnt,
@@ -599,6 +640,11 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 			return;
 	}
 
+	/* For XDP, ignore pkts spanning multiple pages */
+	if (nic->xdp_prog && (cqe_rx->rb_cnt == 1))
+		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx))
+			return;
+
 	skb = nicvf_get_rcv_skb(snic, cqe_rx);
 	if (!skb) {
 		netdev_dbg(nic->netdev, "Packet not received\n");
@@ -1529,6 +1575,117 @@ static int nicvf_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static void nicvf_set_xdp_queues(struct nicvf *nic, bool bpf_attached)
+{
+	u8 cq_count, txq_count;
+
+	/* Set XDP Tx queue count same as Rx queue count */
+	if (!bpf_attached)
+		nic->xdp_tx_queues = 0;
+	else
+		nic->xdp_tx_queues = nic->rx_queues;
+
+	/* If queue count > MAX_CMP_QUEUES_PER_QS, then additional qsets
+	 * needs to be allocated, check how many.
+	 */
+	txq_count = nic->xdp_tx_queues + nic->tx_queues;
+	cq_count = max(nic->rx_queues, txq_count);
+	if (cq_count > MAX_CMP_QUEUES_PER_QS) {
+		nic->sqs_count = roundup(cq_count, MAX_CMP_QUEUES_PER_QS);
+		nic->sqs_count = (nic->sqs_count / MAX_CMP_QUEUES_PER_QS) - 1;
+	} else {
+		nic->sqs_count = 0;
+	}
+
+	/* Set primary Qset's resources */
+	nic->qs->rq_cnt = min_t(u8, nic->rx_queues, MAX_RCV_QUEUES_PER_QS);
+	nic->qs->sq_cnt = min_t(u8, txq_count, MAX_SND_QUEUES_PER_QS);
+	nic->qs->cq_cnt = max_t(u8, nic->qs->rq_cnt, nic->qs->sq_cnt);
+
+	/* Update stack */
+	nicvf_set_real_num_queues(nic->netdev, nic->tx_queues, nic->rx_queues);
+}
+
+static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog)
+{
+	struct net_device *dev = nic->netdev;
+	bool if_up = netif_running(nic->netdev);
+	struct bpf_prog *old_prog;
+	bool bpf_attached = false;
+
+	/* For now just support only the usual MTU sized frames */
+	if (prog && (dev->mtu > 1500)) {
+		netdev_warn(dev, "Jumbo frames not yet supported with XDP, current MTU %d.\n",
+			    dev->mtu);
+		return -EOPNOTSUPP;
+	}
+
+	if (prog && prog->xdp_adjust_head)
+		return -EOPNOTSUPP;
+
+	/* ALL SQs attached to CQs i.e same as RQs, are treated as
+	 * XDP Tx queues and more Tx queues are allocated for
+	 * network stack to send pkts out.
+	 *
+	 * No of Tx queues are either same as Rx queues or whatever
+	 * is left in max no of queues possible.
+	 */
+	if ((nic->rx_queues + nic->tx_queues) > nic->max_queues) {
+		netdev_warn(dev,
+			    "Failed to attach BPF prog, RXQs + TXQs > Max %d\n",
+			    nic->max_queues);
+		return -ENOMEM;
+	}
+
+	if (if_up)
+		nicvf_stop(nic->netdev);
+
+	old_prog = xchg(&nic->xdp_prog, prog);
+	/* Detach old prog, if any */
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (nic->xdp_prog) {
+		/* Attach BPF program */
+		nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
+		if (!IS_ERR(nic->xdp_prog))
+			bpf_attached = true;
+	}
+
+	/* Calculate Tx queues needed for XDP and network stack */
+	nicvf_set_xdp_queues(nic, bpf_attached);
+
+	if (if_up) {
+		/* Reinitialize interface, clean slate */
+		nicvf_open(nic->netdev);
+		netif_trans_update(nic->netdev);
+	}
+
+	return 0;
+}
+
+static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+
+	/* To avoid checks while retrieving buffer address from CQE_RX,
+	 * do not support XDP for T88 pass1.x silicons which are anyway
+	 * not in use widely.
+	 */
+	if (pass1_silicon(nic->pdev))
+		return -EOPNOTSUPP;
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return nicvf_xdp_setup(nic, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = !!nic->xdp_prog;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops nicvf_netdev_ops = {
 	.ndo_open		= nicvf_open,
 	.ndo_stop		= nicvf_stop,
@@ -1539,6 +1696,7 @@ static const struct net_device_ops nicvf_netdev_ops = {
 	.ndo_tx_timeout         = nicvf_tx_timeout,
 	.ndo_fix_features       = nicvf_fix_features,
 	.ndo_set_features       = nicvf_set_features,
+	.ndo_xdp		= nicvf_xdp,
 };
 
 static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index e4a02a9..8c3c571 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -19,14 +19,6 @@
 #include "q_struct.h"
 #include "nicvf_queues.h"
 
-static inline u64 nicvf_iova_to_phys(struct nicvf *nic, dma_addr_t dma_addr)
-{
-	/* Translation is installed only when IOMMU is present */
-	if (nic->iommu_domain)
-		return iommu_iova_to_phys(nic->iommu_domain, dma_addr);
-	return dma_addr;
-}
-
 static void nicvf_get_page(struct nicvf *nic)
 {
 	if (!nic->rb_pageref || !nic->rb_page)
@@ -149,8 +141,10 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 {
 	struct pgcache *pgcache = NULL;
 
-	/* Check if request can be accomodated in previous allocated page */
-	if (nic->rb_page &&
+	/* Check if request can be accomodated in previous allocated page.
+	 * But in XDP mode only one buffer per page is permitted.
+	 */
+	if (!nic->pnicvf->xdp_prog && nic->rb_page &&
 	    ((nic->rb_page_offset + buf_len) <= PAGE_SIZE)) {
 		nic->rb_pageref++;
 		goto ret;
@@ -961,6 +955,7 @@ int nicvf_set_qset_resources(struct nicvf *nic)
 
 	nic->rx_queues = qs->rq_cnt;
 	nic->tx_queues = qs->sq_cnt;
+	nic->xdp_tx_queues = 0;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index da48366..07136a2 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -10,6 +10,7 @@
 #define NICVF_QUEUES_H
 
 #include <linux/netdevice.h>
+#include <linux/iommu.h>
 #include "q_struct.h"
 
 #define MAX_QUEUE_SET			128
@@ -312,6 +313,14 @@ struct queue_set {
 
 #define	CQ_ERR_MASK	(CQ_WR_FULL | CQ_WR_DISABLE | CQ_WR_FAULT)
 
+static inline u64 nicvf_iova_to_phys(struct nicvf *nic, dma_addr_t dma_addr)
+{
+	/* Translation is installed only when IOMMU is present */
+	if (nic->iommu_domain)
+		return iommu_iova_to_phys(nic->iommu_domain, dma_addr);
+	return dma_addr;
+}
+
 void nicvf_unmap_sndq_buffers(struct nicvf *nic, struct snd_queue *sq,
 			      int hdr_sqe, u8 subdesc_cnt);
 void nicvf_config_vlan_stripping(struct nicvf *nic,
-- 
2.7.4

^ permalink raw reply related

* [PATCH 4/9] net: thunderx: Cleanup receive buffer allocation
From: sunil.kovvuri @ 2017-05-02 13:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, Sunil Goutham
In-Reply-To: <1493730418-24606-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@cavium.com>

Get rid of unnecessary double pointer references and type casting
in receive buffer allocation code.

Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 90c5bc7d..e4a02a9 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -145,7 +145,7 @@ static struct pgcache *nicvf_alloc_page(struct nicvf *nic,
 
 /* Allocate buffer for packet reception */
 static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
-					 gfp_t gfp, u32 buf_len, u64 **rbuf)
+					 gfp_t gfp, u32 buf_len, u64 *rbuf)
 {
 	struct pgcache *pgcache = NULL;
 
@@ -172,10 +172,10 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
 		nic->rb_page = pgcache->page;
 ret:
 	/* HW will ensure data coherency, CPU sync not required */
-	*rbuf = (u64 *)((u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
-						nic->rb_page_offset, buf_len,
-						DMA_FROM_DEVICE,
-						DMA_ATTR_SKIP_CPU_SYNC));
+	*rbuf = (u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page,
+					nic->rb_page_offset, buf_len,
+					DMA_FROM_DEVICE,
+					DMA_ATTR_SKIP_CPU_SYNC);
 	if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) {
 		if (!nic->rb_page_offset)
 			__free_pages(nic->rb_page, 0);
@@ -212,7 +212,7 @@ static int  nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
 			    int ring_len, int buf_size)
 {
 	int idx;
-	u64 *rbuf;
+	u64 rbuf;
 	struct rbdr_entry_t *desc;
 	int err;
 
@@ -257,7 +257,7 @@ static int  nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
 		}
 
 		desc = GET_RBDR_DESC(rbdr, idx);
-		desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
+		desc->buf_addr = rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
 	}
 
 	nicvf_get_page(nic);
@@ -330,7 +330,7 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
 	int refill_rb_cnt;
 	struct rbdr *rbdr;
 	struct rbdr_entry_t *desc;
-	u64 *rbuf;
+	u64 rbuf;
 	int new_rb = 0;
 
 refill:
@@ -364,7 +364,7 @@ static void nicvf_refill_rbdr(struct nicvf *nic, gfp_t gfp)
 			break;
 
 		desc = GET_RBDR_DESC(rbdr, tail);
-		desc->buf_addr = (u64)rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
+		desc->buf_addr = rbuf & ~(NICVF_RCV_BUF_ALIGN_BYTES - 1);
 		refill_rb_cnt--;
 		new_rb++;
 	}
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox