Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH] i40e: remove unnecessary __packed
From: David Miller @ 2016-11-20  2:47 UTC (permalink / raw)
  To: tushar.n.dave; +Cc: jeffrey.t.kirsher, intel-wired-lan, netdev
In-Reply-To: <1479592438-17078-1-git-send-email-tushar.n.dave@oracle.com>

From: Tushar Dave <tushar.n.dave@oracle.com>
Date: Sat, 19 Nov 2016 13:53:58 -0800

> 'struct i40e_dma_mem' defined with 'packed' directive causing kernel
> unaligned errors on sparc.

I really wish people would get out of the habit of just slapping
the __packed attribute onto structures seemingly by default.

^ permalink raw reply

* [net-next PATCH v2 0/5] XDP for virtio_net
From: John Fastabend @ 2016-11-20  2:49 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf

This implements virtio_net for the mergeable buffers and big_packet
modes. I tested this with vhost_net running on qemu and did not see
any issues.

There are some restrictions for XDP to be enabled (see patch 3) for
more details.

  1. LRO must be off
  2. MTU must be less than PAGE_SIZE
  3. queues must be available to dedicate to XDP
  4. num_bufs received in mergeable buffers must be 1
  5. big_packet mode must have all data on single page

Please review any comments/feedback welcome as always.

v2, fixes rcu usage throughout thanks to Eric and the use of
num_online_cpus() usage thanks to Jakub.

Thanks,
John

---

John Fastabend (4):
      net: virtio dynamically disable/enable LRO
      net: xdp: add invalid buffer warning
      virtio_net: add dedicated XDP transmit queues
      virtio_net: add XDP_TX support

Shrijeet Mukherjee (1):
      virtio_net: Add XDP support

 drivers/net/virtio_net.c |  268 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/filter.h   |    1 
 net/core/filter.c        |    6 +
 3 files changed, 270 insertions(+), 5 deletions(-)

^ permalink raw reply

* [net-next PATCH v2 1/5] net: virtio dynamically disable/enable LRO
From: John Fastabend @ 2016-11-20  2:49 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf
In-Reply-To: <20161120024710.19187.31037.stgit@john-Precision-Tower-5810>

This adds support for dynamically setting the LRO feature flag. The
message to control guest features in the backend uses the
CTRL_GUEST_OFFLOADS msg type.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |   45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ca5239a..8189e5b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
 	.set_settings = virtnet_set_settings,
 };
 
+static int virtnet_set_features(struct net_device *netdev,
+				netdev_features_t features)
+{
+	struct virtnet_info *vi = netdev_priv(netdev);
+	struct virtio_device *vdev = vi->vdev;
+	struct scatterlist sg;
+	u64 offloads = 0;
+
+	if (features & NETIF_F_LRO)
+		offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
+			    (1 << VIRTIO_NET_F_GUEST_TSO6);
+
+	if (features & NETIF_F_RXCSUM)
+		offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
+
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
+		sg_init_one(&sg, &offloads, sizeof(uint64_t));
+		if (!virtnet_send_command(vi,
+					  VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+					  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
+					  &sg)) {
+			dev_warn(&netdev->dev,
+				 "Failed to set guest offloads by virtnet command.\n");
+			return -EINVAL;
+		}
+	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
+		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+		dev_warn(&netdev->dev,
+			 "No support for setting offloads pre version_1.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1435,6 +1470,7 @@ static void virtnet_init_settings(struct net_device *dev)
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
+	.ndo_set_features	= virtnet_set_features,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1810,6 +1846,12 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
 		dev->features |= NETIF_F_RXCSUM;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) &&
+	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+		dev->features |= NETIF_F_LRO;
+		dev->hw_features |= NETIF_F_LRO;
+	}
+
 	dev->vlan_features = dev->features;
 
 	/* MTU range: 68 - 65535 */
@@ -2047,7 +2089,8 @@ static int virtnet_restore(struct virtio_device *vdev)
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
-	VIRTIO_NET_F_MTU
+	VIRTIO_NET_F_MTU, \
+	VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,

^ permalink raw reply related

* [net-next PATCH v2 2/5] net: xdp: add invalid buffer warning
From: John Fastabend @ 2016-11-20  2:50 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf
In-Reply-To: <20161120024710.19187.31037.stgit@john-Precision-Tower-5810>

This adds a warning for drivers to use when encountering an invalid
buffer for XDP. For normal cases this should not happen but to catch
this in virtual/qemu setups that I may not have expected from the
emulation layer having a standard warning is useful.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/filter.h |    1 +
 net/core/filter.c      |    6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c52..0c79004 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -595,6 +595,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 void bpf_warn_invalid_xdp_action(u32 act);
+void bpf_warn_invalid_xdp_buffer(void);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
diff --git a/net/core/filter.c b/net/core/filter.c
index dece94f..b777730 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2774,6 +2774,12 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+void bpf_warn_invalid_xdp_buffer(void)
+{
+	WARN_ONCE(1, "Illegal XDP buffer encountered, expect packet loss\n");
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
+
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					int src_reg, int ctx_off,
 					struct bpf_insn *insn_buf,

^ permalink raw reply related

* [net-next PATCH v2 3/5] virtio_net: Add XDP support
From: John Fastabend @ 2016-11-20  2:50 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf
In-Reply-To: <20161120024710.19187.31037.stgit@john-Precision-Tower-5810>

From: Shrijeet Mukherjee <shrijeet@gmail.com>

This adds XDP support to virtio_net. Some requirements must be
met for XDP to be enabled depending on the mode. First it will
only be supported with LRO disabled so that data is not pushed
across multiple buffers. The MTU must be less than a page size
to avoid having to handle XDP across multiple pages.

If mergeable receive is enabled this first series only supports
the case where header and data are in the same buf which we can
check when a packet is received by looking at num_buf. If the
num_buf is greater than 1 and a XDP program is loaded the packet
is dropped and a warning is thrown. When any_header_sg is set this
does not happen and both header and data is put in a single buffer
as expected so we check this when XDP programs are loaded. Note I
have only tested this with Linux vhost backend.

If big packets mode is enabled and MTU/LRO conditions above are
met then XDP is allowed.

A follow on patch can be generated to solve the mergeable receive
case with num_bufs equal to 2. Buffers greater than two may not
be handled has easily.

Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |  146 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8189e5b..8f99a53 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
+#include <linux/bpf.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -81,6 +82,8 @@ struct receive_queue {
 
 	struct napi_struct napi;
 
+	struct bpf_prog __rcu *xdp_prog;
+
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
@@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+static u32 do_xdp_prog(struct virtnet_info *vi,
+		       struct bpf_prog *xdp_prog,
+		       struct page *page, int offset, int len)
+{
+	int hdr_padded_len;
+	struct xdp_buff xdp;
+	u32 act;
+	u8 *buf;
+
+	buf = page_address(page) + offset;
+
+	if (vi->mergeable_rx_bufs)
+		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	else
+		hdr_padded_len = sizeof(struct padded_vnet_hdr);
+
+	xdp.data = buf + hdr_padded_len;
+	xdp.data_end = xdp.data + (len - vi->hdr_len);
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+	switch (act) {
+	case XDP_PASS:
+		return XDP_PASS;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_TX:
+	case XDP_ABORTED:
+	case XDP_DROP:
+		return XDP_DROP;
+	}
+}
+
 static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
@@ -340,9 +375,19 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   void *buf,
 				   unsigned int len)
 {
+	struct bpf_prog *xdp_prog;
 	struct page *page = buf;
-	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+	struct sk_buff *skb;
 
+	xdp_prog = rcu_dereference_bh(rq->xdp_prog);
+	if (xdp_prog) {
+		u32 act = do_xdp_prog(vi, xdp_prog, page, 0, len);
+
+		if (act == XDP_DROP)
+			goto err;
+	}
+
+	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
@@ -366,10 +411,25 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	struct sk_buff *head_skb, *curr_skb;
+	struct bpf_prog *xdp_prog;
 
-	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
-					       truesize);
-	struct sk_buff *curr_skb = head_skb;
+	xdp_prog = rcu_dereference_bh(rq->xdp_prog);
+	if (xdp_prog) {
+		u32 act;
+
+		if (num_buf > 1) {
+			bpf_warn_invalid_xdp_buffer();
+			goto err_skb;
+		}
+
+		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
+		if (act == XDP_DROP)
+			goto err_skb;
+	}
+
+	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
+	curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
@@ -1328,6 +1388,13 @@ static int virtnet_set_channels(struct net_device *dev,
 	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
 		return -EINVAL;
 
+	/* For now we don't support modifying channels while XDP is loaded
+	 * also when XDP is loaded all RX queues have XDP programs so we only
+	 * need to check a single RX queue.
+	 */
+	if (vi->rq[0].xdp_prog)
+		return -EINVAL;
+
 	get_online_cpus();
 	err = virtnet_set_queues(vi, queue_pairs);
 	if (!err) {
@@ -1454,6 +1521,68 @@ static int virtnet_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	int i;
+
+	if ((dev->features & NETIF_F_LRO) && prog) {
+		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
+		return -EINVAL;
+	}
+
+	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
+		netdev_warn(dev, "XDP expects header/data in single page\n");
+		return -EINVAL;
+	}
+
+	if (dev->mtu > PAGE_SIZE) {
+		netdev_warn(dev, "XDP requires MTU less than %lu\n", PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (prog) {
+		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static bool virtnet_xdp_query(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		if (vi->rq[i].xdp_prog)
+			return true;
+	}
+	return false;
+}
+
+static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return virtnet_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = virtnet_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1471,6 +1600,7 @@ static int virtnet_set_features(struct net_device *netdev,
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
 	.ndo_set_features	= virtnet_set_features,
+	.ndo_xdp		= virtnet_xdp,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1527,12 +1657,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 
 static void free_receive_bufs(struct virtnet_info *vi)
 {
+	struct bpf_prog *old_prog;
 	int i;
 
+	rtnl_lock();
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		while (vi->rq[i].pages)
 			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
+		if (old_prog)
+			bpf_prog_put(old_prog);
 	}
+	rtnl_unlock();
 }
 
 static void free_receive_page_frags(struct virtnet_info *vi)

^ permalink raw reply related

* [net-next PATCH v2 4/5] virtio_net: add dedicated XDP transmit queues
From: John Fastabend @ 2016-11-20  2:51 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf
In-Reply-To: <20161120024710.19187.31037.stgit@john-Precision-Tower-5810>

XDP requires using isolated transmit queues to avoid interference
with normal networking stack (BQL, NETDEV_TX_BUSY, etc). This patch
adds a XDP queue per cpu when a XDP program is loaded and does not
expose the queues to the OS via the normal API call to
netif_set_real_num_tx_queues(). This way the stack will never push
an skb to these queues.

However virtio/vhost/qemu implementation only allows for creating
TX/RX queue pairs at this time so creating only TX queues was not
possible. And because the associated RX queues are being created I
went ahead and exposed these to the stack and let the backend use
them. This creates more RX queues visible to the network stack than
TX queues which is worth mentioning but does not cause any issues as
far as I can tell.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |   32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8f99a53..80a426c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -114,6 +114,9 @@ struct virtnet_info {
 	/* # of queue pairs currently used by the driver */
 	u16 curr_queue_pairs;
 
+	/* # of XDP queue pairs currently used by the driver */
+	u16 xdp_queue_pairs;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
@@ -1525,7 +1528,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct bpf_prog *old_prog;
-	int i;
+	u16 xdp_qp = 0, curr_qp;
+	int err, i;
 
 	if ((dev->features & NETIF_F_LRO) && prog) {
 		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
@@ -1542,12 +1546,34 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 		return -EINVAL;
 	}
 
+	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
+	if (prog)
+		xdp_qp = nr_cpu_ids;
+
+	/* XDP requires extra queues for XDP_TX */
+	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
+		netdev_warn(dev, "request %i queues but max is %i\n",
+			    curr_qp + xdp_qp, vi->max_queue_pairs);
+		return -ENOMEM;
+	}
+
+	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
+	if (err) {
+		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
+		return err;
+	}
+
 	if (prog) {
-		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
-		if (IS_ERR(prog))
+		prog = bpf_prog_add(prog, vi->max_queue_pairs);
+		if (IS_ERR(prog)) {
+			virtnet_set_queues(vi, curr_qp);
 			return PTR_ERR(prog);
+		}
 	}
 
+	vi->xdp_queue_pairs = xdp_qp;
+	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
 		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);

^ permalink raw reply related

* [net-next PATCH v2 5/5] virtio_net: add XDP_TX support
From: John Fastabend @ 2016-11-20  2:51 UTC (permalink / raw)
  To: daniel, eric.dumazet, mst, kubakici, shm, davem,
	alexei.starovoitov
  Cc: netdev, bblanco, john.fastabend, john.r.fastabend, brouer, tgraf
In-Reply-To: <20161120024710.19187.31037.stgit@john-Precision-Tower-5810>

This adds support for the XDP_TX action to virtio_net. When an XDP
program is run and returns the XDP_TX action the virtio_net XDP
implementation will transmit the packet on a TX queue that aligns
with the current CPU that the XDP packet was processed on.

Before sending the packet the header is zeroed.  Also XDP is expected
to handle checksum correctly so no checksum offload  support is
provided.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |   57 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 80a426c..c127494 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -330,12 +330,40 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+static void virtnet_xdp_xmit(struct virtnet_info *vi,
+			     unsigned int qnum, struct xdp_buff *xdp)
+{
+	struct send_queue *sq = &vi->sq[qnum];
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
+	unsigned int num_sg, len;
+	void *xdp_sent;
+
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
+		struct page *page = virt_to_head_page(xdp_sent);
+
+		put_page(page);
+	}
+
+	/* Zero header and leave csum up to XDP layers */
+	hdr = xdp->data;
+	memset(hdr, 0, vi->hdr_len);
+	hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+	hdr->hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
+
+	num_sg = 1;
+	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, xdp->data, GFP_ATOMIC);
+	virtqueue_kick(sq->vq);
+}
+
 static u32 do_xdp_prog(struct virtnet_info *vi,
 		       struct bpf_prog *xdp_prog,
 		       struct page *page, int offset, int len)
 {
 	int hdr_padded_len;
 	struct xdp_buff xdp;
+	unsigned int qp;
 	u32 act;
 	u8 *buf;
 
@@ -353,9 +381,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 	switch (act) {
 	case XDP_PASS:
 		return XDP_PASS;
+	case XDP_TX:
+		qp = vi->curr_queue_pairs -
+			vi->xdp_queue_pairs +
+			smp_processor_id();
+		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
+		virtnet_xdp_xmit(vi, qp, &xdp);
+		return XDP_TX;
 	default:
 		bpf_warn_invalid_xdp_action(act);
-	case XDP_TX:
 	case XDP_ABORTED:
 	case XDP_DROP:
 		return XDP_DROP;
@@ -386,8 +420,15 @@ static struct sk_buff *receive_big(struct net_device *dev,
 	if (xdp_prog) {
 		u32 act = do_xdp_prog(vi, xdp_prog, page, 0, len);
 
-		if (act == XDP_DROP)
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_TX:
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
 			goto err;
+		}
 	}
 
 	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
@@ -399,6 +440,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 err:
 	dev->stats.rx_dropped++;
 	give_pages(rq, page);
+xdp_xmit:
 	return NULL;
 }
 
@@ -417,6 +459,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct sk_buff *head_skb, *curr_skb;
 	struct bpf_prog *xdp_prog;
 
+	head_skb = NULL;
 	xdp_prog = rcu_dereference_bh(rq->xdp_prog);
 	if (xdp_prog) {
 		u32 act;
@@ -427,8 +470,15 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 
 		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
-		if (act == XDP_DROP)
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_TX:
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
 			goto err_skb;
+		}
 	}
 
 	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
@@ -502,6 +552,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 err_buf:
 	dev->stats.rx_dropped++;
 	dev_kfree_skb(head_skb);
+xdp_xmit:
 	return NULL;
 }
 

^ permalink raw reply related

* Re: [PATCH] VSOCK: add loopback to virtio_transport
From: David Miller @ 2016-11-20  3:08 UTC (permalink / raw)
  To: stefanha; +Cc: netdev, cavery, imbrenda, jhansen
In-Reply-To: <1479397763-22319-1-git-send-email-stefanha@redhat.com>

From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Nov 2016 15:49:23 +0000

> @@ -159,6 +199,10 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
>  		return -ENODEV;
>  	}
>  
> +	if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
> +		return virtio_transport_send_pkt_loopback(vsock, pkt);
> +	}
> +

A single-statement basic block should get surrounding curly
braces.

^ permalink raw reply

* Re: [PATCH net] l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()
From: David Miller @ 2016-11-20  3:10 UTC (permalink / raw)
  To: g.nault; +Cc: netdev, jchapman, celston, sploving1, andreyknvl
In-Reply-To: <f04ad12d6f2a433d60baf5720b2f7a6d685d37c6.1479503186.git.g.nault@alphalink.fr>

From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 18 Nov 2016 22:13:00 +0100

> Lock socket before checking the SOCK_ZAPPED flag in l2tp_ip6_bind().
> Without lock, a concurrent call could modify the socket flags between
> the sock_flag(sk, SOCK_ZAPPED) test and the lock_sock() call. This way,
> a socket could be inserted twice in l2tp_ip6_bind_table. Releasing it
> would then leave a stale pointer there, generating use-after-free
> errors when walking through the list or modifying adjacent entries.
 ...
> The same issue exists with l2tp_ip_bind() and l2tp_ip_bind_table.
> 
> Fixes: c51ce49735c1 ("l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case")
> Reported-by: Baozeng Ding <sploving1@gmail.com>
> Reported-by: Andrey Konovalov <andreyknvl@google.com>
> Tested-by: Baozeng Ding <sploving1@gmail.com>
> Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH] net: make struct napi_alloc_cache::skb_count unsigned int
From: David Miller @ 2016-11-20  3:11 UTC (permalink / raw)
  To: adobriyan; +Cc: netdev
In-Reply-To: <20161119004756.GC1200@avx2>

From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 03:47:56 +0300

> size_t is way too much for an integer not exceeding 64.
> 
> Space savings: 10 bytes!
> 
> 	add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-10 (-10)
> 	function                                     old     new   delta
> 	napi_consume_skb                             165     163      -2
> 	__kfree_skb_flush                             56      53      -3
> 	__kfree_skb_defer                             97      92      -5
> 	Total: Before=154865639, After=154865629, chg -0.00%
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] netlink: use "unsigned int" in nla_next()
From: David Miller @ 2016-11-20  3:11 UTC (permalink / raw)
  To: adobriyan; +Cc: netdev
In-Reply-To: <20161119005435.GD1200@avx2>

From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 03:54:35 +0300

> ->nla_len is unsigned entity (it's length after all) and u16,
> thus it can't overflow when being aligned into int/unsigned int.
> 
> (nlmsg_next has the same code, but I didn't yet convince myself
> it is correct to do so).
> 
> There is pointer arithmetic in this function and offset being
> unsigned is better:
 ...
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] netlink: smaller nla_attr_minlen table
From: David Miller @ 2016-11-20  3:12 UTC (permalink / raw)
  To: adobriyan; +Cc: netdev
In-Reply-To: <20161119005907.GE1200@avx2>

From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 03:59:07 +0300

> Length of a netlink attribute may be u16 but lengths of basic attributes
> are much smaller, so small we can save 16 bytes of .rodata and pocket
> change inside .text.
> 
> 16-bit is worse on x86-64 than 8-bit because of operand size override prefix.
 ...
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] net: fix bogus cast in skb_pagelen() and use unsigned variables
From: David Miller @ 2016-11-20  3:12 UTC (permalink / raw)
  To: adobriyan; +Cc: netdev
In-Reply-To: <20161119010808.GF1200@avx2>

From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 04:08:08 +0300

> 1) cast to "int" is unnecessary:
>    u8 will be promoted to int before decrementing,
>    small positive numbers fit into "int", so their values won't be changed
>    during promotion.
> 
>    Once everything is int including loop counters, signedness doesn't
>    matter: 32-bit operations will stay 32-bit operations.
> 
>    But! Someone tried to make this loop smart by making everything of
>    the same type apparently in an attempt to optimise it.
>    Do the optimization, just differently.
>    Do the cast where it matters. :^)
> 
> 2) frag size is unsigned entity and sum of fragments sizes is also
>    unsigned.
> 
> Make everything unsigned, leave no MOVSX instruction behind.
 ...
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] VSOCK: add loopback to virtio_transport
From: John Fastabend @ 2016-11-20  3:14 UTC (permalink / raw)
  To: David Miller, stefanha; +Cc: netdev, cavery, imbrenda, jhansen
In-Reply-To: <20161119.220804.755313313976863187.davem@davemloft.net>

On 16-11-19 07:08 PM, David Miller wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> Date: Thu, 17 Nov 2016 15:49:23 +0000
> 
>> @@ -159,6 +199,10 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
>>  		return -ENODEV;
>>  	}
>>  
>> +	if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
>> +		return virtio_transport_send_pkt_loopback(vsock, pkt);
>> +	}
>> +
> 
> A single-statement basic block should get surrounding curly
> braces.
> 

Should _not_ get curly braces in case it wasn't obvious ;)

^ permalink raw reply

* Re: [net-next] rtnl: fix the loop index update error in rtnl_dump_ifinfo()
From: David Miller @ 2016-11-20  3:15 UTC (permalink / raw)
  To: zhangshengju; +Cc: netdev, dsa
In-Reply-To: <1479569312-15224-1-git-send-email-zhangshengju@cmss.chinamobile.com>

From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Sat, 19 Nov 2016 23:28:32 +0800

> If the link is filtered out, loop index should also be updated. If not,
> loop index will not be correct.
> 
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>

Applied to 'net' and queued up for -stable.

Bug fixes should always be targetted at 'net' not 'net-next'.

^ permalink raw reply

* Re: [PATCH net 1/1] tipc: eliminate obsolete socket locking policy description
From: David Miller @ 2016-11-20  3:15 UTC (permalink / raw)
  To: jon.maloy
  Cc: netdev, paul.gortmaker, parthasarathy.bhuvaragan, ying.xue, maloy,
	tipc-discussion
In-Reply-To: <1479584827-21772-1-git-send-email-jon.maloy@ericsson.com>

From: Jon Maloy <jon.maloy@ericsson.com>
Date: Sat, 19 Nov 2016 14:47:07 -0500

> The comment block in socket.c describing the locking policy is
> obsolete, and does not reflect current reality. We remove it in this
> commit.
> 
> Since the current locking policy is much simpler and follows a
> mainstream approach, we see no need to add a new description.
> 
> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>

Applied, thanks Jon.

^ permalink raw reply

* Re: [PATCH] VSOCK: add loopback to virtio_transport
From: David Miller @ 2016-11-20  3:16 UTC (permalink / raw)
  To: john.fastabend; +Cc: stefanha, netdev, cavery, imbrenda, jhansen
In-Reply-To: <58311507.7070707@gmail.com>

From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 19 Nov 2016 19:14:15 -0800

> On 16-11-19 07:08 PM, David Miller wrote:
>> From: Stefan Hajnoczi <stefanha@redhat.com>
>> Date: Thu, 17 Nov 2016 15:49:23 +0000
>> 
>>> @@ -159,6 +199,10 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
>>>  		return -ENODEV;
>>>  	}
>>>  
>>> +	if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
>>> +		return virtio_transport_send_pkt_loopback(vsock, pkt);
>>> +	}
>>> +
>> 
>> A single-statement basic block should get surrounding curly
>> braces.
>> 
> 
> Should _not_ get curly braces in case it wasn't obvious ;)

Indeed, thanks for the correction :)

^ permalink raw reply

* Re: [PATCH v8 5/6] net: ipv4, ipv6: run cgroup eBPF egress programs
From: Alexei Starovoitov @ 2016-11-20  6:07 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Daniel Mack, htejun-b10kYP2dOMg, daniel-FeC+5ew28dpmcu3hnIyYJQ,
	ast-b10kYP2dOMg, davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, harald-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, sargun-GaZTRHToo+CzQB+pC5nmwQ,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161118174405.GA9678@salvia>

On Fri, Nov 18, 2016 at 06:44:05PM +0100, Pablo Neira Ayuso wrote:
> On Fri, Nov 18, 2016 at 09:17:18AM -0800, Alexei Starovoitov wrote:
> > On Fri, Nov 18, 2016 at 01:37:32PM +0100, Pablo Neira Ayuso wrote:
> > > On Thu, Nov 17, 2016 at 07:27:08PM +0100, Daniel Mack wrote:
> > > [...]
> > > > @@ -312,6 +314,12 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
> > > >  	skb->dev = dev;
> > > >  	skb->protocol = htons(ETH_P_IP);
> > > >  
> > > > +	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
> > > > +	if (ret) {
> > > > +		kfree_skb(skb);
> > > > +		return ret;
> > > > +	}
> > > > +
> > > >  	/*
> > > >  	 *	Multicasts are looped back for other local users
> > > >  	 */
> > > > @@ -364,12 +372,19 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
> > > >  int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
> > > >  {
> > > >  	struct net_device *dev = skb_dst(skb)->dev;
> > > > +	int ret;
> > > >  
> > > >  	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
> > > >  
> > > >  	skb->dev = dev;
> > > >  	skb->protocol = htons(ETH_P_IP);
> > > >  
> > > > +	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
> > > > +	if (ret) {
> > > > +		kfree_skb(skb);
> > > > +		return ret;
> > > > +	}
> > > > +
> > > >  	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
> > > >  			    net, sk, skb, NULL, dev,
> > > >  			    ip_finish_output,
> > > 
> > > Please, place this after the netfilter hook.
> > > 
> > > Since this new hook may mangle output packets, any mangling
> > > potentially interfers and breaks conntrack.
> > 
> > actually this hook cannot mangle the packets, so no conntrack
> > concerns.  Also this was brought up by Lorenzo earlier and consensus
> > was that it's cleaner to leave it in this order.
> 
> Not yet probably, but this could be used to implement snat at some
> point, you have potentially the infrastructure to do so in place
> already.
> 
> > My reply:
> > http://www.spinics.net/lists/cgroups/msg16675.html
> > and Daniel's:
> > http://www.spinics.net/lists/cgroups/msg16677.html
> > and the rest of that thread.
> 
> Please place this afterwards since I don't want to update Netfilter
> documentation to indicate that there is a new spot to debug before
> POSTROUTING that may drop packets. People are used to debugging things
> in a certain way, if packets are dropped after POSTROUTING, then
> netfilter tracing will indicate the packet has successfully left our
> framework and people will notice that packets are dropped somewhere
> else, so they have a clue probably is this new layer.
> 
> Actually I remember you mentioned in a previous email that this hook
> can be placed anywhere, and that they don't really need a fixed
> location, if so, then it should not be much of a problem to change
> this.

correct. As I said in the link above I don't mind the order.
We discussed it in private and (my personal interpretation) that
the opinions are 60% to keep it as-is and 40% to go with your
proposal.
It is certainly not a big deal to go one way or the other.
I think only you have a strong opinion about the order whereas
myself and Daneil B and Daniel M are exhuasted enough, so we will
take it either way.
So let me recoup the pro and con and let you decide, since it seems
doing it after nf hook on tx will actually make them much harder to
use together and I think you wouldn't want it in the first place.
Since RX hook is so late in the receive path there is no
practical use case to mangle the packet at this stage.
This set of patches doesn't allow to change the packet either.
After this rx hook the socket can only receive the packet and
messing up l3/l4 headers won't have any effect. Furthermore since
the hooks have to be symmetrical it doesn't make sense to mangle
the packet on TX either. In tcp case the socket is an established
connection, so allowing mangling won't do any good to the remote side.
Hence this cgroup+bpf+inet_rx/tx thingy is only useful for monitoring
of what applications are doing and high level application firewalling.
Now imagine some policy framework that prevents a websever
in a cgroup talk to a database using this facility. If nf hook
runs first on tx, this policy framework won't be usable, since l3/l4
can be mangled, so application enforcement is not possible unless
iptables are disabled. I don't want such "either iptables or cgroup+bpf"
scenario and I don't think you want that either.
At the same time if iptables do traditional nat and firewall
_after_ this tx hook then this policy framework can coexist
with iptables based security. So imo this order of hooks
is a better way forward.
But if you strongly disagree, I'm ok to change it,
though I think long term it will be a loosing proposition
for both frameworks. So your call.

Thanks

^ permalink raw reply

* RE: [PATCH] net: fec: Detect and recover receive queue hangs
From: Andy Duan @ 2016-11-20  6:18 UTC (permalink / raw)
  To: Chris Lesiak
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	Jaccon Bastiaansen
In-Reply-To: <b420fcd7-4171-f5c7-9d0c-cd8809e31bbc@licor.com>

From: Chris Lesiak <chris.lesiak@licor.com> Sent: Friday, November 18, 2016 10:37 PM
 >To: Andy Duan <fugang.duan@nxp.com>
 >Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jaccon
 >Bastiaansen <jaccon.bastiaansen@gmail.com>
 >Subject: Re: [PATCH] net: fec: Detect and recover receive queue hangs
 >
 >On 11/18/2016 12:44 AM, Andy Duan wrote:
 >> From: Chris Lesiak <chris.lesiak@licor.com> Sent: Friday, November 18,
 >> 2016 5:15 AM
 >>  >To: Andy Duan <fugang.duan@nxp.com>
 >>  >Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jaccon
 >> >Bastiaansen <jaccon.bastiaansen@gmail.com>; chris.lesiak@licor.com
 >>  >Subject: [PATCH] net: fec: Detect and recover receive queue hangs  >
 >> >This corrects a problem that appears to be similar to ERR006358.  But
 >> while
 >>  >ERR006358 is a race when the tx queue transitions from empty to not
 >> empty,  >this problem is a race when the rx queue transitions from full to
 >not full.
 >>  >
 >>  >The symptom is a receive queue that is stuck.  The ENET_RDAR
 >> register will  >read 0, indicating that there are no empty receive
 >> descriptors in the receive  >ring.  Since no additional frames can be queued,
 >no RXF interrupts occur.
 >>  >
 >>  >This problem can be triggered with a 1 Gb link and about 400 Mbps of
 >traffic.
 >
 >I can cause the error by running the following on an imx6q: iperf -s -u And
 >sending packets from the other end of a 1 Gbps link:
 >iperf -c $IPADDR -u -b40000pps
 >
 >A few others have seen this problem.
 >See: https://community.nxp.com/thread/322882
 >
 >>  >
 >>  >This patch detects this condition, sets the work_rx bit, and
 >> reschedules the  >poll method.
 >>  >
 >>  >Signed-off-by: Chris Lesiak <chris.lesiak@licor.com>
 >>  >---
 >>  > drivers/net/ethernet/freescale/fec_main.c | 31
 >> >+++++++++++++++++++++++++++++++  > 1 file changed, 31 insertions(+)
 >> > Firstly, how to reproduce the issue, pls list the reproduce steps.
 >> Thanks.
 >> Secondly, pls check below comments.
 >>
 >>  >diff --git a/drivers/net/ethernet/freescale/fec_main.c
 >>  >b/drivers/net/ethernet/freescale/fec_main.c
 >>  >index fea0f33..8a87037 100644
 >>  >--- a/drivers/net/ethernet/freescale/fec_main.c
 >>  >+++ b/drivers/net/ethernet/freescale/fec_main.c
 >>  >@@ -1588,6 +1588,34 @@ fec_enet_interrupt(int irq, void *dev_id)
 >>  > 	return ret;
 >>  > }
 >>  >
 >>  >+static inline bool
 >>  >+fec_enet_recover_rxq(struct fec_enet_private *fep, u16 queue_id) {
 >>  >+	int work_bit = (queue_id == 0) ? 2 : ((queue_id == 1) ? 0 : 1);
 >>  >+
 >>  >+	if (readl(fep->rx_queue[queue_id]->bd.reg_desc_active))
 >> If rx ring is really empty in slight throughput cases,  rdar is always cleared,
 >then there always do napi reschedule.
 >
 >I think that you are concerned that if rdar is zero due to this hardware
 >problem, but the rx ring is actually empty, then fec_enet_rx_queue will
 >never do a write to rdar so that it can be non-zero.  That will cause napi to
 >always be resceduled.
 >
 >I suppose that might be the case with zero rx traffic, and I was concerned
 >that it might be true even when there was rx traffic.  I suspected that the
 >hardware, seeing that rdar is zero, would never queue another packet, even
 >if there were in fact empty descriptors.  But it doesn't seem to be the case.  It
 >does reschedule multiple times, but eventually sees some packets in the rx
 >ring and recovers.
 >
 >I admit that I do not completely understand how that can happen.  I did
 >confirm that fec_enet_active_rxring is not being called.
 >
 >Maybe someone with a deeper understanding of the fec than I can provide
 >an explanation.
 >
The patch needs to hold on for some time (days), I will reserve time to investigate the issue.
Thanks.

 >>
 >>  >+		return false;
 >>  >+
 >>  >+	dev_notice_once(&fep->pdev->dev, "Recovered rx queue\n");
 >>  >+
 >>  >+	fep->work_rx |= 1 << work_bit;
 >>  >+
 >>  >+	return true;
 >>  >+}
 >>  >+
 >>  >+static inline bool fec_enet_recover_rxqs(struct fec_enet_private
 >> *fep)  >+{
 >>  >+	unsigned int q;
 >>  >+	bool ret = false;
 >>  >+
 >>  >+	for (q = 0; q < fep->num_rx_queues; q++) {
 >>  >+		if (fec_enet_recover_rxq(fep, q))
 >>  >+			ret = true;
 >>  >+	}
 >>  >+
 >>  >+	return ret;
 >>  >+}
 >>  >+
 >>  > static int fec_enet_rx_napi(struct napi_struct *napi, int budget)  {
 >>  > 	struct net_device *ndev = napi->dev;
 >>  >@@ -1601,6 +1629,9 @@ static int fec_enet_rx_napi(struct napi_struct
 >> *napi,  >int budget)
 >>  > 	if (pkts < budget) {
 >>  > 		napi_complete(napi);
 >>  > 		writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
 >>  >+
 >>  >+		if (fec_enet_recover_rxqs(fep) && napi_reschedule(napi))
 >>  >+			writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK);
 >>  > 	}
 >>  > 	return pkts;
 >>  > }
 >>  >--
 >>  >2.5.5
 >>
 >
 >
 >--
 >Chris Lesiak
 >Principal Design Engineer, Software
 >LI-COR Biosciences
 >chris.lesiak@licor.com
 >
 >Any opinions expressed are those of the author and do not necessarily
 >represent those of his employer.
 >

^ permalink raw reply

* RE: [PATCH v9 0/8] thunderbolt: Introducing Thunderbolt(TM) Networking
From: Levy, Amir (Jer) @ 2016-11-20  6:30 UTC (permalink / raw)
  To: gregkh@linuxfoundation.org
  Cc: Simon Guinot, andreas.noever@gmail.com, bhelgaas@google.com,
	corbet@lwn.net, linux-kernel@vger.kernel.org,
	linux-pci@vger.kernel.org, netdev@vger.kernel.org,
	linux-doc@vger.kernel.org, mario_limonciello@dell.com,
	thunderbolt-linux, Westerberg, Mika, Winkler, Tomas,
	Zhang, Xiong Y, Jamet, Michael, remi.rerolle@seagate.com
In-Reply-To: <20161118100700.GA7126@kroah.com>

On Fri, Nov 18 2016, 12:07 PM, gregkh@linuxfoundation.org wrote:
> On Fri, Nov 18, 2016 at 08:48:36AM +0000, Levy, Amir (Jer) wrote:
> > > BTW, it is quite a shame that the Thunderbolt firmware version can't
> > > be read from Linux.
> > >
> >
> > This is WIP, once this patch will be upstream, we will be able to
> > focus more on aligning Linux with the Thunderbolt features that we have
> for windows.
> 
> Why is this patch somehow holding that work back?  You aren't just sitting
> around waiting for people to review this and not doing anything else, right?
> Is there some basic building block in these patches that your firmware
> download code is going to rely on?
> 
> confused,
> 
> greg k-h

All the Thunderbolt SW features (including networking and FW update) depend 
on the communication with FW, which is patch 3/8 in the series.
The patch also sets up a generic netlink for user space communication.
So yes, the communication with Thunderbolt FW is a basic building block.

^ permalink raw reply

* [PATCH net 02/18] net/ena: fix queues number calculation
From: Netanel Belgazal @ 2016-11-20  8:45 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-1-git-send-email-netanel@annapurnalabs.com>

The ENA driver tries to open a queue per vCPU.
To determine how many vCPUs the instance have it uses num_possible_cpus
while it should have use num_online_cpus instead.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 33a760e..8815217 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2667,7 +2667,7 @@ static int ena_calc_io_queue_num(struct pci_dev *pdev,
 		io_sq_num = get_feat_ctx->max_queues.max_sq_num;
 	}
 
-	io_queue_num = min_t(int, num_possible_cpus(), ENA_MAX_NUM_IO_QUEUES);
+	io_queue_num = min_t(int, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES);
 	io_queue_num = min_t(int, io_queue_num, io_sq_num);
 	io_queue_num = min_t(int, io_queue_num,
 			     get_feat_ctx->max_queues.max_cq_num);
-- 
1.9.1

^ permalink raw reply related

* [PATCH net 03/18] net/ena: use napi_schedule_irqoff when possible
From: Netanel Belgazal @ 2016-11-20  8:45 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-1-git-send-email-netanel@annapurnalabs.com>

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 8815217..e3acf76 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1181,7 +1181,7 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data)
 {
 	struct ena_napi *ena_napi = data;
 
-	napi_schedule(&ena_napi->napi);
+	napi_schedule_irqoff(&ena_napi->napi);
 
 	return IRQ_HANDLED;
 }
-- 
1.9.1

^ permalink raw reply related

* [PATCH net 04/18] net/ena: reduce the severity of ena printouts
From: Netanel Belgazal @ 2016-11-20  8:45 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-1-git-send-email-netanel@annapurnalabs.com>

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_com.c    | 27 +++++++++++++++++----------
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 14 +++++++++++---
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 3066d9c..6bd2b9b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -784,7 +784,7 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev,
 	int ret;
 
 	if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) {
-		pr_info("Feature %d isn't supported\n", feature_id);
+		pr_debug("Feature %d isn't supported\n", feature_id);
 		return -EPERM;
 	}
 
@@ -1126,7 +1126,13 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue,
 	comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size,
 					    comp, comp_size);
 	if (unlikely(IS_ERR(comp_ctx))) {
-		pr_err("Failed to submit command [%ld]\n", PTR_ERR(comp_ctx));
+		if (comp_ctx == ERR_PTR(-ENODEV))
+			pr_debug("Failed to submit command [%ld]\n",
+				 PTR_ERR(comp_ctx));
+		else
+			pr_err("Failed to submit command [%ld]\n",
+			       PTR_ERR(comp_ctx));
+
 		return PTR_ERR(comp_ctx);
 	}
 
@@ -1895,7 +1901,7 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu)
 	int ret;
 
 	if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) {
-		pr_info("Feature %d isn't supported\n", ENA_ADMIN_MTU);
+		pr_debug("Feature %d isn't supported\n", ENA_ADMIN_MTU);
 		return -EPERM;
 	}
 
@@ -1948,8 +1954,8 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev)
 
 	if (!ena_com_check_supported_feature_id(ena_dev,
 						ENA_ADMIN_RSS_HASH_FUNCTION)) {
-		pr_info("Feature %d isn't supported\n",
-			ENA_ADMIN_RSS_HASH_FUNCTION);
+		pr_debug("Feature %d isn't supported\n",
+			 ENA_ADMIN_RSS_HASH_FUNCTION);
 		return -EPERM;
 	}
 
@@ -2112,7 +2118,8 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev)
 
 	if (!ena_com_check_supported_feature_id(ena_dev,
 						ENA_ADMIN_RSS_HASH_INPUT)) {
-		pr_info("Feature %d isn't supported\n", ENA_ADMIN_RSS_HASH_INPUT);
+		pr_debug("Feature %d isn't supported\n",
+			 ENA_ADMIN_RSS_HASH_INPUT);
 		return -EPERM;
 	}
 
@@ -2270,8 +2277,8 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev)
 
 	if (!ena_com_check_supported_feature_id(
 		    ena_dev, ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG)) {
-		pr_info("Feature %d isn't supported\n",
-			ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG);
+		pr_debug("Feature %d isn't supported\n",
+			 ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG);
 		return -EPERM;
 	}
 
@@ -2542,8 +2549,8 @@ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev)
 
 	if (rc) {
 		if (rc == -EPERM) {
-			pr_info("Feature %d isn't supported\n",
-				ENA_ADMIN_INTERRUPT_MODERATION);
+			pr_debug("Feature %d isn't supported\n",
+				 ENA_ADMIN_INTERRUPT_MODERATION);
 			rc = 0;
 		} else {
 			pr_err("Failed to get interrupt moderation admin cmd. rc: %d\n",
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e3acf76..7247abb 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -566,6 +566,7 @@ static void ena_free_all_rx_bufs(struct ena_adapter *adapter)
  */
 static void ena_free_tx_bufs(struct ena_ring *tx_ring)
 {
+	bool print_once = true;
 	u32 i;
 
 	for (i = 0; i < tx_ring->ring_size; i++) {
@@ -577,9 +578,16 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring)
 		if (!tx_info->skb)
 			continue;
 
-		netdev_notice(tx_ring->netdev,
-			      "free uncompleted tx skb qid %d idx 0x%x\n",
-			      tx_ring->qid, i);
+		if (print_once) {
+			netdev_notice(tx_ring->netdev,
+				      "free uncompleted tx skb qid %d idx 0x%x\n",
+				      tx_ring->qid, i);
+			print_once = false;
+		} else {
+			netdev_dbg(tx_ring->netdev,
+				   "free uncompleted tx skb qid %d idx 0x%x\n",
+				   tx_ring->qid, i);
+		}
 
 		ena_buf = tx_info->bufs;
 		dma_unmap_single(tx_ring->dev,
-- 
1.9.1

^ permalink raw reply related

* [PATCH net 05/18] net/ena: add hardware hints capability to the driver
From: Netanel Belgazal @ 2016-11-20  8:45 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-1-git-send-email-netanel@annapurnalabs.com>

The ENA device can update the ena driver about the desire timeouts.
The hardware hints are transmitted as Asynchronous event to the driver.

In case the device does not support this capability, the driver
will use its own defines.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_admin_defs.h | 31 +++++++++
 drivers/net/ethernet/amazon/ena/ena_com.c        | 41 ++++++++---
 drivers/net/ethernet/amazon/ena/ena_com.h        |  5 ++
 drivers/net/ethernet/amazon/ena/ena_ethtool.c    |  1 -
 drivers/net/ethernet/amazon/ena/ena_netdev.c     | 86 +++++++++++++++++++-----
 drivers/net/ethernet/amazon/ena/ena_netdev.h     | 19 +++++-
 drivers/net/ethernet/amazon/ena/ena_regs_defs.h  |  2 +
 7 files changed, 157 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
index a46e749..51b2a92 100644
--- a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
@@ -70,6 +70,8 @@ enum ena_admin_aq_feature_id {
 
 	ENA_ADMIN_MAX_QUEUES_NUM		= 2,
 
+	ENA_ADMIN_HW_HINTS			= 3,
+
 	ENA_ADMIN_RSS_HASH_FUNCTION		= 10,
 
 	ENA_ADMIN_STATELESS_OFFLOAD_CONFIG	= 11,
@@ -749,6 +751,31 @@ struct ena_admin_feature_rss_ind_table {
 	struct ena_admin_rss_ind_table_entry inline_entry;
 };
 
+/* When hint value is 0, driver should use it's own predefined value */
+struct ena_admin_ena_hw_hints {
+	/* value in ms */
+	u16 mmio_read_timeout;
+
+	/* value in ms */
+	u16 driver_watchdog_timeout;
+
+	/* Per packet tx completion timeout. value in ms */
+	u16 missing_tx_completion_timeout;
+
+	u16 missed_tx_completion_count_threshold_to_reset;
+
+	/* value in ms */
+	u16 admin_completion_tx_timeout;
+
+	u16 netdev_wd_timeout;
+
+	u16 max_tx_sgl_size;
+
+	u16 max_rx_sgl_size;
+
+	u16 reserved[8];
+};
+
 struct ena_admin_get_feat_cmd {
 	struct ena_admin_aq_common_desc aq_common_descriptor;
 
@@ -782,6 +809,8 @@ struct ena_admin_get_feat_resp {
 		struct ena_admin_feature_rss_ind_table ind_table;
 
 		struct ena_admin_feature_intr_moder_desc intr_moderation;
+
+		struct ena_admin_ena_hw_hints hw_hints;
 	} u;
 };
 
@@ -857,6 +886,8 @@ enum ena_admin_aenq_notification_syndrom {
 	ENA_ADMIN_SUSPEND	= 0,
 
 	ENA_ADMIN_RESUME	= 1,
+
+	ENA_ADMIN_UPDATE_HINTS	= 2,
 };
 
 struct ena_admin_aenq_entry {
diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 6bd2b9b..366c2c5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -508,15 +508,13 @@ static int ena_com_comp_status_to_errno(u8 comp_status)
 static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx,
 						     struct ena_com_admin_queue *admin_queue)
 {
-	unsigned long flags;
-	u32 start_time;
+	unsigned long flags, timeout;
 	int ret;
 
-	start_time = ((u32)jiffies_to_usecs(jiffies));
+	timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout);
 
 	while (comp_ctx->status == ENA_CMD_SUBMITTED) {
-		if ((((u32)jiffies_to_usecs(jiffies)) - start_time) >
-		    ADMIN_CMD_TIMEOUT_US) {
+		if (time_is_before_jiffies(timeout)) {
 			pr_err("Wait for completion (polling) timeout\n");
 			/* ENA didn't have any completion */
 			spin_lock_irqsave(&admin_queue->q_lock, flags);
@@ -560,7 +558,8 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com
 	int ret;
 
 	wait_for_completion_timeout(&comp_ctx->wait_event,
-				    usecs_to_jiffies(ADMIN_CMD_TIMEOUT_US));
+				    usecs_to_jiffies(
+					    admin_queue->completion_timeout));
 
 	/* In case the command wasn't completed find out the root cause.
 	 * There might be 2 kinds of errors
@@ -600,12 +599,14 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
 	struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
 	volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp =
 		mmio_read->read_resp;
-	u32 mmio_read_reg, ret;
+	u32 mmio_read_reg, timeout, ret;
 	unsigned long flags;
 	int i;
 
 	might_sleep();
 
+	timeout = mmio_read->reg_read_to ? : ENA_REG_READ_TIMEOUT;
+
 	/* If readless is disabled, perform regular read */
 	if (!mmio_read->readless_supported)
 		return readl(ena_dev->reg_bar + offset);
@@ -626,14 +627,14 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
 
 	writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF);
 
-	for (i = 0; i < ENA_REG_READ_TIMEOUT; i++) {
+	for (i = 0; i < timeout; i++) {
 		if (read_resp->req_id == mmio_read->seq_num)
 			break;
 
 		udelay(1);
 	}
 
-	if (unlikely(i == ENA_REG_READ_TIMEOUT)) {
+	if (unlikely(i == timeout)) {
 		pr_err("reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n",
 		       mmio_read->seq_num, offset, read_resp->req_id,
 		       read_resp->reg_off);
@@ -1723,6 +1724,20 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev,
 	memcpy(&get_feat_ctx->offload, &get_resp.u.offload,
 	       sizeof(get_resp.u.offload));
 
+	/* Driver hints isn't mandatory admin command. So in case the
+	 * command isn't supported set driver hints to 0
+	 */
+	rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS);
+
+	if (!rc)
+		memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints,
+		       sizeof(get_resp.u.hw_hints));
+	else if (rc == -EPERM)
+		memset(&get_feat_ctx->hw_hints, 0x0,
+		       sizeof(get_feat_ctx->hw_hints));
+	else
+		return rc;
+
 	return 0;
 }
 
@@ -1848,6 +1863,14 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev)
 		return rc;
 	}
 
+	timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
+		ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	if (timeout)
+		/* the resolution of timeout reg is 100ms */
+		ena_dev->admin_queue.completion_timeout = timeout * 100000;
+	else
+		ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US;
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h
index 509d7b8..6883ee5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.h
+++ b/drivers/net/ethernet/amazon/ena/ena_com.h
@@ -96,6 +96,8 @@
 #define ENA_INTR_MODER_LEVEL_STRIDE			2
 #define ENA_INTR_BYTE_COUNT_NOT_SUPPORTED		0xFFFFFF
 
+#define ENA_HW_HINTS_NO_TIMEOUT				0xFFFF
+
 enum ena_intr_moder_level {
 	ENA_INTR_MODER_LOWEST = 0,
 	ENA_INTR_MODER_LOW,
@@ -232,6 +234,7 @@ struct ena_com_admin_queue {
 	void *q_dmadev;
 	spinlock_t q_lock; /* spinlock for the admin queue */
 	struct ena_comp_ctx *comp_ctx;
+	u32 completion_timeout;
 	u16 q_depth;
 	struct ena_com_admin_cq cq;
 	struct ena_com_admin_sq sq;
@@ -266,6 +269,7 @@ struct ena_com_aenq {
 struct ena_com_mmio_read {
 	struct ena_admin_ena_mmio_req_read_less_resp *read_resp;
 	dma_addr_t read_resp_dma_addr;
+	u32 reg_read_to; /* in us */
 	u16 seq_num;
 	bool readless_supported;
 	/* spin lock to ensure a single outstanding read */
@@ -335,6 +339,7 @@ struct ena_com_dev_get_features_ctx {
 	struct ena_admin_device_attr_feature_desc dev_attr;
 	struct ena_admin_feature_aenq_desc aenq;
 	struct ena_admin_feature_offload_desc offload;
+	struct ena_admin_ena_hw_hints hw_hints;
 };
 
 struct ena_com_create_io_ctx {
diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
index 67b2338f..a1fbfc2 100644
--- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
@@ -80,7 +80,6 @@ struct ena_stats {
 	ENA_STAT_TX_ENTRY(tx_poll),
 	ENA_STAT_TX_ENTRY(doorbells),
 	ENA_STAT_TX_ENTRY(prepare_ctx_err),
-	ENA_STAT_TX_ENTRY(missing_tx_comp),
 	ENA_STAT_TX_ENTRY(bad_req_id),
 };
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 7247abb..e7dda8b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1989,6 +1989,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	tx_info->tx_descs = nb_hw_desc;
 	tx_info->last_jiffies = jiffies;
+	tx_info->print_once = 0;
 
 	tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
 		tx_ring->ring_size);
@@ -2542,33 +2543,34 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
 	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
 		return;
 
+	if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT)
+		return;
+
 	budget = ENA_MONITORED_TX_QUEUES;
 
 	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
 		tx_ring = &adapter->tx_ring[i];
 
+		missed_tx = 0;
+
 		for (j = 0; j < tx_ring->ring_size; j++) {
 			tx_buf = &tx_ring->tx_buffer_info[j];
 			last_jiffies = tx_buf->last_jiffies;
-			if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
-				netif_notice(adapter, tx_err, adapter->netdev,
-					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
-					     tx_ring->qid, j);
+			if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + adapter->missing_tx_completion_to))) {
+				if (!tx_buf->print_once)
+					netif_notice(adapter, tx_err, adapter->netdev,
+						     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
+						     tx_ring->qid, j);
 
-				u64_stats_update_begin(&tx_ring->syncp);
-				missed_tx = tx_ring->tx_stats.missing_tx_comp++;
-				u64_stats_update_end(&tx_ring->syncp);
+				tx_buf->print_once = 1;
+				missed_tx++;
 
-				/* Clear last jiffies so the lost buffer won't
-				 * be counted twice.
-				 */
-				tx_buf->last_jiffies = 0;
-
-				if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
+				if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) {
 					netif_err(adapter, tx_err, adapter->netdev,
 						  "The number of lost tx completion is above the threshold (%d > %d). Reset the device\n",
-						  missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
+						  missed_tx, adapter->missing_tx_completion_threshold);
 					set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+					return;
 				}
 			}
 		}
@@ -2589,8 +2591,11 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter)
 	if (!adapter->wd_state)
 		return;
 
-	keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies
-					   + ENA_DEVICE_KALIVE_TIMEOUT);
+	if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT)
+		return;
+
+	keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies +
+					   adapter->keep_alive_timeout);
 	if (unlikely(time_is_before_jiffies(keep_alive_expired))) {
 		netif_err(adapter, drv, adapter->netdev,
 			  "Keep alive watchdog timeout.\n");
@@ -2613,6 +2618,44 @@ static void check_for_admin_com_state(struct ena_adapter *adapter)
 	}
 }
 
+static void ena_update_hints(struct ena_adapter *adapter,
+			     struct ena_admin_ena_hw_hints *hints)
+{
+	struct net_device *netdev = adapter->netdev;
+
+	if (hints->admin_completion_tx_timeout)
+		adapter->ena_dev->admin_queue.completion_timeout =
+			hints->admin_completion_tx_timeout * 1000;
+
+	if (hints->mmio_read_timeout)
+		/* convert to usec */
+		adapter->ena_dev->mmio_read.reg_read_to =
+			hints->mmio_read_timeout * 1000;
+
+	if (hints->missed_tx_completion_count_threshold_to_reset)
+		adapter->missing_tx_completion_threshold =
+			hints->missed_tx_completion_count_threshold_to_reset;
+
+	if (hints->missing_tx_completion_timeout) {
+		if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT)
+			adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT;
+		else
+			adapter->missing_tx_completion_to =
+				msecs_to_jiffies(hints->missing_tx_completion_timeout);
+	}
+
+	if (hints->netdev_wd_timeout)
+		netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout);
+
+	if (hints->driver_watchdog_timeout) {
+		if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT)
+			adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT;
+		else
+			adapter->keep_alive_timeout =
+				msecs_to_jiffies(hints->driver_watchdog_timeout);
+	}
+}
+
 static void ena_update_host_info(struct ena_admin_host_info *host_info,
 				 struct net_device *netdev)
 {
@@ -3024,6 +3067,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	INIT_WORK(&adapter->reset_task, ena_fw_reset_device);
 
 	adapter->last_keep_alive_jiffies = jiffies;
+	adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT;
+	adapter->missing_tx_completion_to = TX_TIMEOUT;
+	adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS;
+
+	ena_update_hints(adapter, &get_feat_ctx.hw_hints);
 
 	init_timer(&adapter->timer_service);
 	adapter->timer_service.expires = round_jiffies(jiffies + HZ);
@@ -3232,6 +3280,7 @@ static void ena_notification(void *adapter_data,
 			     struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
+	struct ena_admin_ena_hw_hints *hints;
 
 	WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION,
 	     "Invalid group(%x) expected %x\n",
@@ -3249,6 +3298,11 @@ static void ena_notification(void *adapter_data,
 	case ENA_ADMIN_RESUME:
 		queue_work(ena_wq, &adapter->resume_io_task);
 		break;
+	case ENA_ADMIN_UPDATE_HINTS:
+		hints = (struct ena_admin_ena_hw_hints *)
+			(&aenq_e->inline_data_w4);
+		ena_update_hints(adapter, hints);
+		break;
 	default:
 		netif_err(adapter, drv, adapter->netdev,
 			  "Invalid aenq notification link state %d\n",
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 69d7e9e..f8ef1f0 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -146,7 +146,18 @@ struct ena_tx_buffer {
 	u32 tx_descs;
 	/* num of buffers used by this skb */
 	u32 num_of_bufs;
-	/* Save the last jiffies to detect missing tx packets */
+
+	/* Used for detect missing tx packets to limit the number of prints */
+	u32 print_once;
+	/* Save the last jiffies to detect missing tx packets
+	 *
+	 * sets to non zero value on ena_start_xmit and set to zero on
+	 * napi and timer_Service_routine.
+	 *
+	 * while this value is not protected by lock,
+	 * a given packet is not expected to be handled by ena_start_xmit
+	 * and by napi/timer_service at the same time.
+	 */
 	unsigned long last_jiffies;
 	struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
 } ____cacheline_aligned;
@@ -170,7 +181,6 @@ struct ena_stats_tx {
 	u64 napi_comp;
 	u64 tx_poll;
 	u64 doorbells;
-	u64 missing_tx_comp;
 	u64 bad_req_id;
 };
 
@@ -269,6 +279,8 @@ struct ena_adapter {
 	struct msix_entry *msix_entries;
 	int msix_vecs;
 
+	u32 missing_tx_completion_threshold;
+
 	u32 tx_usecs, rx_usecs; /* interrupt moderation */
 	u32 tx_frames, rx_frames; /* interrupt moderation */
 
@@ -282,6 +294,9 @@ struct ena_adapter {
 
 	u8 mac_addr[ETH_ALEN];
 
+	unsigned long keep_alive_timeout;
+	unsigned long missing_tx_completion_to;
+
 	char name[ENA_NAME_MAX_LEN];
 
 	unsigned long flags;
diff --git a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
index 26097a2..c3891c5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
@@ -78,6 +78,8 @@
 #define ENA_REGS_CAPS_RESET_TIMEOUT_MASK		0x3e
 #define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT		8
 #define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK		0xff00
+#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT		16
+#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK		0xf0000
 
 /* aq_caps register */
 #define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK		0xffff
-- 
1.9.1

^ permalink raw reply related

* [PATCH net 07/18] net/ena: refactor ena_get_stats64 to be atomic context safe
From: Netanel Belgazal @ 2016-11-20  8:45 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-1-git-send-email-netanel@annapurnalabs.com>

ndo_get_stat64 can be called from atomic context.
However the current implementation sends an admin command to retrieve
the statistics from the device.
This admin commands uses sleep.

Refactor the implementation of ena_get_stats64 to take the
{rx,tx}bytes/cnt from the driver's inner counters
and to take the rx drops counter
from the asynchronous keep alive (heart bit) event.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_admin_defs.h |  8 ++++
 drivers/net/ethernet/amazon/ena/ena_netdev.c     | 57 +++++++++++++++++-------
 drivers/net/ethernet/amazon/ena/ena_netdev.h     |  1 +
 3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
index c78f0b2..35ae511 100644
--- a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
@@ -904,6 +904,14 @@ struct ena_admin_aenq_link_change_desc {
 	u32 flags;
 };
 
+struct ena_admin_aenq_keep_alive_desc {
+	struct ena_admin_aenq_common_desc aenq_common_desc;
+
+	u32 rx_drops_low;
+
+	u32 rx_drops_high;
+};
+
 struct ena_admin_ena_mmio_req_read_less_resp {
 	u16 req_id;
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e7dda8b..44dc298 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2185,28 +2185,46 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev,
 						 struct rtnl_link_stats64 *stats)
 {
 	struct ena_adapter *adapter = netdev_priv(netdev);
-	struct ena_admin_basic_stats ena_stats;
-	int rc;
+	struct ena_ring *rx_ring, *tx_ring;
+	unsigned int start;
+	u64 rx_drops;
+	int i;
 
 	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
 		return NULL;
 
-	rc = ena_com_get_dev_basic_stats(adapter->ena_dev, &ena_stats);
-	if (rc)
-		return NULL;
+	for (i = 0; i < adapter->num_queues; i++) {
+		u64 bytes, packets;
+
+		tx_ring = &adapter->tx_ring[i];
+
+		do {
+			start = u64_stats_fetch_begin_irq(&tx_ring->syncp);
+			packets = tx_ring->tx_stats.cnt;
+			bytes = tx_ring->tx_stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start));
+
+		stats->tx_packets += packets;
+		stats->tx_bytes += bytes;
 
-	stats->tx_bytes = ((u64)ena_stats.tx_bytes_high << 32) |
-		ena_stats.tx_bytes_low;
-	stats->rx_bytes = ((u64)ena_stats.rx_bytes_high << 32) |
-		ena_stats.rx_bytes_low;
+		rx_ring = &adapter->rx_ring[i];
+
+		do {
+			start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
+			packets = rx_ring->rx_stats.cnt;
+			bytes = rx_ring->rx_stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
 
-	stats->rx_packets = ((u64)ena_stats.rx_pkts_high << 32) |
-		ena_stats.rx_pkts_low;
-	stats->tx_packets = ((u64)ena_stats.tx_pkts_high << 32) |
-		ena_stats.tx_pkts_low;
+		stats->rx_packets += packets;
+		stats->rx_bytes += bytes;
+	}
+
+	do {
+		start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
+		rx_drops = adapter->dev_stats.rx_drops;
+	} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
 
-	stats->rx_dropped = ((u64)ena_stats.rx_drops_high << 32) |
-		ena_stats.rx_drops_low;
+	stats->rx_dropped = rx_drops;
 
 	stats->multicast = 0;
 	stats->collisions = 0;
@@ -3272,8 +3290,17 @@ static void ena_keep_alive_wd(void *adapter_data,
 			      struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
+	struct ena_admin_aenq_keep_alive_desc *desc;
+	u64 rx_drops;
 
+	desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e;
 	adapter->last_keep_alive_jiffies = jiffies;
+
+	rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low;
+
+	u64_stats_update_begin(&adapter->syncp);
+	adapter->dev_stats.rx_drops = rx_drops;
+	u64_stats_update_end(&adapter->syncp);
 }
 
 static void ena_notification(void *adapter_data,
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index f8ef1f0..2897fab 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -251,6 +251,7 @@ struct ena_stats_dev {
 	u64 interface_up;
 	u64 interface_down;
 	u64 admin_q_pause;
+	u64 rx_drops;
 };
 
 enum ena_flags_t {
-- 
1.9.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox