Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next PATCH v3 3/6] virtio_net: Add XDP support
From: John Fastabend @ 2016-11-29 20:10 UTC (permalink / raw)
  To: eric.dumazet, daniel, shm, davem, tgraf, alexei.starovoitov
  Cc: john.r.fastabend, netdev, bblanco, john.fastabend, brouer
In-Reply-To: <20161129200933.26851.41883.stgit@john-Precision-Tower-5810>

From: John Fastabend <john.fastabend@gmail.com>

This adds XDP support to virtio_net. Some requirements must be
met for XDP to be enabled depending on the mode. First it will
only be supported with LRO disabled so that data is not pushed
across multiple buffers. Second the MTU must be less than a page
size to avoid having to handle XDP across multiple pages.

If mergeable receive is enabled this patch only supports the case
where header and data are in the same buf which we can check when
a packet is received by looking at num_buf. If the num_buf is
greater than 1 and a XDP program is loaded the packet is dropped
and a warning is thrown. When any_header_sg is set this does not
happen and both header and data is put in a single buffer as expected
so we check this when XDP programs are loaded.  Subsequent patches
will process the packet in a degraded mode to ensure connectivity
and correctness is not lost even if backend pushes packets into
multiple buffers.

If big packets mode is enabled and MTU/LRO conditions above are
met then XDP is allowed.

This patch was tested with qemu with vhost=on and vhost=off where
mergable and big_packet modes were forced via hard coding feature
negotiation. Multiple buffers per packet was forced via a small
test patch to vhost.c in the vhost=on qemu mode.

Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |  154 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8189e5b..32126bf 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
+#include <linux/bpf.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -81,6 +82,8 @@ struct receive_queue {
 
 	struct napi_struct napi;
 
+	struct bpf_prog __rcu *xdp_prog;
+
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
@@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+static u32 do_xdp_prog(struct virtnet_info *vi,
+		       struct bpf_prog *xdp_prog,
+		       struct page *page, int offset, int len)
+{
+	int hdr_padded_len;
+	struct xdp_buff xdp;
+	u32 act;
+	u8 *buf;
+
+	buf = page_address(page) + offset;
+
+	if (vi->mergeable_rx_bufs)
+		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	else
+		hdr_padded_len = sizeof(struct padded_vnet_hdr);
+
+	xdp.data = buf + hdr_padded_len;
+	xdp.data_end = xdp.data + (len - vi->hdr_len);
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+	switch (act) {
+	case XDP_PASS:
+		return XDP_PASS;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_TX:
+	case XDP_ABORTED:
+	case XDP_DROP:
+		return XDP_DROP;
+	}
+}
+
 static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
@@ -340,14 +375,28 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   void *buf,
 				   unsigned int len)
 {
+	struct bpf_prog *xdp_prog;
 	struct page *page = buf;
-	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+	struct sk_buff *skb;
 
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		u32 act = do_xdp_prog(vi, xdp_prog, page, 0, len);
+
+		if (act == XDP_DROP)
+			goto err_xdp;
+	}
+	rcu_read_unlock();
+
+	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
 	return skb;
 
+err_xdp:
+	rcu_read_unlock();
 err:
 	dev->stats.rx_dropped++;
 	give_pages(rq, page);
@@ -366,10 +415,27 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	struct sk_buff *head_skb, *curr_skb;
+	struct bpf_prog *xdp_prog;
 
-	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
-					       truesize);
-	struct sk_buff *curr_skb = head_skb;
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		u32 act;
+
+		if (num_buf > 1) {
+			bpf_warn_invalid_xdp_buffer();
+			goto err_xdp;
+		}
+
+		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
+		if (act == XDP_DROP)
+			goto err_xdp;
+	}
+	rcu_read_unlock();
+
+	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
+	curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
@@ -423,6 +489,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
+err_xdp:
+	rcu_read_unlock();
 err_skb:
 	put_page(page);
 	while (--num_buf) {
@@ -1328,6 +1396,13 @@ static int virtnet_set_channels(struct net_device *dev,
 	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
 		return -EINVAL;
 
+	/* For now we don't support modifying channels while XDP is loaded
+	 * also when XDP is loaded all RX queues have XDP programs so we only
+	 * need to check a single RX queue.
+	 */
+	if (vi->rq[0].xdp_prog)
+		return -EINVAL;
+
 	get_online_cpus();
 	err = virtnet_set_queues(vi, queue_pairs);
 	if (!err) {
@@ -1454,6 +1529,68 @@ static int virtnet_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	int i;
+
+	if ((dev->features & NETIF_F_LRO) && prog) {
+		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
+		return -EINVAL;
+	}
+
+	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
+		netdev_warn(dev, "XDP expects header/data in single page\n");
+		return -EINVAL;
+	}
+
+	if (dev->mtu > PAGE_SIZE) {
+		netdev_warn(dev, "XDP requires MTU less than %lu\n", PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (prog) {
+		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static bool virtnet_xdp_query(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		if (vi->rq[i].xdp_prog)
+			return true;
+	}
+	return false;
+}
+
+static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return virtnet_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = virtnet_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1471,6 +1608,7 @@ static int virtnet_set_features(struct net_device *netdev,
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
 	.ndo_set_features	= virtnet_set_features,
+	.ndo_xdp		= virtnet_xdp,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1527,12 +1665,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 
 static void free_receive_bufs(struct virtnet_info *vi)
 {
+	struct bpf_prog *old_prog;
 	int i;
 
+	rtnl_lock();
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		while (vi->rq[i].pages)
 			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
+		if (old_prog)
+			bpf_prog_put(old_prog);
 	}
+	rtnl_unlock();
 }
 
 static void free_receive_page_frags(struct virtnet_info *vi)

^ permalink raw reply related

* [net-next PATCH v3 2/6] net: xdp: add invalid buffer warning
From: John Fastabend @ 2016-11-29 20:09 UTC (permalink / raw)
  To: eric.dumazet, daniel, shm, davem, tgraf, alexei.starovoitov
  Cc: john.r.fastabend, netdev, bblanco, john.fastabend, brouer
In-Reply-To: <20161129200933.26851.41883.stgit@john-Precision-Tower-5810>

This adds a warning for drivers to use when encountering an invalid
buffer for XDP. For normal cases this should not happen but to catch
this in virtual/qemu setups that I may not have expected from the
emulation layer having a standard warning is useful.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/filter.h |    1 +
 net/core/filter.c      |    6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c52..0c79004 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -595,6 +595,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 void bpf_warn_invalid_xdp_action(u32 act);
+void bpf_warn_invalid_xdp_buffer(void);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
diff --git a/net/core/filter.c b/net/core/filter.c
index dece94f..5f3ed4e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2774,6 +2774,12 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+void bpf_warn_invalid_xdp_buffer(void)
+{
+	WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n");
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
+
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					int src_reg, int ctx_off,
 					struct bpf_insn *insn_buf,

^ permalink raw reply related

* [net-next PATCH v3 1/6] net: virtio dynamically disable/enable LRO
From: John Fastabend @ 2016-11-29 20:09 UTC (permalink / raw)
  To: eric.dumazet, daniel, shm, davem, tgraf, alexei.starovoitov
  Cc: john.r.fastabend, netdev, bblanco, john.fastabend, brouer

This adds support for dynamically setting the LRO feature flag. The
message to control guest features in the backend uses the
CTRL_GUEST_OFFLOADS msg type.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/virtio_net.c |   45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ca5239a..8189e5b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
 	.set_settings = virtnet_set_settings,
 };
 
+static int virtnet_set_features(struct net_device *netdev,
+				netdev_features_t features)
+{
+	struct virtnet_info *vi = netdev_priv(netdev);
+	struct virtio_device *vdev = vi->vdev;
+	struct scatterlist sg;
+	u64 offloads = 0;
+
+	if (features & NETIF_F_LRO)
+		offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
+			    (1 << VIRTIO_NET_F_GUEST_TSO6);
+
+	if (features & NETIF_F_RXCSUM)
+		offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
+
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
+		sg_init_one(&sg, &offloads, sizeof(uint64_t));
+		if (!virtnet_send_command(vi,
+					  VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+					  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
+					  &sg)) {
+			dev_warn(&netdev->dev,
+				 "Failed to set guest offloads by virtnet command.\n");
+			return -EINVAL;
+		}
+	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
+		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+		dev_warn(&netdev->dev,
+			 "No support for setting offloads pre version_1.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1435,6 +1470,7 @@ static void virtnet_init_settings(struct net_device *dev)
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
+	.ndo_set_features	= virtnet_set_features,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1810,6 +1846,12 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
 		dev->features |= NETIF_F_RXCSUM;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) &&
+	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+		dev->features |= NETIF_F_LRO;
+		dev->hw_features |= NETIF_F_LRO;
+	}
+
 	dev->vlan_features = dev->features;
 
 	/* MTU range: 68 - 65535 */
@@ -2047,7 +2089,8 @@ static int virtnet_restore(struct virtio_device *vdev)
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
-	VIRTIO_NET_F_MTU
+	VIRTIO_NET_F_MTU, \
+	VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,

^ permalink raw reply related

* [net-next PATCH v3 0/6] XDP for virtio_net
From: John Fastabend @ 2016-11-29 20:05 UTC (permalink / raw)
  To: tgraf, shm, alexei.starovoitov, daniel, davem
  Cc: john.r.fastabend, netdev, bblanco, john.fastabend, brouer

This implements virtio_net for the mergeable buffers and big_packet
modes. I tested this with vhost_net running on qemu and did not see
any issues. For testing num_buf > 1 I added a hack to vhost driver
to only but 100 bytes per buffer.

There are some restrictions for XDP to be enabled and work well
(see patch 3) for more details.

  1. LRO must be off
  2. MTU must be less than PAGE_SIZE
  3. queues must be available to dedicate to XDP
  4. num_bufs received in mergeable buffers must be 1
  5. big_packet mode must have all data on single page

Please review any comments/feedback welcome as always.

v2, fixes rcu usage throughout thanks to Eric and the use of
num_online_cpus() usage thanks to Jakub.

v3, add slowpath patch to handle num_bufs > 1

Thanks,
John
---

John Fastabend (6):
      net: virtio dynamically disable/enable LRO
      net: xdp: add invalid buffer warning
      virtio_net: Add XDP support
      virtio_net: add dedicated XDP transmit queues
      virtio_net: add XDP_TX support
      virtio_net: xdp, add slowpath case for non contiguous buffers

 drivers/net/virtio_net.c |  344 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/filter.h   |    1 
 net/core/filter.c        |    6 +
 3 files changed, 346 insertions(+), 5 deletions(-)

^ permalink raw reply

* Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications
From: Alexei Starovoitov @ 2016-11-29 20:01 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, daniel, ast, daniel, maheshb, tgraf
In-Reply-To: <1480434813-3141-3-git-send-email-dsa@cumulusnetworks.com>

On Tue, Nov 29, 2016 at 07:53:32AM -0800, David Ahern wrote:
> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> Currently only sk_bound_dev_if is exported to userspace for modification
> by a bpf program.
> 
> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> by processes are automatically bound to a specific device. In turn, this
> enables the running of programs that do not support SO_BINDTODEVICE in a
> specific VRF context / L3 domain.
> 
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
...
> +struct bpf_sock {
> +	__u32 bound_dev_if;
> +};

overall looks great to me.
Could you also expose sk_protcol and sk_type as read only fields?
They have user space visible values already and will make this new
BPF_PROG_TYPE_CGROUP_SOCK program type much more useful beyond vrf
use case. Like we'll be able to write a tiny bpf program to block all
raw sockets or all udp sockets for an application within a given cgroup.
If someone would want to prevent udp traffic from an application,
it can be done here within close to zero overhead for socket() syscall
instead of checking every packet at networking layer.
It can help vrf use case as well, so you can auto-bindtodevice
only udp and tcp sockets instead of all... or do it for ipv4 or ipv6 only.
Plenty of interesting opportunities with just two extra fields
that cost nothing when not in use.

^ permalink raw reply

* Re: [PATCH v3] ethernet :mellanox :mlx4: Replace pci_pool_alloc by pci_pool_zalloc
From: Yuval Shaia @ 2016-11-29 19:58 UTC (permalink / raw)
  To: Souptick Joarder
  Cc: sergei.shtylyov, yishaih, netdev, linux-rdma, sahu.rameshwar73
In-Reply-To: <20161129194611.GA4088@jordon-HP-15-Notebook-PC>

On Wed, Nov 30, 2016 at 01:16:12AM +0530, Souptick Joarder wrote:
> In mlx4_alloc_cmd_mailbox(), pci_pool_alloc() followed by memset will be
> replaced by pci_pool_zalloc()
> 
> Signed-off-by: Souptick joarder <jrdr.linux@gmail.com>
> ---
> v3:
>   - Fixed alignment issues

You mean remove extra empty line?

Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>

> 
> v2:
>   - Address comment from sergei
>     Alignment was not proper
> 
>  drivers/net/ethernet/mellanox/mlx4/cmd.c | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> index e36bebc..a49072b4 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> @@ -2679,15 +2679,13 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
>  	if (!mailbox)
>  		return ERR_PTR(-ENOMEM);
> 
> -	mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
> -				      &mailbox->dma);
> +	mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
> +				       &mailbox->dma);
>  	if (!mailbox->buf) {
>  		kfree(mailbox);
>  		return ERR_PTR(-ENOMEM);
>  	}
> 
> -	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
> -
>  	return mailbox;
>  }
>  EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
> --
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: 4.9-rc7: (forcedeth?) BUG: sleeping function called from invalid context at kernel/irq/manage.c:110
From: Eric Dumazet @ 2016-11-29 19:58 UTC (permalink / raw)
  To: Meelis Roos; +Cc: Linus Torvalds, Linux Kernel list, netdev
In-Reply-To: <alpine.LRH.2.20.1611292309490.31348@math.ut.ee>

On Tue, 2016-11-29 at 23:16 +0200, Meelis Roos wrote:
> > On Tue, Nov 29, 2016 at 12:26 PM, Meelis Roos <mroos@linux.ee> wrote:
> > > This is 4.9-rc7 on Sun Ultra 20 (Opteron 175 on NVidia chipset PC with
> > > NVidia ethernet).
> > >
> > > BUG: sleeping function called from invalid context at kernel/irq/manage.c:110
> > 
> > Hmm. No changes in either forcedeth or in the synchronize_irq() debugging.
> [...]
> > But none of this looks new. I don't see _anything_ in any of these
> > areas that has changed since 4.8.
> > 
> > Which is why I suspect you changed something in your setup wrt
> > netconsole or your kernel config?
> 
> No changes that I could see. Only answered oldconfig questions, diff 
> below. /etc/default/grub is from May so not changed recently. Just 
> verified that 4.8.0 dmesg did not contain these warnings.
> 
> Conf diff and current config are below:

nv_do_nic_poll() is simply buggy and needs a fix.

synchronize_irq() can sleep.

^ permalink raw reply

* Re: [PATCH] debugfs: improve formatting of debugfs_real_fops()
From: Greg Kroah-Hartman @ 2016-11-29 19:58 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, Nicolai Stange, Christian Lamparter
In-Reply-To: <1478798629-22318-1-git-send-email-jakub.kicinski@netronome.com>

On Thu, Nov 10, 2016 at 05:23:49PM +0000, Jakub Kicinski wrote:
> Type of debugfs_real_fops() is longer than parameters and
> the name, so there is no way to break the declaration nicely.
> We have to go over 80 characters.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
>  include/linux/debugfs.h | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)

Doesn't apply to my tree, what did you make this against?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH net-next] bpf: add test for the verifier equal logic bug
From: Josef Bacik @ 2016-11-29 19:50 UTC (permalink / raw)
  To: Daniel Borkmann, davem, netdev, ast, jannh, kernel-team
In-Reply-To: <583DD1C0.1010209@iogearbox.net>

On 11/29/2016 02:06 PM, Daniel Borkmann wrote:
> On 11/29/2016 06:35 PM, Josef Bacik wrote:
>> This is a test to verify that
>>
>> bpf: fix states equal logic for varlen access
>>
>> actually fixed the problem.  The problem was if the register we added to our map
>> register was UNKNOWN in both the false and true branches and the only thing that
>> changed was the range then we'd incorrectly assume that the true branch was
>> valid, which it really wasnt.  This tests this case and properly fails without
>> my fix in place and passes with it in place.
>>
>> Signed-off-by: Josef Bacik <jbacik@fb.com>
>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>
> Thanks a lot for the test case! They are always useful to have ... which
> just reminds me: it seems we didn't add anything for f23cc643f9ba ("bpf:
> fix range arithmetic for bpf map access"). ;-)

I was hoping you wouldn't notice ;).  I'll add one in the next couple of days. 
Thanks,

Josef

^ permalink raw reply

* Re: 4.9-rc7: (forcedeth?) BUG: sleeping function called from invalid context at kernel/irq/manage.c:110
From: Linus Torvalds @ 2016-11-29 19:50 UTC (permalink / raw)
  To: Meelis Roos; +Cc: Linux Kernel list, Network Development
In-Reply-To: <alpine.LRH.2.20.1611292309490.31348@math.ut.ee>

On Tue, Nov 29, 2016 at 1:16 PM, Meelis Roos <mroos@linux.ee> wrote:
> [...]
>> But none of this looks new. I don't see _anything_ in any of these
>> areas that has changed since 4.8.
>>
>> Which is why I suspect you changed something in your setup wrt
>> netconsole or your kernel config?
>
> No changes that I could see. Only answered oldconfig questions, diff
> below. /etc/default/grub is from May so not changed recently. Just
> verified that 4.8.0 dmesg did not contain these warnings.

Hmm. I wonder if it might just be timing (the netpoll_poll_dev() thing
is only called conditionally, and it depends on magic details and I
guess you might just have gotten unlucky).

But it would be good if you can bisect it...

           Linus

^ permalink raw reply

* [PATCH v3] ethernet :mellanox :mlx4: Replace pci_pool_alloc by pci_pool_zalloc
From: Souptick Joarder @ 2016-11-29 19:46 UTC (permalink / raw)
  To: sergei.shtylyov, yishaih; +Cc: netdev, linux-rdma, sahu.rameshwar73

In mlx4_alloc_cmd_mailbox(), pci_pool_alloc() followed by memset will be
replaced by pci_pool_zalloc()

Signed-off-by: Souptick joarder <jrdr.linux@gmail.com>
---
v3:
  - Fixed alignment issues

v2:
  - Address comment from sergei
    Alignment was not proper

 drivers/net/ethernet/mellanox/mlx4/cmd.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index e36bebc..a49072b4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -2679,15 +2679,13 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
 	if (!mailbox)
 		return ERR_PTR(-ENOMEM);

-	mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
-				      &mailbox->dma);
+	mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+				       &mailbox->dma);
 	if (!mailbox->buf) {
 		kfree(mailbox);
 		return ERR_PTR(-ENOMEM);
 	}

-	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
-
 	return mailbox;
 }
 EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);

^ permalink raw reply related

* Re: [PATCH net-next v5 1/3] bpf: Refactor cgroups code in prep for new type
From: Alexei Starovoitov @ 2016-11-29 19:41 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, daniel, ast, daniel, maheshb, tgraf
In-Reply-To: <1480434813-3141-2-git-send-email-dsa@cumulusnetworks.com>

On Tue, Nov 29, 2016 at 07:53:31AM -0800, David Ahern wrote:
> Code move and rename only; no functional change intended.
> 
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH net-next] cgroup, bpf: remove unnecessary #include
From: Alexei Starovoitov @ 2016-11-29 19:26 UTC (permalink / raw)
  To: Daniel Borkmann, David S. Miller
  Cc: Daniel Mack, Tejun Heo, netdev@vger.kernel.org
In-Reply-To: <583D5D00.4070101@iogearbox.net>

On Tue, Nov 29, 2016 at 2:48 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 11/26/2016 08:23 AM, Alexei Starovoitov wrote:
>>
>> this #include is unnecessary and brings whole set of
>> other headers into cgroup-defs.h. Remove it.
>>
>> Fixes: 3007098494be ("cgroup: add support for eBPF programs")
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
>
> This fixes many build errors in samples/bpf/ due to wrong helper
> redefinitions (originating from kernel includes conflicting with
> samples' helper declarations).
>
> I don't see it pushed out to net-next yet, so:
>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Dave,
this patch is marked as 'accepted', but I don't see in net-next ...
please double check.
Thanks!

^ permalink raw reply

* Re: 4.9-rc7: (forcedeth?) BUG: sleeping function called from invalid context at kernel/irq/manage.c:110
From: Linus Torvalds @ 2016-11-29 19:11 UTC (permalink / raw)
  To: Meelis Roos; +Cc: Linux Kernel list, netdev
In-Reply-To: <alpine.LRH.2.20.1611292222200.31348@math.ut.ee>

On Tue, Nov 29, 2016 at 12:26 PM, Meelis Roos <mroos@linux.ee> wrote:
> This is 4.9-rc7 on Sun Ultra 20 (Opteron 175 on NVidia chipset PC with
> NVidia ethernet).
>
> BUG: sleeping function called from invalid context at kernel/irq/manage.c:110

Hmm. No changes in either forcedeth or in the synchronize_irq() debugging.

It seems to be due to netconsole. Did you enable new debugging (like
the DEBUG_ATOMIC_SLEEP config option) or change any netconsole things?

It looks like it's simply the dev_info() call in usb_add_hcd():

        dev_info(hcd->self.controller, "%s\n", hcd->product_desc);

and the printk() in amd64_edac_init():

        printk(KERN_INFO "AMD64 EDAC driver v%s\n", EDAC_AMD64_VERSION);

and yes, netconsole does "write_msg()" which is run with interrupts
disabled and the 'target_list_lock' spinlock held.

So when netpoll_send_udp() then calls down to the NIC poll routine, we
definitely are in an atomic context.

But none of this looks new. I don't see _anything_ in any of these
areas that has changed since 4.8.

Which is why I suspect you changed something in your setup wrt
netconsole or your kernel config?

                 Linus

^ permalink raw reply

* Re: [PATCH net][v2] bpf: fix states equal logic for varlen access
From: Daniel Borkmann @ 2016-11-29 19:09 UTC (permalink / raw)
  To: Josef Bacik, davem, netdev, ast, jannh, kernel-team
In-Reply-To: <1480440429-2531-1-git-send-email-jbacik@fb.com>

On 11/29/2016 06:27 PM, Josef Bacik wrote:
> If we have a branch that looks something like this
>
> int foo = map->value;
> if (condition) {
>    foo += blah;
> } else {
>    foo = bar;
> }
> map->array[foo] = baz;
>
> We will incorrectly assume that the !condition branch is equal to the condition
> branch as the register for foo will be UNKNOWN_VALUE in both cases.  We need to
> adjust this logic to only do this if we didn't do a varlen access after we
> processed the !condition branch, otherwise we have different ranges and need to
> check the other branch as well.
>
> Fixes: 484611357c19 ("bpf: allow access into map value arrays")
> Reported-by: Jann Horn <jannh@google.com>
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [PATCH net-next] bpf: add test for the verifier equal logic bug
From: Daniel Borkmann @ 2016-11-29 19:06 UTC (permalink / raw)
  To: Josef Bacik, davem, netdev, ast, jannh, kernel-team
In-Reply-To: <1480440919-3252-1-git-send-email-jbacik@fb.com>

On 11/29/2016 06:35 PM, Josef Bacik wrote:
> This is a test to verify that
>
> bpf: fix states equal logic for varlen access
>
> actually fixed the problem.  The problem was if the register we added to our map
> register was UNKNOWN in both the false and true branches and the only thing that
> changed was the range then we'd incorrectly assume that the true branch was
> valid, which it really wasnt.  This tests this case and properly fails without
> my fix in place and passes with it in place.
>
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Thanks a lot for the test case! They are always useful to have ... which
just reminds me: it seems we didn't add anything for f23cc643f9ba ("bpf:
fix range arithmetic for bpf map access"). ;-)

^ permalink raw reply

* Re: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support
From: Florian Fainelli @ 2016-11-29 18:58 UTC (permalink / raw)
  To: Woojung.Huh, davem, andrew; +Cc: netdev, UNGLinuxDriver
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D4096FC85@CHN-SV-EXMX02.mchp-main.com>

On 11/29/2016 10:49 AM, Woojung.Huh@microchip.com wrote:
>>> +	/* LED2/PME_N/IRQ_N/RGMII_ID pin to IRQ_N mode */
>>> +	buf = phy_read_mmd_indirect(phydev, 32784, 3);
>>> +	buf &= ~0x1800;
>>> +	buf |= 0x0800;
>>> +	phy_write_mmd_indirect(phydev, 32784, 3, buf);
>>
>> Using decimal numbers for register addresses is a bit unusual.
> 
> OK. Will change it.
> 
>>> +
>>> +	/* RGMII MAC TXC Delay Enable */
>>> +	ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
>>> +				MAC_RGMII_ID_TXC_DELAY_EN_);
>>> +
>>> +	/* RGMII TX DLL Tune Adjust */
>>> +	ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
>>> +
>>> +	*interface = PHY_INTERFACE_MODE_RGMII_TXID;
>>> +}
>>> +
>>> +static void ksz9031rnx_pre_config(struct lan78xx_net *dev,
>>> +				  struct phy_device *phydev,
>>> +				  phy_interface_t *interface)
>>> +{
>>> +	/* Micrel9301RNX PHY configuration */
>>> +	/* RGMII Control Signal Pad Skew */
>>> +	phy_write_mmd_indirect(phydev, 4, 2, 0x0077);
>>> +	/* RGMII RX Data Pad Skew */
>>> +	phy_write_mmd_indirect(phydev, 5, 2, 0x7777);
>>> +	/* RGMII RX Clock Pad Skew */
>>> +	phy_write_mmd_indirect(phydev, 8, 2, 0x1FF);
>>> +
>>> +	*interface = PHY_INTERFACE_MODE_RGMII_RXID;
>>> +}
>>
>> This should really belong in the respective PHY drivers for these PHYs,
>> is there a particular reason you decided to do this here?
> 
> Main reason is because these are MAC dependent.
> In case these are in PHY driver, I expect some parameters/defines may need to be passed to it.
> Trying to keep PHY driver as generic as possible.

There are two ways to get these settings propagated to the PHY driver:

- using a board fixup which is going to be invoked during
drv->config_init() time

- specifying a phydev->dev_flags and reading it from the PHY driver to
act upon and configure the PHY based on that value, there are only
32-bits available though, and you need to make sure they are not
conflicting with other potential users in tree

My preference would go with 1, since you could just register it in your
PHY driver and re-use the code you are proposing to include here.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next] bpf: add test for the verifier equal logic bug
From: Alexei Starovoitov @ 2016-11-29 18:54 UTC (permalink / raw)
  To: Josef Bacik; +Cc: davem, netdev, ast, jannh, daniel, kernel-team
In-Reply-To: <1480440919-3252-1-git-send-email-jbacik@fb.com>

On Tue, Nov 29, 2016 at 12:35:19PM -0500, Josef Bacik wrote:
> This is a test to verify that
> 
> bpf: fix states equal logic for varlen access
> 
> actually fixed the problem.  The problem was if the register we added to our map
> register was UNKNOWN in both the false and true branches and the only thing that
> changed was the range then we'd incorrectly assume that the true branch was
> valid, which it really wasnt.  This tests this case and properly fails without
> my fix in place and passes with it in place.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Awesome. thanks for the test!
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH net][v2] bpf: fix states equal logic for varlen access
From: Alexei Starovoitov @ 2016-11-29 18:52 UTC (permalink / raw)
  To: Josef Bacik; +Cc: davem, netdev, ast, jannh, daniel, kernel-team
In-Reply-To: <1480440429-2531-1-git-send-email-jbacik@fb.com>

On Tue, Nov 29, 2016 at 12:27:09PM -0500, Josef Bacik wrote:
> If we have a branch that looks something like this
> 
> int foo = map->value;
> if (condition) {
>   foo += blah;
> } else {
>   foo = bar;
> }
> map->array[foo] = baz;
> 
> We will incorrectly assume that the !condition branch is equal to the condition
> branch as the register for foo will be UNKNOWN_VALUE in both cases.  We need to
> adjust this logic to only do this if we didn't do a varlen access after we
> processed the !condition branch, otherwise we have different ranges and need to
> check the other branch as well.
> 
> Fixes: 484611357c19 ("bpf: allow access into map value arrays")
> Reported-by: Jann Horn <jannh@google.com>
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Thank you!
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: bpf debug info
From: Alexei Starovoitov @ 2016-11-29 18:51 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, netdev, Brenden Blanco, Thomas Graf, Wangnan,
	He Kuang, kernel-team
In-Reply-To: <20161129153818.51104fab@jkicinski-Precision-T1700>

On Tue, Nov 29, 2016 at 03:38:18PM +0000, Jakub Kicinski wrote:
> 
> >>> [...]
> > > So next step is to improve verifier messages to be more human friendly.
> > > The step after is to introduce BPF_COMMENT pseudo instruction
> > > that will be ignored by the interpreter yet it will contain the text
> > > of original source code. Then llvm-objdump step won't be necessary.
> > > The bpf loader will load both instructions and pieces of C sources.
> > > Then verifier errors should be even easier to read and humans
> > > can easily understand the purpose of the program.  
> > 
> > So the BPF_COMMENT pseudo insn will get stripped away from the insn array
> > after verification step, so we don't need to hold/account for this mem? I
> > assume in it's ->imm member it will just hold offset into text blob?
> 
> Associating any form of opaque data with programs always makes me
> worried about opening a side channel of communication with a specialized
> user space implementations/compilers.  But I guess if the BPF_COMMENTs
> are stripped in the verifier as Daniel assumes drivers and JITs will
> never see it.

yes. the idea that it's a comment. It can contain any text,
not only C code, but any other language.
It's definitely going to be stripped before JITs and kernel will
not make any safety or translation decisions based on such comment.

> Just to clarify, however - is there any reason why pushing the source
> code into the kernel is necessary?  Or is it just for convenience?
> Provided the user space loader has access to the debug info it should
> have no problems matching the verifier output to code lines?

correct. just for convenience. The user space has to keep .o around,
since it can crash, would have to reload and so on.
Only for some script that ssh-es into servers and wants to see
what is being loaded, it might help to dump full asm and these comments
along with prog_digest that Daniel is working on in parallel.
Alternatively instead of doing BPF_COMMENT we can load the whole .o
as-is into bpffs as a blob. Later (based on digest) the kernel can
dump such .o back for user space to run objdump on. It all can be
done without kernel involvement. Like tc command can copy .o and so on.
But not everything is using tc.
Another alternative is to do a decompiler from bpf binary
into meaningful C code. It's not trivial and names will be lost.
bpf_comment approach is pretty cheap from kernel point of view
and greatly helps visibility when users don't cheat with debug info.

^ permalink raw reply

* RE: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support
From: Woojung.Huh @ 2016-11-29 18:49 UTC (permalink / raw)
  To: f.fainelli, davem, andrew; +Cc: netdev, UNGLinuxDriver
In-Reply-To: <007d0bc0-378c-da93-d730-28f7b5b5073a@gmail.com>

> > +	/* LED2/PME_N/IRQ_N/RGMII_ID pin to IRQ_N mode */
> > +	buf = phy_read_mmd_indirect(phydev, 32784, 3);
> > +	buf &= ~0x1800;
> > +	buf |= 0x0800;
> > +	phy_write_mmd_indirect(phydev, 32784, 3, buf);
> 
> Using decimal numbers for register addresses is a bit unusual.

OK. Will change it.

> > +
> > +	/* RGMII MAC TXC Delay Enable */
> > +	ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
> > +				MAC_RGMII_ID_TXC_DELAY_EN_);
> > +
> > +	/* RGMII TX DLL Tune Adjust */
> > +	ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
> > +
> > +	*interface = PHY_INTERFACE_MODE_RGMII_TXID;
> > +}
> > +
> > +static void ksz9031rnx_pre_config(struct lan78xx_net *dev,
> > +				  struct phy_device *phydev,
> > +				  phy_interface_t *interface)
> > +{
> > +	/* Micrel9301RNX PHY configuration */
> > +	/* RGMII Control Signal Pad Skew */
> > +	phy_write_mmd_indirect(phydev, 4, 2, 0x0077);
> > +	/* RGMII RX Data Pad Skew */
> > +	phy_write_mmd_indirect(phydev, 5, 2, 0x7777);
> > +	/* RGMII RX Clock Pad Skew */
> > +	phy_write_mmd_indirect(phydev, 8, 2, 0x1FF);
> > +
> > +	*interface = PHY_INTERFACE_MODE_RGMII_RXID;
> > +}
> 
> This should really belong in the respective PHY drivers for these PHYs,
> is there a particular reason you decided to do this here?

Main reason is because these are MAC dependent.
In case these are in PHY driver, I expect some parameters/defines may need to be passed to it.
Trying to keep PHY driver as generic as possible.

Thanks for your comments.
- Woojung

^ permalink raw reply

* Re: [PATCH v2 net-next 10/11] qede: Add basic XDP support
From: Daniel Borkmann @ 2016-11-29 18:28 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Yuval Mintz, davem, netdev, alexei.starovoitov
In-Reply-To: <20161129171020.6b8b552d@jkicinski-Precision-T1700>

On 11/29/2016 06:10 PM, Jakub Kicinski wrote:
> On Tue, 29 Nov 2016 16:48:50 +0100, Daniel Borkmann wrote:
>> On 11/29/2016 03:47 PM, Yuval Mintz wrote:
>>> Add support for the ndo_xdp callback. This patch would support XDP_PASS,
>>> XDP_DROP and XDP_ABORTED commands.
>>>
>>> This also adds a per Rx queue statistic which counts number of packets
>>> which didn't reach the stack [due to XDP].
>>>
>>> Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
>> [...]
>>> @@ -1560,6 +1593,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
>>>    			       struct qede_fastpath *fp,
>>>    			       struct qede_rx_queue *rxq)
>>>    {
>>> +	struct bpf_prog *xdp_prog = READ_ONCE(rxq->xdp_prog);
>>>    	struct eth_fast_path_rx_reg_cqe *fp_cqe;
>>>    	u16 len, pad, bd_cons_idx, parse_flag;
>>>    	enum eth_rx_cqe_type cqe_type;
>>> @@ -1596,6 +1630,11 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
>>>    	len = le16_to_cpu(fp_cqe->len_on_first_bd);
>>>    	pad = fp_cqe->placement_offset;
>>>
>>> +	/* Run eBPF program if one is attached */
>>> +	if (xdp_prog)
>>> +		if (!qede_rx_xdp(edev, fp, rxq, xdp_prog, bd, fp_cqe))
>>> +			return 1;
>>> +
>>
>> You also need to wrap this under rcu_read_lock() (at least I haven't seen
>> it in your patches) for same reasons as stated in 326fe02d1ed6 ("net/mlx4_en:
>> protect ring->xdp_prog with rcu_read_lock"), as otherwise xdp_prog could
>> disappear underneath you. mlx4 and nfp does it correctly, looks like mlx5
>> doesn't.
>
> My understanding was that Yuval is always doing full stop()/start() so
> there should be no RX packets in flight while the XDP prog is being
> changed.  But thinking about it again, perhaps is worth adding the

Ohh, true, thanks for pointing this out. I guess I got confused by
the READ_ONCE() then.

> optimization to forego the full qede_reload() in qede_xdp_set() if there
> is a program already loaded and just do the xchg()+put() (and add RCU
> protection on the fast path)?

Would be worth it as a follow-up later on, yes.

^ permalink raw reply

* [PATCH RFC v3] ethtool: implement helper to get flow_type value
From: Jacob Keller @ 2016-11-29 18:45 UTC (permalink / raw)
  To: netdev, Intel Wired LAN; +Cc: David Miller, Jakub Kicinski, Jacob Keller

Often a driver wants to store the flow type and thus it must mask the
extra fields. This is a task that could grow more complex as more flags
are added in the future. Add a helper function that masks the flags for
marking additional fields.

Modify drivers in drivers/net/ethernet that currently check for FLOW_EXT
and FLOW_MAC_EXT to use the helper. Currently this is only the mellanox
drivers.

I chose not to modify other drivers as I'm actually unsure whether we
should always mask the flow type even for drivers which don't recognize
the newer flags. On the one hand, today's drivers (generally)
automatically fail when a new flag is used because they won't mask it
and their checks against flow_type will not match. On the other hand, it
means another place that you have to update when you begin implementing
a flag.

An alternative is to have the driver store a set of flags that it knows
about, and then have ethtool core do the check for us to discard frames.
I haven't implemented this quite yet.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c         | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 6 +++---
 include/uapi/linux/ethtool.h                            | 5 +++++
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 487a58f9c192..d8f9839ce2a3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1270,7 +1270,7 @@ static int mlx4_en_validate_flow(struct net_device *dev,
 			return -EINVAL;
 	}
 
-	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
 		if (cmd->fs.m_u.tcp_ip4_spec.tos)
@@ -1493,7 +1493,7 @@ static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
 	if (err)
 		return err;
 
-	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
 	case ETHER_FLOW:
 		spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
 		if (!spec_l2)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 3691451c728c..066e6c5cf38b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -63,7 +63,7 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
 	int table_size;
 	int prio;
 
-	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
 		max_tuples = ETHTOOL_NUM_L3_L4_FTS;
@@ -147,7 +147,7 @@ static int set_flow_attrs(u32 *match_c, u32 *match_v,
 					     outer_headers);
 	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
 					     outer_headers);
-	u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
+	u32 flow_type = ethtool_get_flow_spec_type(fs->flow_type);
 	struct ethtool_tcpip4_spec *l4_mask;
 	struct ethtool_tcpip4_spec *l4_val;
 	struct ethtool_usrip4_spec *l3_mask;
@@ -393,7 +393,7 @@ static int validate_flow(struct mlx5e_priv *priv,
 	    fs->ring_cookie != RX_CLS_FLOW_DISC)
 		return -EINVAL;
 
-	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
 	case ETHER_FLOW:
 		eth_mask = &fs->m_u.ether_spec;
 		if (!is_zero_ether_addr(eth_mask->h_dest))
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f0db7788f887..ed650eef9e54 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1583,6 +1583,11 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	FLOW_EXT	0x80000000
 #define	FLOW_MAC_EXT	0x40000000
 
+static inline __u32 ethtool_get_flow_spec_type(__u32 flow_type)
+{
+	return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
+}
+
 /* L3-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
 #define	RXH_VLAN	(1 << 2)
-- 
2.11.0.rc2.152.g4d04e67

^ permalink raw reply related

* Re: [PATCH RFC v2] ethtool: implement helper to get flow_type value
From: Keller, Jacob E @ 2016-11-29 18:44 UTC (permalink / raw)
  To: kubakici@wp.pl
  Cc: netdev@vger.kernel.org, davem@davemloft.net,
	intel-wired-lan@lists.osuosl.org
In-Reply-To: <20161129152144.484bea62@jkicinski-Precision-T1700>

On Tue, 2016-11-29 at 15:21 +0000, Jakub Kicinski wrote:
> On Mon, 28 Nov 2016 15:03:43 -0800, Jacob Keller wrote:
> > +static inline __u32 ethtool_get_flow_spec_type(__u32 flow_type)
> > +{
> > +	return flow_type & (FLOW_EXT | FLOW_MAC_EXT);
> 
> I don't have anything of substance to say but I think you are missing
> a
> negation (~) here compared to the code you are replacing ;)

HAH! Yes you are right. I made a mistake when copying this out of my
driver header file.

Will fix. Sorry for the thrash, and thanks for catching my mistake.

Regards,
Jake

^ permalink raw reply

* Re: [PATCH v2 net-next 10/11] qede: Add basic XDP support
From: Daniel Borkmann @ 2016-11-29 18:42 UTC (permalink / raw)
  To: Mintz, Yuval, Jakub Kicinski
  Cc: davem@davemloft.net, netdev@vger.kernel.org,
	alexei.starovoitov@gmail.com
In-Reply-To: <BL2PR07MB23061A22119C747AA7E9A2FA8D8D0@BL2PR07MB2306.namprd07.prod.outlook.com>

On 11/29/2016 06:51 PM, Mintz, Yuval wrote:
>>> You also need to wrap this under rcu_read_lock() (at least I haven't seen
>>> it in your patches) for same reasons as stated in 326fe02d1ed6 ("net/mlx4_en:
>>> protect ring->xdp_prog with rcu_read_lock"), as otherwise xdp_prog could
>>> disappear underneath you. mlx4 and nfp does it correctly, looks like mlx5
>>> doesn't.
>
>> My understanding was that Yuval is always doing full stop()/start() so
>> there should be no RX packets in flight while the XDP prog is being
>> changed.  But thinking about it again, perhaps is worth adding the
>> optimization to forego the full qede_reload() in qede_xdp_set() if there
>> is a program already loaded and just do the xchg()+put() (and add RCU
>> protection on the fast path)?
>
> Yeps. That the current state of the code.
> I'll surely pursue this later, but I don't think all this added complexity
> is required for the initial submission.
>
> BTW, one of the problems [or perhaps 'lack of motivation' is a better term]
> I had with the program switching scenario was that there was no sample
> application that did it.
> If it's really an interesting [basic] scenario, perhaps it's worthy to add
> a sample user app. that will repeatedly switch the attached eBPF?

Fwiw, I'm still waiting for Stephen to process his queue, and then I have
a patch for iproute2 to add a minimal initial front-end that can be useful
for experimenting/testing. The atomic switching scenario w/o stop()/start()
would definitely be useful when you need to fix an issue or modify behavior
in your currently loaded program on the fly when you cannot afford small
downtime.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox