Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next] net: thunderx: Fix transmit queue timeout issue
From: David Miller @ 2016-11-30 19:11 UTC (permalink / raw)
  To: sunil.kovvuri; +Cc: netdev, linux-kernel, linux-arm-kernel, sgoutham
In-Reply-To: <1480419620-32500-1-git-send-email-sunil.kovvuri@gmail.com>

From: sunil.kovvuri@gmail.com
Date: Tue, 29 Nov 2016 17:10:20 +0530

> +		/* Check again, incase another cpu freed descriptors */
> +		if (atomic_read(&sq->free_cnt) > MIN_SQ_DESC_PER_PKT_XMIT) {
> +			netif_tx_start_queue(txq);

You have to use netif_tx_wake_queue() any time you restart a queue after bringing
the device up.

^ permalink raw reply

* Re: [PATCH net] cdc_ether: Fix handling connection notification
From: Henning Schild @ 2016-11-30 19:09 UTC (permalink / raw)
  To: Kristian Evensen
  Cc: oliver-GvhC2dPhHPQdnm+yROfE0A, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161130184216.4224-1-kristian.evensen-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Am Wed, 30 Nov 2016 19:42:16 +0100
schrieb Kristian Evensen <kristian.evensen-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>:

> Commit bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling")
> introduced a work-around in usbnet_cdc_status() for devices that
> exported cdc carrier on twice on connect. Before the commit, this
> behavior caused the link state to be incorrect. It was assumed that
> all CDC Ethernet devices would either export this behavior, or send
> one off and then one on notification (which seems to be the default
> behavior).
> 
> Unfortunately, it turns out multiple devices sends a connection
> notification multiple times per second (via an interrupt), even when
> connection state does not change. This has been observed with several
> different USB LAN dongles (at least), for example 13b1:0041 (Linksys).
> After bfe9b9d2df66, the link state has been set as down and then up
> for each notification. This has caused a flood of Netlink NEWLINK
> messages and syslog to be flooded with messages similar to:
> 
> cdc_ether 2-1:2.0 eth1: kevent 12 may have been dropped
> 
> This commit fixes the behavior by reverting usbnet_cdc_status() to
> how it was before bfe9b9d2df66. The work-around has been moved to a
> separate status-function which is only called when a known, affect
> device is detected.
> 
> Fixes: bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling")
> Reported-by: Henning Schild <henning.schild-kv7WeFo6aLtBDgjK7y7TUQ@public.gmane.org>
> Signed-off-by: Kristian Evensen <kristian.evensen-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
>  drivers/net/usb/cdc_ether.c | 38
> +++++++++++++++++++++++++++++++------- 1 file changed, 31
> insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
> index 45e5e43..8c628ea 100644
> --- a/drivers/net/usb/cdc_ether.c
> +++ b/drivers/net/usb/cdc_ether.c
> @@ -388,12 +388,6 @@ void usbnet_cdc_status(struct usbnet *dev,
> struct urb *urb) case USB_CDC_NOTIFY_NETWORK_CONNECTION:
>  		netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n",
>  			  event->wValue ? "on" : "off");
> -
> -		/* Work-around for devices with broken
> off-notifications */
> -		if (event->wValue &&
> -		    !test_bit(__LINK_STATE_NOCARRIER,
> &dev->net->state))
> -			usbnet_link_change(dev, 0, 0);
> -
>  		usbnet_link_change(dev, !!event->wValue, 0);
>  		break;
>  	case USB_CDC_NOTIFY_SPEED_CHANGE:	/* tx/rx rates */
> @@ -466,6 +460,36 @@ static int usbnet_cdc_zte_rx_fixup(struct usbnet
> *dev, struct sk_buff *skb) return 1;
>  }
>  
> +/* Ensure correct link state
> + *
> + * Some devices (ZTE MF823/831/910) export two carrier on
> notifications when
> + * connected. This causes the link state to be incorrect. Work
> around this by
> + * always setting the state to off, then on.
> + */
> +void usbnet_cdc_zte_status(struct usbnet *dev, struct urb *urb)
> +{
> +	struct usb_cdc_notification *event;
> +
> +	if (urb->actual_length < sizeof(*event))
> +		return;
> +
> +	event = urb->transfer_buffer;
> +
> +	if (event->bNotificationType !=
> USB_CDC_NOTIFY_NETWORK_CONNECTION) {
> +		usbnet_cdc_status(dev, urb);
> +		return;
> +	}
> +
> +	netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n",
> +		  event->wValue ? "on" : "off");
> +
> +	if (event->wValue &&
> +	    !test_bit(__LINK_STATE_NOCARRIER, &dev->net->state))

You should probably use netif_carrier_ok(dev->net) instead.

> +		usbnet_link_change(dev, 0, 0);
> +
> +	usbnet_link_change(dev, !!event->wValue, 0);

Would the following work?

usbnet_link_change(dev, !!event->wValue, event->wValue &&
netif_carrier_ok(dev->net));

> +}
> +
>  static const struct driver_info	cdc_info = {
>  	.description =	"CDC Ethernet Device",
>  	.flags =	FLAG_ETHER | FLAG_POINTTOPOINT,
> @@ -481,7 +505,7 @@ static const struct driver_info
> zte_cdc_info = { .flags =	FLAG_ETHER | FLAG_POINTTOPOINT,
>  	.bind =		usbnet_cdc_zte_bind,
>  	.unbind =	usbnet_cdc_unbind,
> -	.status =	usbnet_cdc_status,
> +	.status =	usbnet_cdc_zte_status,
>  	.set_rx_mode =	usbnet_cdc_update_filter,
>  	.manage_power =	usbnet_manage_power,
>  	.rx_fixup = usbnet_cdc_zte_rx_fixup,

--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next] sock: reset sk_err for ICMP packets read from error queue
From: Soheil Hassas Yeganeh @ 2016-11-30 19:01 UTC (permalink / raw)
  To: davem, netdev; +Cc: edumazet, willemb, maze, hannes, Soheil Hassas Yeganeh

From: Soheil Hassas Yeganeh <soheil@google.com>

Only when ICMP packets are enqueued onto the error queue,
sk_err is also set. Before f5f99309fa74 (sock: do not set sk_err
in sock_dequeue_err_skb), a subsequent error queue read
would set sk_err to the next error on the queue, or 0 if empty.
As no error types other than ICMP set this field, sk_err should
not be modified upon dequeuing them.

Only for ICMP errors, reset the (racy) sk_err. Some applications,
like traceroute, rely on it and go into a futile busy POLLERR
loop otherwise.

In principle, sk_err has to be set while an ICMP error is queued.
Testing is_icmp_err_skb(skb_next) approximates this without
requiring a full queue walk. Applications that receive both ICMP
and other errors cannot rely on this legacy behavior, as other
errors do not set sk_err in the first place.

Fixes: f5f99309fa74 (sock: do not set sk_err in sock_dequeue_err_skb)
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/core/skbuff.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d1d1a5a..8dad391 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3714,20 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_queue_err_skb);
 
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+	return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+		       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
 struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
 {
 	struct sk_buff_head *q = &sk->sk_error_queue;
-	struct sk_buff *skb, *skb_next;
+	struct sk_buff *skb, *skb_next = NULL;
+	bool icmp_next = false;
 	unsigned long flags;
-	int err = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
 	skb = __skb_dequeue(q);
 	if (skb && (skb_next = skb_peek(q)))
-		err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+		icmp_next = is_icmp_err_skb(skb_next);
 	spin_unlock_irqrestore(&q->lock, flags);
 
-	if (err)
+        if (is_icmp_err_skb(skb) && !icmp_next)
+		sk->sk_err = 0;
+
+	if (skb_next)
 		sk->sk_error_report(sk);
 
 	return skb;
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: DSA vs. SWTICHDEV ?
From: Joakim Tjernlund @ 2016-11-30 18:44 UTC (permalink / raw)
  To: f.fainelli@gmail.com, andrew@lunn.ch; +Cc: netdev@vger.kernel.org
In-Reply-To: <922521a8-3a33-bbc4-2e07-2b946590c829@gmail.com>

On Wed, 2016-11-30 at 10:10 -0800, Florian Fainelli wrote:
> On 11/30/2016 09:44 AM, Joakim Tjernlund wrote:
> > On Wed, 2016-11-30 at 17:55 +0100, Andrew Lunn wrote:
> > > > This is an embedded system with several boards in a subrack.
> > > > Each board has eth I/F connected to a switch to communicate with each other.
> > > > One of the board will also house the actual switch device and manage the switch.
> > > > Then the normal app just communicates over the physical eth I/F like any other board
> > > > in the system. There is a "manage switch app" which brings the switch up and partition
> > > > phys VLANs etc. (each phys I/F would be a a separate domain so no loop)
> > > 
> > > So you are planning on throwing away the "manage switch app", and just
> > > use standard linux networking commands? That is what switchdev is all
> > > about really, throwing away the vendor SDK for the switch, making a
> > > switch just a bunch on interfaces on the host which you manage as
> > > normal interfaces.
> > 
> > Something like that. I need to run routing protocols on the switch I/Fs and egress
> > pkgs on selected switch I/Fs bypassing ARP, just like DSA does with its vendor
> > tags.
> > 
> > > 
> > > > I guess I could skip the phys I/F and have the switch app create a virtual eth0 I/F over PCIe
> > > 
> > > No need to create this interface. It will exist if you go the
> > > switchdev route.
> > > 
> > > > > > And switchdev can do all this over PCIe instead? Can you have a
> > > > > > switch tree in switchdev too?
> > > > > 
> > > > > Mellonex says so, but i don't think they have actually implemented it.
> > > > 
> > > > Not impl. any of DSAs features? What can you do with a Mellonex switch then?
> > > 
> > > They don't have a tree of switches, as far as i know. Just a single
> > > switch. But DSA does support a tree of switches, that is what the D in
> > > DSA means, distributed. And there are a couple of boards which have 2
> > > to 4 switches in a tree.
> > 
> > We might have a tree as well so now I really wonder: Given we write a
> > proper switchdev driver, can it support switchtrees without touching
> > switchdev infra structure? If not I guess we will attach a physical
> > eth I/F to the switch and use both DSA and switchdev to support both trees
> > and HW offload. 
> 
> switchdev in itself really is the glue layer between the networking
> stack and how to push specific objects down to the Ethernet switch
> driver, and that Ethernet switch driver. Switchdev does not enforce a
> specific network device driver model object, and just provides standard
> hooks for your network devices to register with switchdev in order to
> push/receive offloads. DSA on the other hand, utilizes switchdev to get
> notifications about offloads from the networking stack, but also exposes
> a clearly and well defined Ethernet switch device driver model, as
> Andrew described, it creates per-port network devices, binds the ports
> to their PHYs (built-in, or external), and also takes care of
> encapsulating/decapsulating the switch specific tagging protocol.

Lets see, switchdev is mainly for offloading L2/L3 into HW and does NOT create
virtual I/F(one for each phys sw port) so if my only goal is to offload I don't
need DSA? (How do one create routes if no virtual I/Fs I wonder ..) 

DSA then does create virt. I/Fs and manages switch trees, to actually tx pkg it needs
a phys I/F using vendor specific tags(for PCIe connected switches I can envision using
the PCIe connection instead).

Who is the master when using both DSA and switchdev w.r.t initil conf/bring up of switch?
configuring VLANs?

> 
> We should probably put that in some crystal clear sentence somewhere in
> Documentation/networking/ but switchdev and DSA are complementary and
> not competitors, they just do not tackle the problems from the same angle.

Yes :)

> 
> > 
> > > 
> > > I think this is partially down to market segments. Mellonex market is
> > > top of rack switches. High port count, very high bandwidth. DSA is
> > > more wireless access points, set top boxes, generally up to 7 ports of
> > > 1Gbps and a few custom embedded products which need more ports, so
> > > build a tree of switches.
> > 
> > We have on an existing board with a BCM ROBO switch with lots of ports(>24),
> > managed over SPI. Looking at BCM DSA tag code it looks like it only supports
> > some 8 ports or so. I still have to find out if this is a limitation in BCM tagging
> > protocol or if just not impl. in DSA yet.
> 
> Oh cool, can you share the model by chance? I suspect the tagging format

I think it is an BCM5322

> of that switch is going to be different than what net/dsa/tag_brcm.c, so
> feel free to add something NET_DSA_TAG_BRCM8B (for 8 bytes) or something
> like that.

I will have to look at that, don't have an docs handy.

> 
> Note that DSA currently hardcodes the maximum number of ports to 14
> (DSA_MAX_PORTS), but this should obviously be something dynamically
> determined based on probing the switch device.
> 
> Can you also evaluate if using drivers/net/dsa/b53/ would work for you?
> My hope would be that they preserved the register compatibility here,
> but since this has a large number of ports, it may have completely
> offset most registers.

I will have a look, I sure want to try out DSA/switchdev on existing boards if I can :)

> 
> BTW, there is #linuxswitch on Freenode if you want to chat!

^ permalink raw reply

* Re: [PATCH net-next] cgroup, bpf: remove unnecessary #include
From: David Miller @ 2016-11-30 18:58 UTC (permalink / raw)
  To: ast; +Cc: daniel, daniel, roszenrami, netdev
In-Reply-To: <1480529768-1954199-1-git-send-email-ast@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 30 Nov 2016 10:16:08 -0800

> this #include is unnecessary and brings whole set of
> other headers into cgroup-defs.h. Remove it.
> 
> Fixes: 3007098494be ("cgroup: add support for eBPF programs")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Rami Rosen <roszenrami@gmail.com>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Daniel Mack <daniel@zonque.org>
> ---
> Dave,
> this patch got lost somehow (marked accepted, but not in net-next).
> Resending.

Applied, thanks ALexei.

^ permalink raw reply

* Re: [PATCH v4 net-next 3/7] net: mvneta: Use cacheable memory to store the rx buffer virtual address
From: David Miller @ 2016-11-30 18:57 UTC (permalink / raw)
  To: gregory.clement
  Cc: linux-kernel, netdev, jszhang, arnd, jason, andrew,
	sebastian.hesselbarth, thomas.petazzoni, linux-arm-kernel, nadavh,
	mw, dima, yelena
In-Reply-To: <bb2f1a61e9b912905f8390f7a06c36f9f1e7ab66.1480431285.git-series.gregory.clement@free-electrons.com>

From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Tue, 29 Nov 2016 15:55:21 +0100

> +	/* Virtual address of the RX buffer */
> +	void  **buf_virt_addr;
> +
>  	/* Virtual address of the RX DMA descriptors array */
>  	struct mvneta_rx_desc *descs;
>  
 ...
> +		data = (unsigned char *)rxq->buf_virt_addr[index];

This cast is unnecessary, please remove it.

^ permalink raw reply

* Re: [net-next PATCH v3 0/6] XDP for virtio_net
From: Michael S. Tsirkin @ 2016-11-30 18:56 UTC (permalink / raw)
  To: John Fastabend
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <583F1D40.7060408@gmail.com>

On Wed, Nov 30, 2016 at 10:41:04AM -0800, John Fastabend wrote:
> On 16-11-30 10:35 AM, Michael S. Tsirkin wrote:
> > On Tue, Nov 29, 2016 at 12:05:20PM -0800, John Fastabend wrote:
> >> This implements virtio_net for the mergeable buffers and big_packet
> >> modes. I tested this with vhost_net running on qemu and did not see
> >> any issues. For testing num_buf > 1 I added a hack to vhost driver
> >> to only but 100 bytes per buffer.
> >>
> >> There are some restrictions for XDP to be enabled and work well
> >> (see patch 3) for more details.
> >>
> >>   1. LRO must be off
> >>   2. MTU must be less than PAGE_SIZE
> >>   3. queues must be available to dedicate to XDP
> >>   4. num_bufs received in mergeable buffers must be 1
> >>   5. big_packet mode must have all data on single page
> >>
> >> Please review any comments/feedback welcome as always.
> >>
> >> v2, fixes rcu usage throughout thanks to Eric and the use of
> >> num_online_cpus() usage thanks to Jakub.
> >>
> >> v3, add slowpath patch to handle num_bufs > 1
> >>
> >> Thanks,
> >> John
> > 
> > BTW this is threaded incorrectly: patch 1/6 isn't a reply to 0/6,
> > patches 2 and on are replies to patch 1.
> > 
> 
> Ah yep, if you mangle the command line git will send the
> cover letter even if you have mangled 'to' email addresses but when
> it hits a real patch it aborts. At least on my version of git.
> 
> > I'm busy until end of week, I'll review Monday. Sorry about the delay.
> 
> In the meantime I'll post a v4 with better commit message (Alexei) and
> address a corner cases Jakub pointed out.

I did a quick look and found some too, but a detailed review will
have to wait till next week.

> > 
> >> ---
> >>
> >> John Fastabend (6):
> >>       net: virtio dynamically disable/enable LRO
> >>       net: xdp: add invalid buffer warning
> >>       virtio_net: Add XDP support
> >>       virtio_net: add dedicated XDP transmit queues
> >>       virtio_net: add XDP_TX support
> >>       virtio_net: xdp, add slowpath case for non contiguous buffers
> >>
> >>
> >>  drivers/net/virtio_net.c |  344 +++++++++++++++++++++++++++++++++++++++++++++-
> >>  include/linux/filter.h   |    1 
> >>  net/core/filter.c        |    6 +
> >>  3 files changed, 346 insertions(+), 5 deletions(-)
> >>
> >> --
> >> Signature

^ permalink raw reply

* Re: [net-next PATCH v3 3/6] virtio_net: Add XDP support
From: Michael S. Tsirkin @ 2016-11-30 18:54 UTC (permalink / raw)
  To: John Fastabend
  Cc: eric.dumazet, daniel, shm, davem, tgraf, alexei.starovoitov,
	john.r.fastabend, netdev, bblanco, brouer
In-Reply-To: <20161129201021.26851.34352.stgit@john-Precision-Tower-5810>

On Tue, Nov 29, 2016 at 12:10:21PM -0800, John Fastabend wrote:
> From: John Fastabend <john.fastabend@gmail.com>
> 
> This adds XDP support to virtio_net. Some requirements must be
> met for XDP to be enabled depending on the mode. First it will
> only be supported with LRO disabled so that data is not pushed
> across multiple buffers. Second the MTU must be less than a page
> size to avoid having to handle XDP across multiple pages.
> 
> If mergeable receive is enabled this patch only supports the case
> where header and data are in the same buf which we can check when
> a packet is received by looking at num_buf. If the num_buf is
> greater than 1 and a XDP program is loaded the packet is dropped
> and a warning is thrown. When any_header_sg is set this does not
> happen and both header and data is put in a single buffer as expected
> so we check this when XDP programs are loaded.  Subsequent patches
> will process the packet in a degraded mode to ensure connectivity
> and correctness is not lost even if backend pushes packets into
> multiple buffers.
> 
> If big packets mode is enabled and MTU/LRO conditions above are
> met then XDP is allowed.
> 
> This patch was tested with qemu with vhost=on and vhost=off where
> mergable and big_packet modes were forced via hard coding feature
> negotiation. Multiple buffers per packet was forced via a small
> test patch to vhost.c in the vhost=on qemu mode.
> 
> Suggested-by: Shrijeet Mukherjee <shrijeet@gmail.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/virtio_net.c |  154 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 150 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 8189e5b..32126bf 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include <linux/module.h>
>  #include <linux/virtio.h>
>  #include <linux/virtio_net.h>
> +#include <linux/bpf.h>
>  #include <linux/scatterlist.h>
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
> @@ -81,6 +82,8 @@ struct receive_queue {
>  
>  	struct napi_struct napi;
>  
> +	struct bpf_prog __rcu *xdp_prog;
> +
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> @@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	return skb;
>  }
>  
> +static u32 do_xdp_prog(struct virtnet_info *vi,
> +		       struct bpf_prog *xdp_prog,
> +		       struct page *page, int offset, int len)
> +{
> +	int hdr_padded_len;
> +	struct xdp_buff xdp;
> +	u32 act;
> +	u8 *buf;
> +
> +	buf = page_address(page) + offset;
> +
> +	if (vi->mergeable_rx_bufs)
> +		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	else
> +		hdr_padded_len = sizeof(struct padded_vnet_hdr);
> +
> +	xdp.data = buf + hdr_padded_len;
> +	xdp.data_end = xdp.data + (len - vi->hdr_len);

so header seems to be ignored completely.
but the packet could be from the time when
e.g. checksum offloading was on, and
so it might gave DATA_VALID (from CHECKSUM_UNNECESSARY
in host).

I think you want to verify that flags and gso type
are 0.


> +
> +	act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +	switch (act) {
> +	case XDP_PASS:
> +		return XDP_PASS;
> +	default:
> +		bpf_warn_invalid_xdp_action(act);
> +	case XDP_TX:
> +	case XDP_ABORTED:
> +	case XDP_DROP:
> +		return XDP_DROP;
> +	}
> +}

do we really want this switch just to warn?
How about doing != XDP_PASS in the caller?

> +
>  static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
>  {
>  	struct sk_buff * skb = buf;
> @@ -340,14 +375,28 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  				   void *buf,
>  				   unsigned int len)
>  {
> +	struct bpf_prog *xdp_prog;
>  	struct page *page = buf;
> -	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
> +	struct sk_buff *skb;
>  
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (xdp_prog) {
> +		u32 act = do_xdp_prog(vi, xdp_prog, page, 0, len);
> +
> +		if (act == XDP_DROP)
> +			goto err_xdp;
> +	}
> +	rcu_read_unlock();
> +
> +	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
>  	if (unlikely(!skb))
>  		goto err;
>  
>  	return skb;
>  
> +err_xdp:
> +	rcu_read_unlock();
>  err:
>  	dev->stats.rx_dropped++;
>  	give_pages(rq, page);
> @@ -366,10 +415,27 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	struct page *page = virt_to_head_page(buf);
>  	int offset = buf - page_address(page);
>  	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));

This is some useless computation when XDP is used, isn't it?

> +	struct sk_buff *head_skb, *curr_skb;
> +	struct bpf_prog *xdp_prog;
>  
> -	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
> -					       truesize);
> -	struct sk_buff *curr_skb = head_skb;
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (xdp_prog) {
> +		u32 act;
> +
> +		if (num_buf > 1) {
> +			bpf_warn_invalid_xdp_buffer();
> +			goto err_xdp;
> +		}
> +
> +		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
> +		if (act == XDP_DROP)
> +			goto err_xdp;
> +	}
> +	rcu_read_unlock();
> +
> +	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
> +	curr_skb = head_skb;
>  
>  	if (unlikely(!curr_skb))
>  		goto err_skb;

I'm confused. Did the requirement to have a page per packet go away?
I don't think this mode is doing it here.


> @@ -423,6 +489,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
>  	return head_skb;
>  
> +err_xdp:
> +	rcu_read_unlock();
>  err_skb:
>  	put_page(page);
>  	while (--num_buf) {
> @@ -1328,6 +1396,13 @@ static int virtnet_set_channels(struct net_device *dev,
>  	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
>  		return -EINVAL;
>  
> +	/* For now we don't support modifying channels while XDP is loaded
> +	 * also when XDP is loaded all RX queues have XDP programs so we only
> +	 * need to check a single RX queue.
> +	 */
> +	if (vi->rq[0].xdp_prog)
> +		return -EINVAL;
> +
>  	get_online_cpus();
>  	err = virtnet_set_queues(vi, queue_pairs);
>  	if (!err) {
> @@ -1454,6 +1529,68 @@ static int virtnet_set_features(struct net_device *netdev,
>  	return 0;
>  }
>  
> +static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct bpf_prog *old_prog;
> +	int i;
> +
> +	if ((dev->features & NETIF_F_LRO) && prog) {
> +		netdev_warn(dev, "can't set XDP while LRO is on, disable LRO first\n");
> +		return -EINVAL;
> +	}
> +
> +	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
> +		netdev_warn(dev, "XDP expects header/data in single page\n");
> +		return -EINVAL;
> +	}
> +
> +	if (dev->mtu > PAGE_SIZE) {
> +		netdev_warn(dev, "XDP requires MTU less than %lu\n", PAGE_SIZE);
> +		return -EINVAL;
> +	}
> +
> +	if (prog) {
> +		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +	}
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
> +		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);

don't we need to sync before put?


> +	}
> +
> +	return 0;
> +}
> +
> +static bool virtnet_xdp_query(struct net_device *dev)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i;
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		if (vi->rq[i].xdp_prog)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +	switch (xdp->command) {
> +	case XDP_SETUP_PROG:
> +		return virtnet_xdp_set(dev, xdp->prog);
> +	case XDP_QUERY_PROG:
> +		xdp->prog_attached = virtnet_xdp_query(dev);
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  static const struct net_device_ops virtnet_netdev = {
>  	.ndo_open            = virtnet_open,
>  	.ndo_stop   	     = virtnet_close,
> @@ -1471,6 +1608,7 @@ static int virtnet_set_features(struct net_device *netdev,
>  	.ndo_busy_poll		= virtnet_busy_poll,
>  #endif
>  	.ndo_set_features	= virtnet_set_features,
> +	.ndo_xdp		= virtnet_xdp,
>  };
>  
>  static void virtnet_config_changed_work(struct work_struct *work)
> @@ -1527,12 +1665,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  
>  static void free_receive_bufs(struct virtnet_info *vi)
>  {
> +	struct bpf_prog *old_prog;
>  	int i;
>  
> +	rtnl_lock();
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		while (vi->rq[i].pages)
>  			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> +
> +		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
> +		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);
>  	}
> +	rtnl_unlock();
>  }
>  
>  static void free_receive_page_frags(struct virtnet_info *vi)

^ permalink raw reply

* Re: [net-next PATCH v3 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers
From: Michael S. Tsirkin @ 2016-11-30 18:49 UTC (permalink / raw)
  To: John Fastabend
  Cc: Jakub Kicinski, eric.dumazet, daniel, shm, davem, tgraf,
	alexei.starovoitov, john.r.fastabend, netdev, bblanco, brouer
In-Reply-To: <583F0361.5030804@gmail.com>

On Wed, Nov 30, 2016 at 08:50:41AM -0800, John Fastabend wrote:
> On 16-11-30 06:30 AM, Jakub Kicinski wrote:
> > [add MST]
> > 
> 
> Thanks sorry MST. I did a cut'n'paste of an old list of CC's and missed
> you were not on the list.
> 
> [...]
> 
> >> +	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> >> +	while (--num_buf) {
> >> +		unsigned int buflen;
> >> +		unsigned long ctx;
> >> +		void *buf;
> >> +		int off;
> >> +
> >> +		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
> >> +		if (unlikely(!ctx))
> >> +			goto err_buf;
> >> +
> >> +		buf = mergeable_ctx_to_buf_address(ctx);
> >> +		p = virt_to_head_page(buf);
> >> +		off = buf - page_address(p);
> >> +
> >> +		memcpy(page_address(page) + page_off,
> >> +		       page_address(p) + off, buflen);
> >> +		page_off += buflen;
> > 
> > Could malicious user potentially submit a frame bigger than MTU?
> 
> Well presumably if the MTU is greater than PAGE_SIZE the xdp program
> would not have been loaded. And the malicious user in this case would
> have to be qemu which seems like everything is already lost if qemu
> is trying to attack its VM.
> 
> But this is a good point because it looks like there is nothing in
> virtio or qemu that drops frames with MTU greater than the virtio
> configured setting. Maybe Michael can confirm this or I'll poke at it
> more. I think qemu should drop these frames in general.
> 
> So I think adding a guard here is sensible I'll go ahead and do that.
> Also the MTU guard at set_xdp time needs to account for header length.

I agree. Further, offloads are disabled dynamically and we could
get a packet that was processed with LRO.

> Thanks nice catch.
> 
> > 
> >> +	}
> >> +
> >> +	*len = page_off;
> >> +	return page;
> >> +err_buf:
> >> +	__free_pages(page, 0);
> >> +	return NULL;
> >> +}
> >> +
> >>  static struct sk_buff *receive_mergeable(struct net_device *dev,
> >>  					 struct virtnet_info *vi,
> >>  					 struct receive_queue *rq,
> >> @@ -469,21 +519,37 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >>  	rcu_read_lock();
> >>  	xdp_prog = rcu_dereference(rq->xdp_prog);
> >>  	if (xdp_prog) {
> >> +		struct page *xdp_page;
> >>  		u32 act;
> >>  
> >>  		if (num_buf > 1) {
> >>  			bpf_warn_invalid_xdp_buffer();
> >> -			goto err_xdp;
> >> +
> >> +			/* linearize data for XDP */
> >> +			xdp_page = xdp_linearize_page(rq, num_buf,
> >> +						      page, offset, &len);
> >> +			if (!xdp_page)
> >> +				goto err_xdp;
> >> +			offset = len;
> >> +		} else {
> >> +			xdp_page = page;
> >>  		}
> >>  
> >> -		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
> >> +		act = do_xdp_prog(vi, xdp_prog, xdp_page, offset, len);
> >>  		switch (act) {
> >>  		case XDP_PASS:
> >> +			if (unlikely(xdp_page != page))
> >> +				__free_pages(xdp_page, 0);
> >>  			break;
> >>  		case XDP_TX:
> >> +			if (unlikely(xdp_page != page))
> >> +				goto err_xdp;
> >> +			rcu_read_unlock();
> > 
> > Only if there is a reason for v4 - this unlock could go to the previous
> > patch.
> > 
> 
> Sure will do this.

^ permalink raw reply

* Re: [net-next v2] neigh: remove duplicate check for same neigh
From: David Miller @ 2016-11-30 18:46 UTC (permalink / raw)
  To: zhangshengju; +Cc: netdev, dsa
In-Reply-To: <1480476282-3650-1-git-send-email-zhangshengju@cmss.chinamobile.com>

From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed, 30 Nov 2016 11:24:42 +0800

> Currently loop index 'idx' is used as the index in the neigh list of interest.
> It's increased only when the neigh is dumped. It's not the absolute index in
> the list. Because there is no info to record which neigh has already be scanned
> by previous loop. This will cause the filtered out neighs to be scanned mulitple
> times.
> 
> This patch make idx as the absolute index in the list, it will increase no matter
> whether the neigh is filtered. This will prevent the above problem.
> 
> And this is in line with other dump functions.
> 
> v2:
>  - take David Ahern's advice to do simple change
> 
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>

Applied, thanks.

^ permalink raw reply

* Re: [net-next PATCH v3 5/6] virtio_net: add XDP_TX support
From: Michael S. Tsirkin @ 2016-11-30 18:45 UTC (permalink / raw)
  To: John Fastabend
  Cc: eric.dumazet, daniel, shm, davem, tgraf, alexei.starovoitov,
	john.r.fastabend, netdev, bblanco, brouer
In-Reply-To: <20161129201108.26851.1114.stgit@john-Precision-Tower-5810>

On Tue, Nov 29, 2016 at 12:11:08PM -0800, John Fastabend wrote:
> This adds support for the XDP_TX action to virtio_net. When an XDP
> program is run and returns the XDP_TX action the virtio_net XDP
> implementation will transmit the packet on a TX queue that aligns
> with the current CPU that the XDP packet was processed on.
> 
> Before sending the packet the header is zeroed.  Also XDP is expected
> to handle checksum correctly so no checksum offload  support is
> provided.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
>  drivers/net/virtio_net.c |   59 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 56 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a1bfa99..9604e55 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -330,12 +330,40 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	return skb;
>  }
>  
> +static void virtnet_xdp_xmit(struct virtnet_info *vi,
> +			     unsigned int qnum, struct xdp_buff *xdp)
> +{
> +	struct send_queue *sq = &vi->sq[qnum];
> +	struct virtio_net_hdr_mrg_rxbuf *hdr;
> +	unsigned int num_sg, len;
> +	void *xdp_sent;
> +
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> +		struct page *page = virt_to_head_page(xdp_sent);
> +
> +		put_page(page);
> +	}
> +
> +	/* Zero header and leave csum up to XDP layers */
> +	hdr = xdp->data;
> +	memset(hdr, 0, vi->hdr_len);
> +	hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> +	hdr->hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;

Do we really want this?
This is CHECKSUM_UNNECESSARY.
Does not XDP pass checksummed packets?


> +
> +	num_sg = 1;
> +	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
> +	virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, xdp->data, GFP_ATOMIC);

This might fail. If it does, you want to at least free up the memory.

> +	virtqueue_kick(sq->vq);
> +}
> +
>  static u32 do_xdp_prog(struct virtnet_info *vi,
>  		       struct bpf_prog *xdp_prog,
>  		       struct page *page, int offset, int len)
>  {
>  	int hdr_padded_len;
>  	struct xdp_buff xdp;
> +	unsigned int qp;
>  	u32 act;
>  	u8 *buf;
>  
> @@ -353,9 +381,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>  	switch (act) {
>  	case XDP_PASS:
>  		return XDP_PASS;
> +	case XDP_TX:
> +		qp = vi->curr_queue_pairs -
> +			vi->xdp_queue_pairs +
> +			smp_processor_id();
> +		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
> +		virtnet_xdp_xmit(vi, qp, &xdp);
> +		return XDP_TX;
>  	default:
>  		bpf_warn_invalid_xdp_action(act);
> -	case XDP_TX:
>  	case XDP_ABORTED:
>  	case XDP_DROP:
>  		return XDP_DROP;
> @@ -387,8 +421,16 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  	if (xdp_prog) {
>  		u32 act = do_xdp_prog(vi, xdp_prog, page, 0, len);
>  
> -		if (act == XDP_DROP)
> +		switch (act) {
> +		case XDP_PASS:
> +			break;
> +		case XDP_TX:
> +			rcu_read_unlock();
> +			goto xdp_xmit;
> +		case XDP_DROP:
> +		default:
>  			goto err_xdp;
> +		}
>  	}
>  	rcu_read_unlock();
>  
> @@ -403,6 +445,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  err:
>  	dev->stats.rx_dropped++;
>  	give_pages(rq, page);
> +xdp_xmit:
>  	return NULL;
>  }
>  
> @@ -421,6 +464,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	struct sk_buff *head_skb, *curr_skb;
>  	struct bpf_prog *xdp_prog;
>  
> +	head_skb = NULL;
> +
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(rq->xdp_prog);
>  	if (xdp_prog) {
> @@ -432,8 +477,15 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  		}
>  
>  		act = do_xdp_prog(vi, xdp_prog, page, offset, len);
> -		if (act == XDP_DROP)
> +		switch (act) {
> +		case XDP_PASS:
> +			break;
> +		case XDP_TX:
> +			goto xdp_xmit;
> +		case XDP_DROP:
> +		default:
>  			goto err_xdp;
> +		}
>  	}
>  	rcu_read_unlock();
>  
> @@ -510,6 +562,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  err_buf:
>  	dev->stats.rx_dropped++;
>  	dev_kfree_skb(head_skb);
> +xdp_xmit:
>  	return NULL;
>  }
>  

^ permalink raw reply

* Re: [PATCH 4/6] net: ethernet: ti: cpts: add ptp pps support
From: Richard Cochran @ 2016-11-30 18:45 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
	linux-omap, Rob Herring, devicetree, Murali Karicheri,
	Wingman Kwok
In-Reply-To: <20161128230428.6872-5-grygorii.strashko@ti.com>

On Mon, Nov 28, 2016 at 05:04:26PM -0600, Grygorii Strashko wrote:
> +static cycle_t cpts_cc_ns2cyc(struct cpts *cpts, u64 nsecs)
> +{
> +	cycle_t cyc = (nsecs << cpts->cc.shift) + nsecs;
> +
> +	do_div(cyc, cpts->cc.mult);
> +
> +	return cyc;
> +}

So you set the comparison value once per second, based on cc.mult.
But when the clock is being actively synchronized, user space calls to
clock_adjtimex() will change cc.mult.  This can happen several times
per second, depending on the PTP Sync rate.

In order to produce the PPS edge correctly, you would have to adjust
the comparison value whenever cc.mult changes, but of course this is
unworkable.

So I'll have to say NAK for this patch.

Thanks,
Richard

^ permalink raw reply

* [PATCH net] cdc_ether: Fix handling connection notification
From: Kristian Evensen @ 2016-11-30 18:42 UTC (permalink / raw)
  To: oliver, linux-usb, netdev, linux-kernel, henning.schild; +Cc: Kristian Evensen

Commit bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling")
introduced a work-around in usbnet_cdc_status() for devices that exported
cdc carrier on twice on connect. Before the commit, this behavior caused
the link state to be incorrect. It was assumed that all CDC Ethernet
devices would either export this behavior, or send one off and then one on
notification (which seems to be the default behavior).

Unfortunately, it turns out multiple devices sends a connection
notification multiple times per second (via an interrupt), even when
connection state does not change. This has been observed with several
different USB LAN dongles (at least), for example 13b1:0041 (Linksys).
After bfe9b9d2df66, the link state has been set as down and then up for
each notification. This has caused a flood of Netlink NEWLINK messages and
syslog to be flooded with messages similar to:

cdc_ether 2-1:2.0 eth1: kevent 12 may have been dropped

This commit fixes the behavior by reverting usbnet_cdc_status() to how it
was before bfe9b9d2df66. The work-around has been moved to a separate
status-function which is only called when a known, affect device is
detected.

Fixes: bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling")
Reported-by: Henning Schild <henning.schild@siemens.com>
Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
---
 drivers/net/usb/cdc_ether.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 45e5e43..8c628ea 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -388,12 +388,6 @@ void usbnet_cdc_status(struct usbnet *dev, struct urb *urb)
 	case USB_CDC_NOTIFY_NETWORK_CONNECTION:
 		netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n",
 			  event->wValue ? "on" : "off");
-
-		/* Work-around for devices with broken off-notifications */
-		if (event->wValue &&
-		    !test_bit(__LINK_STATE_NOCARRIER, &dev->net->state))
-			usbnet_link_change(dev, 0, 0);
-
 		usbnet_link_change(dev, !!event->wValue, 0);
 		break;
 	case USB_CDC_NOTIFY_SPEED_CHANGE:	/* tx/rx rates */
@@ -466,6 +460,36 @@ static int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 	return 1;
 }
 
+/* Ensure correct link state
+ *
+ * Some devices (ZTE MF823/831/910) export two carrier on notifications when
+ * connected. This causes the link state to be incorrect. Work around this by
+ * always setting the state to off, then on.
+ */
+void usbnet_cdc_zte_status(struct usbnet *dev, struct urb *urb)
+{
+	struct usb_cdc_notification *event;
+
+	if (urb->actual_length < sizeof(*event))
+		return;
+
+	event = urb->transfer_buffer;
+
+	if (event->bNotificationType != USB_CDC_NOTIFY_NETWORK_CONNECTION) {
+		usbnet_cdc_status(dev, urb);
+		return;
+	}
+
+	netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n",
+		  event->wValue ? "on" : "off");
+
+	if (event->wValue &&
+	    !test_bit(__LINK_STATE_NOCARRIER, &dev->net->state))
+		usbnet_link_change(dev, 0, 0);
+
+	usbnet_link_change(dev, !!event->wValue, 0);
+}
+
 static const struct driver_info	cdc_info = {
 	.description =	"CDC Ethernet Device",
 	.flags =	FLAG_ETHER | FLAG_POINTTOPOINT,
@@ -481,7 +505,7 @@ static const struct driver_info	zte_cdc_info = {
 	.flags =	FLAG_ETHER | FLAG_POINTTOPOINT,
 	.bind =		usbnet_cdc_zte_bind,
 	.unbind =	usbnet_cdc_unbind,
-	.status =	usbnet_cdc_status,
+	.status =	usbnet_cdc_zte_status,
 	.set_rx_mode =	usbnet_cdc_update_filter,
 	.manage_power =	usbnet_manage_power,
 	.rx_fixup = usbnet_cdc_zte_rx_fixup,
-- 
2.9.3

^ permalink raw reply related

* Re: [net-next PATCH v3 0/6] XDP for virtio_net
From: John Fastabend @ 2016-11-30 18:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161130203450-mutt-send-email-mst@kernel.org>

On 16-11-30 10:35 AM, Michael S. Tsirkin wrote:
> On Tue, Nov 29, 2016 at 12:05:20PM -0800, John Fastabend wrote:
>> This implements virtio_net for the mergeable buffers and big_packet
>> modes. I tested this with vhost_net running on qemu and did not see
>> any issues. For testing num_buf > 1 I added a hack to vhost driver
>> to only but 100 bytes per buffer.
>>
>> There are some restrictions for XDP to be enabled and work well
>> (see patch 3) for more details.
>>
>>   1. LRO must be off
>>   2. MTU must be less than PAGE_SIZE
>>   3. queues must be available to dedicate to XDP
>>   4. num_bufs received in mergeable buffers must be 1
>>   5. big_packet mode must have all data on single page
>>
>> Please review any comments/feedback welcome as always.
>>
>> v2, fixes rcu usage throughout thanks to Eric and the use of
>> num_online_cpus() usage thanks to Jakub.
>>
>> v3, add slowpath patch to handle num_bufs > 1
>>
>> Thanks,
>> John
> 
> BTW this is threaded incorrectly: patch 1/6 isn't a reply to 0/6,
> patches 2 and on are replies to patch 1.
> 

Ah yep, if you mangle the command line git will send the
cover letter even if you have mangled 'to' email addresses but when
it hits a real patch it aborts. At least on my version of git.

> I'm busy until end of week, I'll review Monday. Sorry about the delay.

In the meantime I'll post a v4 with better commit message (Alexei) and
address a corner cases Jakub pointed out.

> 
>> ---
>>
>> John Fastabend (6):
>>       net: virtio dynamically disable/enable LRO
>>       net: xdp: add invalid buffer warning
>>       virtio_net: Add XDP support
>>       virtio_net: add dedicated XDP transmit queues
>>       virtio_net: add XDP_TX support
>>       virtio_net: xdp, add slowpath case for non contiguous buffers
>>
>>
>>  drivers/net/virtio_net.c |  344 +++++++++++++++++++++++++++++++++++++++++++++-
>>  include/linux/filter.h   |    1 
>>  net/core/filter.c        |    6 +
>>  3 files changed, 346 insertions(+), 5 deletions(-)
>>
>> --
>> Signature

^ permalink raw reply

* Re: [v5,1/5] soc: qcom: smem_state: Fix include for ERR_PTR()
From: Valo, Kalle @ 2016-11-30 18:40 UTC (permalink / raw)
  To: Bjorn Andersson
  Cc: k.eugene.e@gmail.com, Andy Gross, wcn36xx@lists.infradead.org,
	linux-wireless@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-arm-msm@vger.kernel.org
In-Reply-To: <877f7vimyg.fsf@kamboji.qca.qualcomm.com>

"Valo, Kalle" <kvalo@qca.qualcomm.com> writes:

> Bjorn Andersson <bjorn.andersson@linaro.org> writes:
>
>> On Wed 16 Nov 10:49 PST 2016, Kalle Valo wrote:
>>
>>> Bjorn Andersson <bjorn.andersson@linaro.org> wrote:
>>> > The correct include file for getting errno constants and ERR_PTR() is
>>> > linux/err.h, rather than linux/errno.h, so fix the include.
>>> > 
>>> > Fixes: e8b123e60084 ("soc: qcom: smem_state: Add stubs for disabled smem_state")
>>> > Acked-by: Andy Gross <andy.gross@linaro.org>
>>> > Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
>>> 
>>> For some reason this fails to compile now. Can you take a look, please?
>>> 
>>> ERROR: "qcom_wcnss_open_channel" [drivers/net/wireless/ath/wcn36xx/wcn36xx.ko] undefined!
>>> make[1]: *** [__modpost] Error 1
>>> make: *** [modules] Error 2
>>> 
>>> 5 patches set to Changes Requested.
>>> 
>>> 9429045 [v5,1/5] soc: qcom: smem_state: Fix include for ERR_PTR()
>>> 9429047 [v5,2/5] wcn36xx: Transition driver to SMD client
>>
>> This patch was updated with the necessary depends in Kconfig to catch
>> this exact issue and when I pull in your .config (which has QCOM_SMD=n,
>> QCOM_WCNSS_CTRL=n and WCN36XX=y) I can build this just fine.
>>
>> I've tested the various combinations and it seems to work fine. Do you
>> have any other patches in your tree?
>
> This was with the pending branch of my ath.git tree. There are other
> wireless patches (ath10k etc) but I would guess they don't affect here.
>
>> Any stale objects?
>
> Not sure what you mean with this question, but I didn't run 'make clean'
> if that's what you are asking.
>
>> Would you mind retesting this, before I invest more time in trying to
>> reproduce the issue you're seeing?
>
> Sure, I'll take a look but that might take few days.

I didn't find enough time to look at this in detail. I applied this to
my ath.git pending branch, let's see what the kbuild bot finds.

-- 
Kalle Valo

^ permalink raw reply

* Re: [net-next PATCH v3 0/6] XDP for virtio_net
From: Michael S. Tsirkin @ 2016-11-30 18:35 UTC (permalink / raw)
  To: John Fastabend
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161129200450.26752.86882.stgit@john-Precision-Tower-5810>

On Tue, Nov 29, 2016 at 12:05:20PM -0800, John Fastabend wrote:
> This implements virtio_net for the mergeable buffers and big_packet
> modes. I tested this with vhost_net running on qemu and did not see
> any issues. For testing num_buf > 1 I added a hack to vhost driver
> to only but 100 bytes per buffer.
> 
> There are some restrictions for XDP to be enabled and work well
> (see patch 3) for more details.
> 
>   1. LRO must be off
>   2. MTU must be less than PAGE_SIZE
>   3. queues must be available to dedicate to XDP
>   4. num_bufs received in mergeable buffers must be 1
>   5. big_packet mode must have all data on single page
> 
> Please review any comments/feedback welcome as always.
> 
> v2, fixes rcu usage throughout thanks to Eric and the use of
> num_online_cpus() usage thanks to Jakub.
> 
> v3, add slowpath patch to handle num_bufs > 1
> 
> Thanks,
> John

BTW this is threaded incorrectly: patch 1/6 isn't a reply to 0/6,
patches 2 and on are replies to patch 1.

I'm busy until end of week, I'll review Monday. Sorry about the delay.

> ---
> 
> John Fastabend (6):
>       net: virtio dynamically disable/enable LRO
>       net: xdp: add invalid buffer warning
>       virtio_net: Add XDP support
>       virtio_net: add dedicated XDP transmit queues
>       virtio_net: add XDP_TX support
>       virtio_net: xdp, add slowpath case for non contiguous buffers
> 
> 
>  drivers/net/virtio_net.c |  344 +++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/filter.h   |    1 
>  net/core/filter.c        |    6 +
>  3 files changed, 346 insertions(+), 5 deletions(-)
> 
> --
> Signature

^ permalink raw reply

* Re: [PATCH v2 07/13] net: ethernet: ti: cpts: rework initialization/deinitialization
From: Grygorii Strashko @ 2016-11-30 18:30 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA, Mugunthan V N,
	Sekhar Nori, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-omap-u79uwXL29TY76Z2rM5mHXA, Rob Herring,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Murali Karicheri, Wingman Kwok
In-Reply-To: <2037427d-0ab6-9446-450d-21c34123c03b-l0cyMroinI0@public.gmane.org>

Hi Richard,

On 11/29/2016 09:50 AM, Grygorii Strashko wrote:
> On 11/29/2016 04:07 AM, Richard Cochran wrote:
>> On Mon, Nov 28, 2016 at 05:03:31PM -0600, Grygorii Strashko wrote:
>>> +int cpts_register(struct cpts *cpts)
>>>  {
>>>  	int err, i;
>>>
>>> -	cpts->info = cpts_info;
>>> -	spin_lock_init(&cpts->lock);
>>> -
>>> -	cpts->cc.read = cpts_systim_read;
>>> -	cpts->cc.mask = CLOCKSOURCE_MASK(32);
>>> -	cpts->cc_mult = mult;
>>> -	cpts->cc.mult = mult;
>>> -	cpts->cc.shift = shift;
>>> -
>>>  	INIT_LIST_HEAD(&cpts->events);
>>>  	INIT_LIST_HEAD(&cpts->pool);
>>>  	for (i = 0; i < CPTS_MAX_EVENTS; i++)
>>>  		list_add(&cpts->pool_data[i].list, &cpts->pool);
>>>
>>> -	cpts_clk_init(dev, cpts);
>>> +	clk_enable(cpts->refclk);
>>> +
>>>  	cpts_write32(cpts, CPTS_EN, control);
>>>  	cpts_write32(cpts, TS_PEND_EN, int_enable);
>>>
>>> +	cpts->cc.mult = cpts->cc_mult;
>>
>> It is not clear why you set cc.mult in a different place than
>> cc.shift.  That isn't logical, but maybe later patches make it
>> clear...
>
> cc.mult has to be reloaded to original value each time CPTS is registered(restarted)
> as it can be modified by cpts_ptp_adjfreq().
>
> While cc.shift is static.
>
>

Will it ok if i will add comment here and re-send series?

-- 
regards,
-grygorii
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next v4 0/4] Fix OdroidC2 Gigabit Tx link issue
From: Florian Fainelli @ 2016-11-30 18:28 UTC (permalink / raw)
  To: Jerome Brunet, netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA
  Cc: Carlo Caione, Kevin Hilman, Giuseppe Cavallaro, Alexandre TORGUE,
	Martin Blumenstingl, Andre Roth, Andrew Lunn, Neil Armstrong,
	linux-amlogic-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Julia Lawall, Yegor Yefremov,
	Andreas Färber
In-Reply-To: <1480499246.17538.208.camel-rdvid1DuHRBWk0Htik3J/w@public.gmane.org>

On 11/30/2016 01:47 AM, Jerome Brunet wrote:
>> If we start supporting generic "enable", "disable" type of properties
>> with values that map directly to register definitions of the HW, we
>> leave too much room for these properties to be utilized to implement
>> a
>> specific policy, and this is not acceptable.
> 
> Florian, 
> 
> I agree that DT should not be used to setup a policy, but to describe
> what the HW is.
> 
> I tried to implement it the way you suggested, using phy fixup, too see
> what it looks like.
> There is 2 places in the code that seems (remotely) linked to the
> issue: 
> - meson8b_dwmac driver : if the mac, regardless of the board/platform,
>  could not tolerate to have EEE activated, it would make sense to have
> the fixup here. It can provide a C callback for such case.
> - realtek phy driver: philosophy is kind of the same
> 
> To be clear, it is doable and it works that way, but I don't think
> embedding this directly in the code is the right way to do it. It seems
> we are hiding an information specific about the board inside a generic
> driver.

So there are a few things about that:

- if we were not on ARM64, there would be possibly a remote chance of
having some concept of a board file which would be where such a PHY
fixup, or fixup of any kind would reside

- having the PHY fixup in the PHY driver gated by both an exact match on
the PHY OUI *and* the specific affected board makes it reasonably easy
to locate it

> 
> We have several amlogic's design with the same MAC, sometimes with the
> same PHY, which have no problem with EEE at all. The issue is really
> about the board design.

OK, not a problem then: of_machine_is_compatible() should help you here?

> 
> What I propose is not an enable/disable configuration switch, but to
> clearly state that a particular mode of operation is broken. Like the
> "max-speed" property, it setup a restriction. IMO, this is a
> description of what the HW is and is capable of, and as such it should
> be part of the DT.

Sure, there is a fine line between describing what's broken, and being
able to use that to actually configure non-broken hardware the way you want.

> 
> Yes the property directly map to a register, but it does let you
> directly manipulate it (you can't pass the value you want to write in
> the register). Having it this way just makes the code simple on both
> ends (user and driver).

That's exactly the part that is giving me the creeps, any property that
directly maps to a register value has a chance of a) leading to hard to
debug problem if mis-configured, and b) being used as a policy as
opposed to purely describing what is going on with the HW.

> 
> Yes people could start abusing this to setup policy. In the end, it is
> our responsibility, as community, to make sure APIs are used in a
> proper way, and not let it be used that way.
> 
> I'm open to suggestion on how improve the solution, maybe something
> which could bring more confidence that property won't be misused.

Once the binding lands in the kernel, there is absolutely zero guarantee
nor visibility in how people end-up using in e.g: DT aware bootloader,
and I am one of these people. Since there is a binding, there is
consumer code in the kernel that needs to behave properly with respect
to how the binding is defined. This is the same problem as with any kind
of ABI, and a diverse range of consumers.
-- 
Florian
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [patch net-next v3 11/12] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Ido Schimmel @ 2016-11-30 18:22 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Jiri Pirko, netdev, davem, idosch, eladr, yotamg, nogahf, arkadis,
	ogerlitz, roopa, dsa, nikolay, andy, vivien.didelot, andrew,
	f.fainelli, alexander.h.duyck, kaber
In-Reply-To: <043a2ba9-1e19-c3cd-a930-59e8cb0d5f7a@stressinduktion.org>

On Wed, Nov 30, 2016 at 05:49:56PM +0100, Hannes Frederic Sowa wrote:
> On 30.11.2016 17:32, Ido Schimmel wrote:
> > On Wed, Nov 30, 2016 at 04:37:48PM +0100, Hannes Frederic Sowa wrote:
> >> On 30.11.2016 11:09, Jiri Pirko wrote:
> >>> From: Ido Schimmel <idosch@mellanox.com>
> >>>
> >>> Make sure the device has a complete view of the FIB tables by invoking
> >>> their dump during module init.
> >>>
> >>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> >>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> >>> ---
> >>>  .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 23 ++++++++++++++++++++++
> >>>  1 file changed, 23 insertions(+)
> >>>
> >>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>> index 14bed1d..d176047 100644
> >>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>> @@ -2027,8 +2027,23 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
> >>>  	return NOTIFY_DONE;
> >>>  }
> >>>  
> >>> +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
> >>> +{
> >>> +	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
> >>> +
> >>> +	/* Flush pending FIB notifications and then flush the device's
> >>> +	 * table before requesting another dump. Do that with RTNL held,
> >>> +	 * as FIB notification block is already registered.
> >>> +	 */
> >>> +	mlxsw_core_flush_owq();
> >>> +	rtnl_lock();
> >>> +	mlxsw_sp_router_fib_flush(mlxsw_sp);
> >>> +	rtnl_unlock();
> >>> +}
> >>> +
> >>>  int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
> >>>  {
> >>> +	fib_dump_cb_t *cb = mlxsw_sp_router_fib_dump_flush;
> >>>  	int err;
> >>>  
> >>>  	INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_neighs_list);
> >>> @@ -2048,8 +2063,16 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
> >>>  
> >>>  	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
> >>>  	register_fib_notifier(&mlxsw_sp->fib_nb);
> >>
> >> Sorry to pick in here again:
> >>
> >> There is a race here. You need to protect the registration of the fib
> >> notifier as well by the sequence counter. Updates here are not ordered
> >> in relation to this code below.
> > 
> > You mean updates that can be received after you registered the notifier
> > and until the dump started? I'm aware of that and that's OK. This
> > listener should be able to handle duplicates.
> 
> I am not concerned about duplicates, but about ordering deletes and
> getting an add from the RCU code you will add the node to hw while it is
> deleted in the software path. You probably will ignore the delete
> because nothing is installed in hw and later add the node which was
> actually deleted but just reordered which happend on another CPU, no?

Are you referring to reordering in the workqueue? We already covered
this using an ordered workqueue, which has one context of execution
system-wide.

> > I've a follow up patchset that introduces a new event in switchdev
> > notification chain called SWITCHDEV_SYNC, which is sent when port
> > netdevs are enslaved / released  from a master device (points in time
> > where kernel<->device can get out of sync). It will invoke
> > re-propagation of configuration from different parts of the stack
> > (e.g. bridge driver, 8021q driver, fib/neigh code), which can result
> > in duplicates.
> 
> Okay, understood. I wonder how we can protect against accidentally abort
> calls actually. E.g. if I start to inject routes into my routing domain
> how can I make sure the box doesn't die after I try to insert enough
> routes. Do we need to touch quagga etc?

The whole point of moving abort mechanism to the driver is that the
system won't die, but instead routing will be done in the kernel. If you
respect hardware limitations, then there's no reason for abort mechanism
to kick in.

^ permalink raw reply

* Re: [PATCH 1/6] net: ethernet: ti: netcp: add support of cpts
From: Richard Cochran @ 2016-11-30 18:22 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
	linux-omap, Rob Herring, devicetree, Murali Karicheri,
	Wingman Kwok
In-Reply-To: <4d69aff4-70a0-e6b8-38b0-8e95cfea7601@ti.com>

On Wed, Nov 30, 2016 at 11:31:56AM -0600, Grygorii Strashko wrote:

> ok. Seems my assumption that ndo_open/ndo_close serialized by
> rtnl_lock is incorrect. Right?

No, you were right in the first place.  The open/close are indeed
serialized by the rtnl lock.

Sorry for the noise,
Richard

^ permalink raw reply

* [PATCH net-next v6 4/6] bpf: Add support for reading socket family, type, protocol
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add socket family, type and protocol to bpf_sock allowing bpf programs
read-only access.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6 of set

 include/net/sock.h       | 15 +++++++++++++++
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 442cbb118a07..69afda6bea15 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -389,6 +389,21 @@ struct sock {
 	 * Because of non atomicity rules, all
 	 * changes are protected by socket lock.
 	 */
+	unsigned int		__sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
 	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 2,
 				sk_no_check_tx : 1,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 75964e00d947..b47ffd117fd6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -541,6 +541,9 @@ struct bpf_tunnel_key {
 
 struct bpf_sock {
 	__u32 bound_dev_if;
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
 };
 
 /* User return codes for XDP prog type.
diff --git a/net/core/filter.c b/net/core/filter.c
index 5ee722dc097d..ddc86efe1911 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2960,6 +2960,33 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+#define SOCKF_AD_TYPE     1
+#define SOCKF_AD_PROTOCOL 2
+
+static u32 convert_sock_access(int sock_field, int dst_reg, int src_reg,
+			       struct bpf_insn *insn_buf)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (sock_field) {
+	case SOCKF_AD_TYPE:
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, __sk_flags_offset));
+		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+		break;
+
+	case SOCKF_AD_PROTOCOL:
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, __sk_flags_offset));
+		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 					  int dst_reg, int src_reg,
 					  int ctx_off,
@@ -2979,6 +3006,21 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
+
+	case offsetof(struct bpf_sock, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+				      offsetof(struct sock, sk_family));
+		break;
+
+	case offsetof(struct bpf_sock, type):
+		return convert_sock_access(SOCKF_AD_TYPE, dst_reg, src_reg,
+					   insn_buf);
+
+	case offsetof(struct bpf_sock, protocol):
+		return convert_sock_access(SOCKF_AD_PROTOCOL, dst_reg, src_reg,
+					   insn_buf);
 	}
 
 	return insn - insn_buf;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 5/6] samples/bpf: Update bpf loader for cgroup section names
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add support for section names starting with cgroup/skb and cgroup/sock.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6

 samples/bpf/bpf_load.c | 14 +++++++++++---
 samples/bpf/bpf_load.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 62f54d6eb8bf..49b45ccbe153 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -52,6 +52,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 	bool is_xdp = strncmp(event, "xdp", 3) == 0;
 	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
+	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
+	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -72,6 +74,10 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_XDP;
 	} else if (is_perf_event) {
 		prog_type = BPF_PROG_TYPE_PERF_EVENT;
+	} else if (is_cgroup_skb) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
+	} else if (is_cgroup_sk) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -85,7 +91,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
-	if (is_xdp || is_perf_event)
+	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
 		return 0;
 
 	if (is_socket) {
@@ -334,7 +340,8 @@ int load_bpf_file(char *path)
 			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
 			    memcmp(shname_prog, "xdp", 3) == 0 ||
 			    memcmp(shname_prog, "perf_event", 10) == 0 ||
-			    memcmp(shname_prog, "socket", 6) == 0)
+			    memcmp(shname_prog, "socket", 6) == 0 ||
+			    memcmp(shname_prog, "cgroup/", 7) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
 	}
@@ -353,7 +360,8 @@ int load_bpf_file(char *path)
 		    memcmp(shname, "tracepoint/", 11) == 0 ||
 		    memcmp(shname, "xdp", 3) == 0 ||
 		    memcmp(shname, "perf_event", 10) == 0 ||
-		    memcmp(shname, "socket", 6) == 0)
+		    memcmp(shname, "socket", 6) == 0 ||
+		    memcmp(shname, "cgroup/", 7) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
 
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index dfa57fe65c8e..4adeeef53ad6 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -7,6 +7,7 @@
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
 extern int event_fd[MAX_PROGS];
+extern int prog_cnt;
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 6/6] samples/bpf: add userspace example for prohibiting sockets
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add examples preventing a process in a cgroup from opening a socket
based family, protocol and type.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6

 samples/bpf/Makefile            |  4 ++
 samples/bpf/sock_flags_kern.c   | 37 +++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.c  | 66 +++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.sh | 81 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 188 insertions(+)
 create mode 100644 samples/bpf/sock_flags_kern.c
 create mode 100644 samples/bpf/test_cgrp2_sock2.c
 create mode 100755 samples/bpf/test_cgrp2_sock2.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a335b218198e..8df12f9429dc 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
 hostprogs-y += test_cgrp2_sock
+hostprogs-y += test_cgrp2_sock2
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -53,6 +54,7 @@ test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
 test_cgrp2_sock-objs := libbpf.o test_cgrp2_sock.o
+test_cgrp2_sock2-objs := bpf_load.o libbpf.o test_cgrp2_sock2.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
@@ -73,6 +75,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
 always += tcbpf1_kern.o
@@ -106,6 +109,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
new file mode 100644
index 000000000000..d6a7f0013a5d
--- /dev/null
+++ b/samples/bpf/sock_flags_kern.c
@@ -0,0 +1,37 @@
+#include <uapi/linux/bpf.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+
+SEC("cgroup/sock1")
+int bpf_prog1(struct bpf_sock *sk)
+{
+	char fmt[] = "socket: family %d type %d protocol %d\n";
+
+	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+
+	/* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
+	 * ie., make ping6 fail
+	 */
+	if (sk->family == PF_INET6 && sk->type == 3 && sk->protocol == 58)
+		return 0;
+
+	return 1;
+}
+
+SEC("cgroup/sock2")
+int bpf_prog2(struct bpf_sock *sk)
+{
+	char fmt[] = "socket: family %d type %d protocol %d\n";
+
+	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+
+	/* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets
+	 * ie., make ping fail
+	 */
+	if (sk->family == PF_INET && sk->type == 3 && sk->protocol == 1)
+		return 0;
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
new file mode 100644
index 000000000000..455ef0d06e93
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -0,0 +1,66 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program loads a filter from file and attaches the
+ *   program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s cg-path filter-path [filter-id]\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, ret, filter_id = 0;
+
+	if (argc < 3)
+		return usage(argv[0]);
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	if (load_bpf_file(argv[2]))
+		return EXIT_FAILURE;
+
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+	if (argc > 3)
+		filter_id = atoi(argv[3]);
+
+	if (filter_id > prog_cnt) {
+		printf("Invalid program id; program not found in file\n");
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_attach(prog_fd[filter_id], cg_fd,
+			      BPF_CGROUP_INET_SOCK_CREATE);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh
new file mode 100755
index 000000000000..891f12a0e26f
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth0b
+	ip link set veth0b up
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add 172.16.1.101/24 dev veth0b
+	ip addr add 2401:db00::2/64 dev veth0b nodad
+}
+
+function config_cgroup {
+	rm -rf /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2
+	mount -t cgroup2 none /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2/foo
+	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+
+function attach_bpf {
+	test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1
+	[ $? -ne 0 ] && exit 1
+}
+
+function cleanup {
+	ip link del veth0b
+	ip netns delete at_ns0
+	umount /tmp/cgroupv2
+	rm -rf /tmp/cgroupv2
+}
+
+cleanup 2>/dev/null
+
+set -e
+config_device
+config_cgroup
+set +e
+
+#
+# Test 1 - fail ping6
+#
+attach_bpf 0
+ping -c1 -w1 172.16.1.100
+if [ $? -ne 0 ]; then
+	echo "ping failed when it should succeed"
+	cleanup
+	exit 1
+fi
+
+ping6 -c1 -w1 2401:db00::1
+if [ $? -eq 0 ]; then
+	echo "ping6 succeeded when it should not"
+	cleanup
+	exit 1
+fi
+
+#
+# Test 2 - fail ping
+#
+attach_bpf 1
+ping6 -c1 -w1 2401:db00::1
+if [ $? -ne 0 ]; then
+	echo "ping6 failed when it should succeed"
+	cleanup
+	exit 1
+fi
+
+ping -c1 -w1 172.16.1.100
+if [ $? -eq 0 ]; then
+	echo "ping succeeded when it should not"
+	cleanup
+	exit 1
+fi
+
+cleanup
+echo
+echo "*** PASS ***"
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 3/6] samples: bpf: add userspace example for modifying sk_bound_dev_if
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add a simple program to demonstrate the ability to attach a bpf program
to a cgroup that sets sk_bound_dev_if for AF_INET{6} sockets when they
are created.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- added conversion from device name to index in test program

v5
- changed BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v4
- added test_cgrp2_sock.sh for an automated test

v3
- revert to BPF_PROG_TYPE_CGROUP_SOCK prog type

v2
- removed bpf_sock_store_u32 references
- changed BPF_CGROUP_INET_SOCK_CREATE to BPF_CGROUP_INET_SOCK
- remove BPF_PROG_TYPE_CGROUP_SOCK prog type and add prog_subtype

 samples/bpf/Makefile           |  2 +
 samples/bpf/test_cgrp2_sock.c  | 83 ++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.sh | 47 ++++++++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_sock.c
 create mode 100755 samples/bpf/test_cgrp2_sock.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3ceb5a9d86df..a335b218198e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -23,6 +23,7 @@ hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_sock
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -51,6 +52,7 @@ map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
+test_cgrp2_sock-objs := libbpf.o test_cgrp2_sock.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
new file mode 100644
index 000000000000..d467b3c1c55c
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -0,0 +1,83 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
+ *   sockets opened by processes in the cgroup.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", 0);
+}
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s cg-path device-index\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, prog_fd, ret;
+	unsigned int idx;
+
+	if (argc < 2)
+		return usage(argv[0]);
+
+	idx = if_nametoindex(argv[2]);
+	if (!idx) {
+		printf("Invalid device name\n");
+		return EXIT_FAILURE;
+	}
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	prog_fd = prog_load(idx);
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+	if (prog_fd < 0) {
+		printf("Failed to load prog: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
new file mode 100755
index 000000000000..925fd467c7cc
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth0b
+	ip link set veth0b up
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip link add foo type vrf table 1234
+	ip link set foo up
+	ip addr add 172.16.1.101/24 dev veth0b
+	ip addr add 2401:db00::2/64 dev veth0b nodad
+	ip link set veth0b master foo
+}
+
+function attach_bpf {
+	rm -rf /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2
+	mount -t cgroup2 none /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2/foo
+	test_cgrp2_sock /tmp/cgroupv2/foo foo
+	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+function cleanup {
+	set +ex
+	ip netns delete at_ns0
+	ip link del veth0
+	ip link del foo
+	umount /tmp/cgroupv2
+	rm -rf /tmp/cgroupv2
+	set -ex
+}
+
+function do_test {
+	ping -c1 -w1 172.16.1.100
+	ping6 -c1 -w1 2401:db00::1
+}
+
+cleanup 2>/dev/null
+config_device
+attach_bpf
+do_test
+cleanup
+echo "*** PASS ***"
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 2/6] bpf: Add new cgroup attach type to enable sock modifications
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- added size check to sock_filter_is_valid_access; accesses must be u32

v5
- no change

v4
- dropped tweak to bpf_func signature
- dropped cg_sock_func_proto in favor of sk_filter_func_proto
- new __cgroup_bpf_run_filter_sk versus overloading __cgroup_bpf_run_filter
- reverted BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v3
- reverted to new prog type BPF_PROG_TYPE_CGROUP_SOCK
- dropped the subtype

v2
- dropped the bpf_sock_store_u32 helper
- dropped the new prog type BPF_PROG_TYPE_CGROUP_SOCK
- moved valid access and context conversion to use subtype
- dropped CREATE from BPF_CGROUP_INET_SOCK and related function names
- moved running of filter from sk_alloc to inet{6}_create

 include/linux/bpf-cgroup.h | 14 +++++++++++
 include/uapi/linux/bpf.h   |  6 +++++
 kernel/bpf/cgroup.c        | 33 ++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  5 +++-
 net/core/filter.c          | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c         | 12 ++++++++-
 net/ipv6/af_inet6.c        |  8 ++++++
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7f0fc635b13e..7de376e37c5c 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -41,6 +41,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -64,6 +67,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
+						 BPF_CGROUP_INET_SOCK_CREATE); \
+	}								       \
+	__ret;								       \
+})
+
 #else
 
 struct cgroup_bpf {};
@@ -73,6 +86,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d1456f..75964e00d947 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,11 +101,13 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -537,6 +539,10 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8fe55ffd109d..a515f7b007c6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5518a6839ab1..85af86c496cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_EGRESS:
 		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
-
+	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262b8ebb..5ee722dc097d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2676,6 +2676,32 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off + size > sizeof(struct bpf_sock))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -2934,6 +2960,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,
@@ -3007,6 +3057,12 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops cg_sock_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sock_filter_is_valid_access,
+	.convert_ctx_access	= sock_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3088,11 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+	.ops	= &cg_sock_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SOCK
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3100,7 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&cg_sock_type);
 
 	return 0;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cda07f4..24d2550492ee 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
-		if (err)
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
 			sk_common_release(sk);
+			goto out;
+		}
 	}
 out:
 	return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a3737a..237e654ba717 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -258,6 +258,14 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 			goto out;
 		}
 	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 out:
 	return err;
 out_rcu_unlock:
-- 
2.1.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox