Netdev List
 help / color / mirror / Atom feed
* [PATCH net] packet: fix bitfield update race
From: Willem de Bruijn @ 2018-04-23 21:37 UTC (permalink / raw)
  To: netdev
  Cc: byoungyoung, threeearcat, xiyou.wangcong, davem, herbert,
	Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Updates to the bitfields in struct packet_sock are not atomic.
Serialize these read-modify-write cycles.

Move po->running into a separate variable. Its writes are protected by
po->bind_lock (except for one startup case at packet_create). Also
replace a textual precondition warning with lockdep annotation.

All others are set only in packet_setsockopt. Serialize these
updates by holding the socket lock. Analogous to other field updates,
also hold the lock when testing whether a ring is active (pg_vec).

Fixes: 8dc419447415 ("[PACKET]: Add optional checksum computation for recvmsg")
Reported-by: DaeRyong Jeong <threeearcat@gmail.com>
Reported-by: Byoungyoung Lee <byoungyoung@purdue.edu>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/packet/af_packet.c | 60 +++++++++++++++++++++++++++++++-----------
 net/packet/internal.h  | 10 +++----
 2 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c31b0687396a..01f3515cada0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -329,11 +329,11 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 	skb_set_queue_mapping(skb, queue_index);
 }
 
-/* register_prot_hook must be invoked with the po->bind_lock held,
+/* __register_prot_hook must be invoked through register_prot_hook
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
  */
-static void register_prot_hook(struct sock *sk)
+static void __register_prot_hook(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 
@@ -348,8 +348,13 @@ static void register_prot_hook(struct sock *sk)
 	}
 }
 
-/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
- * held.   If the sync parameter is true, we will temporarily drop
+static void register_prot_hook(struct sock *sk)
+{
+	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
+	__register_prot_hook(sk);
+}
+
+/* If the sync parameter is true, we will temporarily drop
  * the po->bind_lock and do a synchronize_net to make sure no
  * asynchronous packet processing paths still refer to the elements
  * of po->prot_hook.  If the sync parameter is false, it is the
@@ -359,6 +364,8 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 {
 	struct packet_sock *po = pkt_sk(sk);
 
+	lockdep_assert_held_once(&po->bind_lock);
+
 	po->running = 0;
 
 	if (po->fanout)
@@ -3252,7 +3259,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 
 	if (proto) {
 		po->prot_hook.type = proto;
-		register_prot_hook(sk);
+		__register_prot_hook(sk);
 	}
 
 	mutex_lock(&net->packet.sklist_lock);
@@ -3732,12 +3739,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (optlen != sizeof(val))
 			return -EINVAL;
-		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
-			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
-		po->tp_loss = !!val;
-		return 0;
+
+		lock_sock(sk);
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+			ret = -EBUSY;
+		} else {
+			po->tp_loss = !!val;
+			ret = 0;
+		}
+		release_sock(sk);
+		return ret;
 	}
 	case PACKET_AUXDATA:
 	{
@@ -3748,7 +3761,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
 
+		lock_sock(sk);
 		po->auxdata = !!val;
+		release_sock(sk);
 		return 0;
 	}
 	case PACKET_ORIGDEV:
@@ -3760,7 +3775,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
 
+		lock_sock(sk);
 		po->origdev = !!val;
+		release_sock(sk);
 		return 0;
 	}
 	case PACKET_VNET_HDR:
@@ -3769,15 +3786,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (sock->type != SOCK_RAW)
 			return -EINVAL;
-		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
-			return -EBUSY;
 		if (optlen < sizeof(val))
 			return -EINVAL;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		po->has_vnet_hdr = !!val;
-		return 0;
+		lock_sock(sk);
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+			ret = -EBUSY;
+		} else {
+			po->has_vnet_hdr = !!val;
+			ret = 0;
+		}
+		release_sock(sk);
+		return ret;
 	}
 	case PACKET_TIMESTAMP:
 	{
@@ -3815,11 +3837,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (optlen != sizeof(val))
 			return -EINVAL;
-		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
-			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
-		po->tp_tx_has_off = !!val;
+
+		lock_sock(sk);
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+			ret = -EBUSY;
+		} else {
+			po->tp_tx_has_off = !!val;
+			ret = 0;
+		}
+		release_sock(sk);
 		return 0;
 	}
 	case PACKET_QDISC_BYPASS:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a1d2b2319ae9..3bb7c5fb3bff 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -112,10 +112,12 @@ struct packet_sock {
 	int			copy_thresh;
 	spinlock_t		bind_lock;
 	struct mutex		pg_vec_lock;
-	unsigned int		running:1,	/* prot_hook is attached*/
-				auxdata:1,
+	unsigned int		running;	/* bind_lock must be held */
+	unsigned int		auxdata:1,	/* writer must hold sock lock */
 				origdev:1,
-				has_vnet_hdr:1;
+				has_vnet_hdr:1,
+				tp_loss:1,
+				tp_tx_has_off:1;
 	int			pressure;
 	int			ifindex;	/* bound device		*/
 	__be16			num;
@@ -125,8 +127,6 @@ struct packet_sock {
 	enum tpacket_versions	tp_version;
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
-	unsigned int		tp_loss:1;
-	unsigned int		tp_tx_has_off:1;
 	unsigned int		tp_tstamp;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* Re: [PATCH net-next 0/4] mm,tcp: provide mmap_hook to solve lockdep issue
From: Eric Dumazet @ 2018-04-23 21:38 UTC (permalink / raw)
  To: Andy Lutomirski, Eric Dumazet, David S . Miller
  Cc: netdev, linux-kernel, Soheil Hassas Yeganeh, Eric Dumazet,
	linux-mm, Linux API
In-Reply-To: <9ed6083f-d731-945c-dbcd-f977c5600b03@kernel.org>

Hi Andy

On 04/23/2018 02:14 PM, Andy Lutomirski wrote:
> On 04/20/2018 08:55 AM, Eric Dumazet wrote:
>> This patch series provide a new mmap_hook to fs willing to grab
>> a mutex before mm->mmap_sem is taken, to ensure lockdep sanity.
>>
>> This hook allows us to shorten tcp_mmap() execution time (while mmap_sem
>> is held), and improve multi-threading scalability.
>>
> 
> I think that the right solution is to rework mmap() on TCP sockets a bit.  The current approach in net-next is very strange for a few reasons:
> 
> 1. It uses mmap() as an operation that has side effects besides just creating a mapping.  If nothing else, it's surprising, since mmap() doesn't usually do that.  But it's also causing problems like what you're seeing.
> 
> 2. The performance is worse than it needs to be.  mmap() is slow, and I doubt you'll find many mm developers who consider this particular abuse of mmap() to be a valid thing to optimize for.
> 
> 3. I'm not at all convinced the accounting is sane.  As far as I can tell, you're allowing unprivileged users to increment the count on network-owned pages, limited only by available virtual memory, without obviously charging it to the socket buffer limits.  It looks like a program that simply forgot to call munmap() would cause the system to run out of memory, and I see no reason to expect the OOM killer to have any real chance of killing the right task.

> 
> 4. Error handling sucks.  If I try to mmap() a large range (which is the whole point -- using a small range will kill performance) and not quite all of it can be mapped, then I waste a bunch of time in the kernel and get *none* of the range mapped.
> 
> I would suggest that you rework the interface a bit.  First a user would call mmap() on a TCP socket, which would create an empty VMA.  (It would set vm_ops to point to tcp_vm_ops or similar so that the TCP code could recognize it, but it would have no effect whatsoever on the TCP state machine.  Reading the VMA would get SIGBUS.)  Then a user would call a new ioctl() or setsockopt() function and pass something like:


> 
> struct tcp_zerocopy_receive {
>   void *address;
>   size_t length;
> };
> 
> The kernel would verify that [address, address+length) is entirely inside a single TCP VMA and then would do the vm_insert_range magic.

I have no idea what is the proper API for that.
Where the TCP VMA(s) would be stored ?
In TCP socket, or MM layer ?


And I am not sure why the error handling would be better (point 4), unless we can return smaller @length than requested maybe ?

Also how the VMA space would be accounted (point 3) when creating an empty VMA (no pages in there yet)

  On success, length is changed to the length that actually got mapped.  The kernel could do this while holding mmap_sem for *read*, and it could get the lock ordering right.  If and when mm range locks ever get merged, it could switch to using a range lock.
> 
> Then you could use MADV_DONTNEED or another ioctl/setsockopt to zap the part of the mapping that you're done with.
> 
> Does this seem reasonable?  It should involve very little code change, it will run faster, it will scale better, and it is much less weird IMO.

Maybe, although I do not see the 'little code change' yet.

But at least, this seems pretty nice idea, especially if it could allow us to fill the mmap()ed area later when packets are received.

^ permalink raw reply

* Re: WARNING in refcount_dec
From: Willem de Bruijn @ 2018-04-23 21:38 UTC (permalink / raw)
  To: DaeRyong Jeong
  Cc: Cong Wang, Byoungyoung Lee, LKML, Kyungtae Kim,
	Linux Kernel Network Developers, Willem de Bruijn
In-Reply-To: <CAF=yD-JR1Bg1FLat41Uua9vi1-ALyuQttChAWyWz0Yx6_01_BA@mail.gmail.com>

On Thu, Apr 19, 2018 at 2:55 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Thu, Apr 19, 2018 at 2:32 AM, DaeRyong Jeong <threeearcat@gmail.com> wrote:
>> Hello.
>> We have analyzed the cause of the crash in v4.16-rc3, WARNING in refcount_dec,
>> which is found by RaceFuzzer (a modified version of Syzkaller).
>>
>> Since struct packet_sock's member variables, running, has_vnet_hdr, origdev
>> and auxdata are declared as bitfields, accessing these variables can race if
>> there is no synchronization mechanism.
>
> Great catch.
>
> These fields po->{running, auxdata, origdev, has_vnet_hdr} are
> accessed without a uniform locking strategy.
>
> po->running is always accessed with po->bind_lock held (with the
> exception of reading in packet_seq_show, but that is best effort).
>
> That is the only field written to outside setsockopt. If it is moved to
> a separate word, it will no longer interfere with the others.
>
> The other fields are read lockless in the various recv and send
> functions, but only set in setsockopt. We've had enough
> locking bugs around setsockopt that I suggest we wrap all of
> those in lock_sock, like the example I gave before for
> has_vnet_hdr.

Sent http://patchwork.ozlabs.org/patch/903190/

^ permalink raw reply

* dev_loopback_xmit parameters
From: Emanuele @ 2018-04-23 21:40 UTC (permalink / raw)
  To: Linux Kernel Network Developers

Hello,

I don't know if this is the right place where to ask, but I was 
wondering why the dev_loopback_xmit function defined in /net/core/dev.c 
takes struct net * and struct sock *  as parameters. They are never 
used, so I believe passing only the struct  sk_buff * should be enough.

In addition, it would like to know where I can read what is and how to 
set a skb dst_entry, since I don't really understand it.

Thanks a lot,

Emanuele

^ permalink raw reply

* Re: [bpf PATCH 1/2] bpf: Document sockmap '-target bpf' requirement for PROG_TYPE_SK_MSG
From: Daniel Borkmann @ 2018-04-23 21:44 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423191102.21348.85601.stgit@john-Precision-Tower-5810>

Applied both to bpf tree, thanks John!

^ permalink raw reply

* [PATCH net-next] tcp: md5: only call tp->af_specific->md5_lookup() for md5 sockets
From: Eric Dumazet @ 2018-04-23 21:46 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

RETPOLINE made calls to tp->af_specific->md5_lookup() quite expensive,
given they have no result.
We can omit the calls for sockets that have no md5 keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383cac0ff0ec059ca7dbc1a6304cc7f8183e008d..95feffb6d53f8a9eadfb15a2fffeec498d6e993a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -585,14 +585,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (*md5) {
-		opts->options |= OPTION_MD5;
-		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	/* We always get an MSS option.  The option bytes which will be seen in
@@ -720,14 +721,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	opts->options = 0;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (unlikely(*md5)) {
-		opts->options |= OPTION_MD5;
-		size += TCPOLEN_MD5SIG_ALIGNED;
+	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			size += TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [PATCH v2] net: qrtr: Expose tunneling endpoint to user space
From: Bjorn Andersson @ 2018-04-23 21:46 UTC (permalink / raw)
  To: David S. Miller; +Cc: Chris Lew, linux-kernel, netdev, linux-arm-msm

This implements a misc character device named "qrtr-tun" for the purpose
of allowing user space applications to implement endpoints in the qrtr
network.

This allows more advanced (and dynamic) testing of the qrtr code as well
as opens up the ability of tunneling qrtr over a network or USB link.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---

Changes since v1:
- Dropped queue lock

 net/qrtr/Kconfig  |   7 +++
 net/qrtr/Makefile |   2 +
 net/qrtr/tun.c    | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)
 create mode 100644 net/qrtr/tun.c

diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
 	  Say Y here to support SMD based ipcrouter channels.  SMD is the
 	  most common transport for IPC Router.
 
+config QRTR_TUN
+	tristate "TUN device for Qualcomm IPC Router"
+	---help---
+	  Say Y here to expose a character device that allows user space to
+	  implement endpoints of QRTR, for purpose of tunneling data to other
+	  hosts or testing purposes.
+
 endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
 
 obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
 qrtr-smd-y	:= smd.o
+obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
+qrtr-tun-y	:= tun.o
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..48b2147c98f8
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Linaro Ltd */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/uaccess.h>
+
+#include "qrtr.h"
+
+struct qrtr_tun {
+	struct qrtr_endpoint ep;
+
+	struct sk_buff_head queue;
+	wait_queue_head_t readq;
+};
+
+static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+	struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep);
+
+	skb_queue_tail(&tun->queue, skb);
+
+	/* wake up any blocking processes, waiting for new data */
+	wake_up_interruptible(&tun->readq);
+
+	return 0;
+}
+
+static int qrtr_tun_open(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun;
+
+	tun = kzalloc(sizeof(*tun), GFP_KERNEL);
+	if (!tun)
+		return -ENOMEM;
+
+	skb_queue_head_init(&tun->queue);
+	init_waitqueue_head(&tun->readq);
+
+	tun->ep.xmit = qrtr_tun_send;
+
+	filp->private_data = tun;
+
+	return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
+}
+
+static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+	int count;
+
+	while (!(skb = skb_dequeue(&tun->queue))) {
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		/* Wait until we get data or the endpoint goes away */
+		if (wait_event_interruptible(tun->readq,
+					     !skb_queue_empty(&tun->queue)))
+			return -ERESTARTSYS;
+	}
+
+	count = min_t(size_t, iov_iter_count(to), skb->len);
+	if (copy_to_iter(skb->data, count, to) != count)
+		count = -EFAULT;
+
+	kfree_skb(skb);
+
+	return count;
+}
+
+static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	size_t len = iov_iter_count(from);
+	ssize_t ret;
+	void *kbuf;
+
+	kbuf = kzalloc(len, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (!copy_from_iter_full(kbuf, len, from))
+		return -EFAULT;
+
+	ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+
+	return ret < 0 ? ret : len;
+}
+
+static int qrtr_tun_release(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+
+	qrtr_endpoint_unregister(&tun->ep);
+
+	/* Discard all SKBs */
+	while (!skb_queue_empty(&tun->queue)) {
+		skb = skb_dequeue(&tun->queue);
+		kfree_skb(skb);
+	}
+
+	kfree(tun);
+
+	return 0;
+}
+
+static const struct file_operations qrtr_tun_ops = {
+	.owner = THIS_MODULE,
+	.open = qrtr_tun_open,
+	.read_iter = qrtr_tun_read_iter,
+	.write_iter = qrtr_tun_write_iter,
+	.release = qrtr_tun_release,
+};
+
+static struct miscdevice qrtr_tun_miscdev = {
+	MISC_DYNAMIC_MINOR,
+	"qrtr-tun",
+	&qrtr_tun_ops,
+};
+
+static int __init qrtr_tun_init(void)
+{
+	int ret;
+
+	ret = misc_register(&qrtr_tun_miscdev);
+	if (ret)
+		pr_err("failed to register Qualcomm IPC Router tun device\n");
+
+	return ret;
+}
+
+static void __exit qrtr_tun_exit(void)
+{
+	misc_deregister(&qrtr_tun_miscdev);
+}
+
+module_init(qrtr_tun_init);
+module_exit(qrtr_tun_exit);
+
+MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
+MODULE_LICENSE("GPL v2");
-- 
2.16.2

^ permalink raw reply related

* Re: [bpf PATCH v2 1/3] bpf: sockmap, map_release does not hold refcnt for pinned maps
From: Daniel Borkmann @ 2018-04-23 21:53 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423182929.17999.59712.stgit@john-Precision-Tower-5810>

On 04/23/2018 08:29 PM, John Fastabend wrote:
> Relying on map_release hook to decrement the reference counts when a
> map is removed only works if the map is not being pinned. In the
> pinned case the ref is decremented immediately and the BPF programs
> released. After this BPF programs may not be in-use which is not
> what the user would expect.
> 
> This patch moves the release logic into bpf_map_put_uref() and brings
> sockmap in-line with how a similar case is handled in prog array maps.
> 
> Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>

Patches look good, but one trivial request below.

[...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 4ca46df..4b70439 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -257,8 +257,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
>  static void bpf_map_put_uref(struct bpf_map *map)
>  {
>  	if (atomic_dec_and_test(&map->usercnt)) {
> -		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
> -			bpf_fd_array_map_clear(map);
> +		if (map->ops->map_put_uref)
> +			map->ops->map_put_uref(map);

Could you change the callback name into something like 'map_release_uref'?
Naming it 'map_put_uref' is a bit confusing since this is only called when
the uref reference count already dropped to zero, and here we really release
the last reference point to user space. Given this is BPF core infra, would
be nice to still fix this up before applying.

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH] selftests: bpf: update .gitignore with missing file
From: Anders Roxell @ 2018-04-23 22:14 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: ast, Shuah Khan, netdev, Linux Kernel Mailing List,
	linux-kselftest
In-Reply-To: <5ea150a6-8af1-3aa8-7ff0-5fdb64054bab@iogearbox.net>

On 23 April 2018 at 23:34, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 04/23/2018 03:50 PM, Anders Roxell wrote:
>> Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
>> ---
>>  tools/testing/selftests/bpf/.gitignore | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
>> index 5e1ab2f0eb79..3e3b3ced3f7c 100644
>> --- a/tools/testing/selftests/bpf/.gitignore
>> +++ b/tools/testing/selftests/bpf/.gitignore
>> @@ -15,3 +15,4 @@ test_libbpf_open
>>  test_sock
>>  test_sock_addr
>>  urandom_read
>> +test_btf
>
> Against which tree is this? This doesn't apply to bpf-next. It would
> apply against bpf tree, but c0fa1b6c3efc ("bpf: btf: Add BTF tests")
> is part of bpf-next, so fits to neither of them.

I'm sorry,

I did it against this patch [1] that I thought was already applied to
the bpf tree.

Cheers,
Anders
[1] https://patchwork.kernel.org/patch/10332907/

^ permalink raw reply

* [PATCH 0/4] A few rhashtables cleanups
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel

2 patches fixes documentation
1 fixes a bit in rhashtable_walk_start()
1 improves rhashtable_walk stability.

All reviewed and Acked.

Thanks,
NeilBrown


---

NeilBrown (4):
      rhashtable: remove outdated comments about grow_decision etc
      rhashtable: Revise incorrect comment on r{hl,hash}table_walk_enter()
      rhashtable: reset iter when rhashtable_walk_start sees new table
      rhashtable: improve rhashtable_walk stability when stop/start used.


 include/linux/rhashtable.h |   38 +++++++++++++++------------------
 lib/rhashtable.c           |   51 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 26 deletions(-)

--
Signature

^ permalink raw reply

* [PATCH 1/4] rhashtable: remove outdated comments about grow_decision etc
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>

grow_decision and shink_decision no longer exist, so remove
the remaining references to them.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 1f8ad121eb43..87d443a5b11d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -836,9 +836,8 @@ static inline void *__rhashtable_insert_fast(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhashtable_insert_fast(
 	struct rhashtable *ht, struct rhash_head *obj,
@@ -866,9 +865,8 @@ static inline int rhashtable_insert_fast(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhltable_insert_key(
 	struct rhltable *hlt, const void *key, struct rhlist_head *list,
@@ -890,9 +888,8 @@ static inline int rhltable_insert_key(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhltable_insert(
 	struct rhltable *hlt, struct rhlist_head *list,
@@ -922,9 +919,8 @@ static inline int rhltable_insert(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhashtable_lookup_insert_fast(
 	struct rhashtable *ht, struct rhash_head *obj,
@@ -981,9 +977,8 @@ static inline void *rhashtable_lookup_get_insert_fast(
  *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  *
  * Returns zero on success.
  */
@@ -1134,8 +1129,8 @@ static inline int __rhashtable_remove_fast(
  * walk the bucket chain upon removal. The removal operation is thus
  * considerable slow if the hash table is not correctly sized.
  *
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
  *
  * Returns zero on success, -ENOENT if the entry could not be found.
  */
@@ -1156,8 +1151,8 @@ static inline int rhashtable_remove_fast(
  * walk the bucket chain upon removal. The removal operation is thus
  * considerable slow if the hash table is not correctly sized.
  *
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%
  *
  * Returns zero on success, -ENOENT if the entry could not be found.
  */

^ permalink raw reply related

* [PATCH 2/4] rhashtable: Revise incorrect comment on r{hl, hash}table_walk_enter()
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>

Neither rhashtable_walk_enter() or rhltable_walk_enter() sleep, though
they do take a spinlock without irq protection.
So revise the comments to accurately state the contexts in which
these functions can be called.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    5 +++--
 lib/rhashtable.c           |    5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 87d443a5b11d..4e1f535c2034 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1268,8 +1268,9 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2b2b79974b61..6d490f51174e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -668,8 +668,9 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */

^ permalink raw reply related

* [PATCH 3/4] rhashtable: reset iter when rhashtable_walk_start sees new table
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>

The documentation claims that when rhashtable_walk_start_check()
detects a resize event, it will rewind back to the beginning
of the table.  This is not true.  We need to set ->slot and
->skip to be zero for it to be true.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6d490f51174e..81edf1ab38ab 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -737,6 +737,8 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (!iter->walker.tbl && !iter->end_of_table) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+		iter->slot = 0;
+		iter->skip = 0;
 		return -EAGAIN;
 	}
 

^ permalink raw reply related

* [PATCH 4/4] rhashtable: improve rhashtable_walk stability when stop/start used.
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>

When a walk of an rhashtable is interrupted with rhastable_walk_stop()
and then rhashtable_walk_start(), the location to restart from is based
on a 'skip' count in the current hash chain, and this can be incorrect
if insertions or deletions have happened.  This does not happen when
the walk is not stopped and started as iter->p is a placeholder which
is safe to use while holding the RCU read lock.

In rhashtable_walk_start() we can revalidate that 'p' is still in the
same hash chain.  If it isn't then the current method is still used.

With this patch, if a rhashtable walker ensures that the current
object remains in the table over a stop/start period (possibly by
elevating the reference count if that is sufficient), it can be sure
that a walk will not miss objects that were in the hashtable for the
whole time of the walk.

rhashtable_walk_start() may not find the object even though it is
still in the hashtable if a rehash has moved it to a new table.  In
this case it will (eventually) get -EAGAIN and will need to proceed
through the whole table again to be sure to see everything at least
once.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 81edf1ab38ab..9427b5766134 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -727,6 +727,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 	__acquires(RCU)
 {
 	struct rhashtable *ht = iter->ht;
+	bool rhlist = ht->rhlist;
 
 	rcu_read_lock();
 
@@ -735,13 +736,52 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 		list_del(&iter->walker.list);
 	spin_unlock(&ht->lock);
 
-	if (!iter->walker.tbl && !iter->end_of_table) {
+	if (iter->end_of_table)
+		return 0;
+	if (!iter->walker.tbl) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 		iter->slot = 0;
 		iter->skip = 0;
 		return -EAGAIN;
 	}
 
+	if (iter->p && !rhlist) {
+		/*
+		 * We need to validate that 'p' is still in the table, and
+		 * if so, update 'skip'
+		 */
+		struct rhash_head *p;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			skip++;
+			if (p == iter->p) {
+				iter->skip = skip;
+				goto found;
+			}
+		}
+		iter->p = NULL;
+	} else if (iter->p && rhlist) {
+		/* Need to validate that 'list' is still in the table, and
+		 * if so, update 'skip' and 'p'.
+		 */
+		struct rhash_head *p;
+		struct rhlist_head *list;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			for (list = container_of(p, struct rhlist_head, rhead);
+			     list;
+			     list = rcu_dereference(list->next)) {
+				skip++;
+				if (list == iter->list) {
+					iter->p = p;
+					skip = skip;
+					goto found;
+				}
+			}
+		}
+		iter->p = NULL;
+	}
+found:
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
@@ -917,8 +957,6 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 		iter->walker.tbl = NULL;
 	spin_unlock(&ht->lock);
 
-	iter->p = NULL;
-
 out:
 	rcu_read_unlock();
 }

^ permalink raw reply related

* Re: [PATCH] selftests: bpf: update .gitignore with missing file
From: Daniel Borkmann @ 2018-04-23 22:31 UTC (permalink / raw)
  To: Anders Roxell
  Cc: ast, Shuah Khan, netdev, Linux Kernel Mailing List,
	linux-kselftest
In-Reply-To: <CADYN=9KAorrGBh+2_LZ+6cxaf4yU-pOmU4BSzkMKT8sdU7hhUw@mail.gmail.com>

On 04/24/2018 12:14 AM, Anders Roxell wrote:
> On 23 April 2018 at 23:34, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 04/23/2018 03:50 PM, Anders Roxell wrote:
>>> Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>>> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
>>> ---
>>>  tools/testing/selftests/bpf/.gitignore | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
>>> index 5e1ab2f0eb79..3e3b3ced3f7c 100644
>>> --- a/tools/testing/selftests/bpf/.gitignore
>>> +++ b/tools/testing/selftests/bpf/.gitignore
>>> @@ -15,3 +15,4 @@ test_libbpf_open
>>>  test_sock
>>>  test_sock_addr
>>>  urandom_read
>>> +test_btf
>>
>> Against which tree is this? This doesn't apply to bpf-next. It would
>> apply against bpf tree, but c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>> is part of bpf-next, so fits to neither of them.
> 
> I'm sorry,
> 
> I did it against this patch [1] that I thought was already applied to
> the bpf tree.

That was bpf tree since the original change already went to mainline; the
BTF is in bpf-next however, so you need to rebase your change against that.

Thanks,
Daniel

^ permalink raw reply

* Re: dev_loopback_xmit parameters
From: Eric Dumazet @ 2018-04-23 22:36 UTC (permalink / raw)
  To: Emanuele, Linux Kernel Network Developers
In-Reply-To: <eb1bf67b-7f38-a3b5-4974-6ee1d198887f@gmail.com>



On 04/23/2018 02:40 PM, Emanuele wrote:
> Hello,
> 
> I don't know if this is the right place where to ask, but I was wondering why the dev_loopback_xmit function defined in /net/core/dev.c takes struct net * and struct sock *  as parameters. They are never used, so I believe passing only the struct  sk_buff * should be enough.
> 

Look at net/ipv6/ip6_output.c where NF_HOOK() uses dev_loopback_xmit().

> In addition, it would like to know where I can read what is and how to set a skb dst_entry, since I don't really understand it.
> 
> Thanks a lot,
> 
> Emanuele
> 

^ permalink raw reply

* [bpf PATCH v3 0/3] BPF, a couple sockmap fixes
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev

While testing sockmap with more programs (besides our test programs)
I found a couple issues.

The attached series fixes an issue where pinned maps were not
working correctly, blocking sockets returned zero, and an error
path that when the sock hit an out of memory case resulted in a
double page_put() while doing ingress redirects.

See individual patches for more details.

v2: Incorporated Daniel's feedback to use map ops for uref put op
    which also fixed the build error discovered in v1.
v3: rename map_put_uref to map_release_uref

---

John Fastabend (3):
      bpf: sockmap, map_release does not hold refcnt for pinned maps
      bpf: sockmap, sk_wait_event needed to handle blocking cases
      bpf: sockmap, fix double page_put on ENOMEM error in redirect path


 0 files changed

^ permalink raw reply

* [bpf PATCH v3 1/3] bpf: sockmap, map_release does not hold refcnt for pinned maps
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>

Relying on map_release hook to decrement the reference counts when a
map is removed only works if the map is not being pinned. In the
pinned case the ref is decremented immediately and the BPF programs
released. After this BPF programs may not be in-use which is not
what the user would expect.

This patch moves the release logic into bpf_map_put_uref() and brings
sockmap in-line with how a similar case is handled in prog array maps.

Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 0 files changed

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ac4340..a5782d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -33,6 +33,7 @@ struct bpf_map_ops {
 	void (*map_release)(struct bpf_map *map, struct file *map_file);
 	void (*map_free)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+	void (*map_release_uref)(struct bpf_map *map);
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -448,7 +449,6 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 				 void *key, void *value, u64 map_flags);
 int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
-void bpf_fd_array_map_clear(struct bpf_map *map);
 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 02a1893..0fd8d8f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -526,7 +526,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -545,6 +545,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+	.map_release_uref = bpf_fd_array_map_clear,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a3b2138..a73d484 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1831,7 +1831,7 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_prog *orig;
@@ -1855,7 +1855,7 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file)
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
-	.map_release = sock_map_release,
+	.map_release_uref = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a..a22c26e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -260,8 +260,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
 static void bpf_map_put_uref(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->usercnt)) {
-		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
-			bpf_fd_array_map_clear(map);
+		if (map->ops->map_release_uref)
+			map->ops->map_release_uref(map);
 	}
 }
 

^ permalink raw reply related

* [bpf PATCH v3 2/3] bpf: sockmap, sk_wait_event needed to handle blocking cases
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>

In the recvmsg handler we need to add a wait event to support the
blocking use cases. Without this we return zero and may confuse
user applications. In the wait event any data received on the
sk either via sk_receive_queue or the psock ingress list will
wake up the sock.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a73d484..aaf50ec 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
 #include <net/tcp.h>
 #include <linux/ptr_ring.h>
 #include <net/inet_common.h>
+#include <linux/sched/signal.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -732,6 +733,26 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
 	return err;
 }
 
+static int bpf_wait_data(struct sock *sk,
+			 struct smap_psock *psk, int flags,
+			 long timeo, int *err)
+{
+	int rc;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psk->ingress) ||
+			   !skb_queue_empty(&sk->sk_receive_queue),
+			   &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return rc;
+}
+
 static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -755,6 +776,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
 	lock_sock(sk);
+bytes_ready:
 	while (copied != len) {
 		struct scatterlist *sg;
 		struct sk_msg_buff *md;
@@ -809,6 +831,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}
 
+	if (!copied) {
+		long timeo;
+		int data;
+		int err = 0;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+		if (data) {
+			if (!skb_queue_empty(&sk->sk_receive_queue)) {
+				release_sock(sk);
+				smap_release_sock(psock, sk);
+				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+				return copied;
+			}
+			goto bytes_ready;
+		}
+
+		if (err)
+			copied = err;
+	}
+
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied;

^ permalink raw reply related

* [bpf PATCH v3 3/3] bpf: sockmap, fix double page_put on ENOMEM error in redirect path
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>

In the case where the socket memory boundary is hit the redirect
path returns an ENOMEM error. However, before checking for this
condition the redirect scatterlist buffer is setup with a valid
page and length. This is never unwound so when the buffers are
released latter in the error path we do a put_page() and clear
the scatterlist fields. But, because the initial error happens
before completing the scatterlist buffer we end up with both the
original buffer and the redirect buffer pointing to the same page
resulting in duplicate put_page() calls.

To fix this simply move the initial configuration of the redirect
scatterlist buffer below the sock memory check.

Found this while running TCP_STREAM test with netperf using Cilium.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index aaf50ec..634415c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -524,8 +524,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 	i = md->sg_start;
 
 	do {
-		r->sg_data[i] = md->sg_data[i];
-
 		size = (apply && apply_bytes < md->sg_data[i].length) ?
 			apply_bytes : md->sg_data[i].length;
 
@@ -536,6 +534,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		}
 
 		sk_mem_charge(sk, size);
+		r->sg_data[i] = md->sg_data[i];
 		r->sg_data[i].length = size;
 		md->sg_data[i].length -= size;
 		md->sg_data[i].offset += size;

^ permalink raw reply related

* [PATCH net v2] net: ethtool: Add missing kernel doc for FEC parameters
From: Florian Fainelli @ 2018-04-23 22:51 UTC (permalink / raw)
  To: netdev; +Cc: davem, vidya.chowdary, dustin, roopa, Florian Fainelli

While adding support for ethtool::get_fecparam and set_fecparam, kernel
doc for these functions was missed, add those.

Fixes: 1a5f3da20bd9 ("net: ethtool: add support for forward error correction modes")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Changes in v2:

- corrected set_fecparam in commit message

 include/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index ebe41811ed34..b32cd2062f18 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -310,6 +310,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
  *	instead of the latter), any change to them will be overwritten
  *	by kernel. Returns a negative error code or zero.
+ * @get_fecparam: Get the network device Forward Error Correction parameters.
+ * @set_fecparam: Set the network device Forward Error Correction parameters.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
-- 
2.14.1

^ permalink raw reply related

* Re: [bpf PATCH v3 0/3] BPF, a couple sockmap fixes
From: Daniel Borkmann @ 2018-04-23 22:52 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>

On 04/24/2018 12:39 AM, John Fastabend wrote:
> While testing sockmap with more programs (besides our test programs)
> I found a couple issues.
> 
> The attached series fixes an issue where pinned maps were not
> working correctly, blocking sockets returned zero, and an error
> path that when the sock hit an out of memory case resulted in a
> double page_put() while doing ingress redirects.
> 
> See individual patches for more details.
> 
> v2: Incorporated Daniel's feedback to use map ops for uref put op
>     which also fixed the build error discovered in v1.
> v3: rename map_put_uref to map_release_uref

Applied to bpf tree, thanks John!

^ permalink raw reply

* [PATCH v2] selftests: bpf: update .gitignore with missing file
From: Anders Roxell @ 2018-04-23 22:53 UTC (permalink / raw)
  To: ast, daniel, shuah; +Cc: netdev, linux-kernel, linux-kselftest, Anders Roxell
In-Reply-To: <065a8bc2-6065-3d43-8b03-35ed693de7ce@iogearbox.net>

Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---
Rebased against bpf-next.

 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 9cf83f895d98..da19f0562bf8 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -12,3 +12,4 @@ test_tcpbpf_user
 test_verifier_log
 feature
 test_libbpf_open
+test_btf
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Willem de Bruijn @ 2018-04-23 23:04 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Karlsson, Magnus, Alexander Duyck, Alexander Duyck,
	John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
	Daniel Borkmann, Michael S. Tsirkin, Network Development,
	Björn Töpel, michael.lundkvist, Brandeburg, Jesse,
	Singhai, Anjali, Zhang, Qi Z
In-Reply-To: <20180423135619.7179-3-bjorn.topel@gmail.com>

On Mon, Apr 23, 2018 at 9:56 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> In this commit the base structure of the AF_XDP address family is set
> up. Further, we introduce the abilty register a window of user memory
> to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory
> window is viewed by an AF_XDP socket as a set of equally large
> frames. After a user memory registration all frames are "owned" by the
> user application, and not the kernel.
>
> Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

> +static void xdp_umem_release(struct xdp_umem *umem)
> +{
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +       unsigned long diff;
> +
> +       if (umem->pgs) {
> +               xdp_umem_unpin_pages(umem);
> +
> +               task = get_pid_task(umem->pid, PIDTYPE_PID);
> +               put_pid(umem->pid);
> +               if (!task)
> +                       goto out;
> +               mm = get_task_mm(task);
> +               put_task_struct(task);
> +               if (!mm)
> +                       goto out;
> +
> +               diff = umem->size >> PAGE_SHIFT;

Need to round up or size must always be a multiple of PAGE_SIZE.

> +
> +               down_write(&mm->mmap_sem);
> +               mm->pinned_vm -= diff;
> +               up_write(&mm->mmap_sem);

When using user->locked_vm for resource limit checks, no need
to also update mm->pinned_vm?

> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> +{
> +       u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
> +       u64 addr = mr->addr, size = mr->len;
> +       unsigned int nframes;
> +       int size_chk, err;
> +
> +       if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
> +               /* Strictly speaking we could support this, if:
> +                * - huge pages, or*
> +                * - using an IOMMU, or
> +                * - making sure the memory area is consecutive
> +                * but for now, we simply say "computer says no".
> +                */
> +               return -EINVAL;
> +       }

Ideally, AF_XDP subsumes all packet socket use cases. It does not
have packet v3's small packet optimizations of variable sized frames
and block signaling.

I don't suggest adding that now. But for the non-zerocopy case, it may
make sense to ensure that nothing is blocking a later addition of these
features. Especially for header-only (snaplen) workloads. So far, I don't
see any issues.

> +       if (!is_power_of_2(frame_size))
> +               return -EINVAL;
> +
> +       if (!PAGE_ALIGNED(addr)) {
> +               /* Memory area has to be page size aligned. For
> +                * simplicity, this might change.
> +                */
> +               return -EINVAL;
> +       }
> +
> +       if ((addr + size) < addr)
> +               return -EINVAL;
> +
> +       nframes = size / frame_size;
> +       if (nframes == 0 || nframes > UINT_MAX)
> +               return -EINVAL;

You may also want a check here that nframes * frame_size is at least
PAGE_SIZE and probably a multiple of that.

> +       frame_headroom = ALIGN(frame_headroom, 64);
> +
> +       size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
> +       if (size_chk < 0)
> +               return -EINVAL;
> +
> +       umem->pid = get_task_pid(current, PIDTYPE_PID);
> +       umem->size = (size_t)size;
> +       umem->address = (unsigned long)addr;
> +       umem->props.frame_size = frame_size;
> +       umem->props.nframes = nframes;
> +       umem->frame_headroom = frame_headroom;
> +       umem->npgs = size / PAGE_SIZE;
> +       umem->pgs = NULL;
> +       umem->user = NULL;
> +
> +       umem->frame_size_log2 = ilog2(frame_size);
> +       umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1;
> +       umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
> +       atomic_set(&umem->users, 1);
> +
> +       err = xdp_umem_account_pages(umem);
> +       if (err)
> +               goto out;
> +
> +       err = xdp_umem_pin_pages(umem);
> +       if (err)

need to call xdp_umem_unaccount_pages on error
> +               goto out;
> +       return 0;
> +
> +out:
> +       put_pid(umem->pid);
> +       return err;
> +}

^ permalink raw reply

* Re: dev_loopback_xmit parameters
From: Emanuele @ 2018-04-23 23:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <c91f52c8-c7de-e555-a184-b62d39665b9f@gmail.com>

Ok, clear now.

Even though I don't understand what to set to avoid triggering the
WARN_ON(!skb_dst(skb));
inside dev_loopback_xmit.
I just would like to send the skb in loopback, i.e. moving the packet 
from the sending to the receiving queue of a certain struct net_device.


On 24/04/2018 00:36, Eric Dumazet wrote:
>
> On 04/23/2018 02:40 PM, Emanuele wrote:
>> Hello,
>>
>> I don't know if this is the right place where to ask, but I was wondering why the dev_loopback_xmit function defined in /net/core/dev.c takes struct net * and struct sock *  as parameters. They are never used, so I believe passing only the struct  sk_buff * should be enough.
>>
> Look at net/ipv6/ip6_output.c where NF_HOOK() uses dev_loopback_xmit().
>
>> In addition, it would like to know where I can read what is and how to set a skb dst_entry, since I don't really understand it.
>>
>> Thanks a lot,
>>
>> Emanuele
>>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox