Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next 03/15] xsk: add umem fill queue support and mmap
From: Björn Töpel @ 2018-04-23 13:56 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, alexander.h.duyck, alexander.duyck,
	john.fastabend, ast, brouer, willemdebruijn.kernel, daniel, mst,
	netdev
  Cc: michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang
In-Reply-To: <20180423135619.7179-1-bjorn.topel@gmail.com>

From: Magnus Karlsson <magnus.karlsson@intel.com>

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_FILL_RING. Using this socket option, the process can
ask the kernel to allocate a queue (ring buffer) and also mmap it
(XDP_UMEM_PGOFF_FILL_RING) into the process.

The queue is used to explicitly pass ownership of umem frames from the
user process to the kernel. These frames will in a later patch be
filled in with Rx packet data by the kernel.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 include/uapi/linux/if_xdp.h | 15 +++++++++++
 net/xdp/Makefile            |  2 +-
 net/xdp/xdp_umem.c          |  5 ++++
 net/xdp/xdp_umem.h          |  2 ++
 net/xdp/xsk.c               | 62 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         | 58 ++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 38 +++++++++++++++++++++++++++
 7 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 net/xdp/xsk_queue.c
 create mode 100644 net/xdp/xsk_queue.h

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 41252135a0fe..975661e1baca 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -23,6 +23,7 @@
 
 /* XDP socket options */
 #define XDP_UMEM_REG			3
+#define XDP_UMEM_FILL_RING		4
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -31,4 +32,18 @@ struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+/* Pgoff for mmaping the rings */
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000
+
+struct xdp_ring {
+	__u32 producer __attribute__((aligned(64)));
+	__u32 consumer __attribute__((aligned(64)));
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	__u32 desc[0] __attribute__((aligned(64)));
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index a5d736640a0f..074fb2b2d51c 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index bff058f5a769..6fc233e03f30 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -62,6 +62,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct mm_struct *mm;
 	unsigned long diff;
 
+	if (umem->fq) {
+		xskq_destroy(umem->fq);
+		umem->fq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 58714f4f7f25..3086091aebdd 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -18,9 +18,11 @@
 #include <linux/mm.h>
 #include <linux/if_xdp.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem_props.h"
 
 struct xdp_umem {
+	struct xsk_queue *fq;
 	struct page **pgs;
 	struct xdp_umem_props props;
 	u32 npgs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 19fc719cbe0d..bf6a1151df28 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <net/sock.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem.h"
 
 struct xdp_sock {
@@ -47,6 +48,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+{
+	struct xsk_queue *q;
+
+	if (entries == 0 || *queue || !is_power_of_2(entries))
+		return -EINVAL;
+
+	q = xskq_create(entries);
+	if (!q)
+		return -ENOMEM;
+
+	*queue = q;
+	return 0;
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -109,6 +125,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return 0;
 	}
+	case XDP_UMEM_FILL_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (!xs->umem)
+			return -EINVAL;
+
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->umem->fq;
+		err = xsk_init_queue(entries, q);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	default:
 		break;
 	}
@@ -116,6 +149,33 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct xsk_queue *q;
+	unsigned long pfn;
+	struct page *qpg;
+
+	if (!xs->umem)
+		return -EINVAL;
+
+	if (offset == XDP_UMEM_PGOFF_FILL_RING)
+		q = xs->umem->fq;
+	else
+		return -EINVAL;
+
+	qpg = virt_to_head_page(q->ring);
+	if (size > (PAGE_SIZE << compound_order(qpg)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn,
+			       size, vma->vm_page_prot);
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -139,7 +199,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	sock_no_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		xsk_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..23da4f29d3fb
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+}
+
+struct xsk_queue *xskq_create(u32 nentries)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP  | __GFP_NORETRY;
+	size = xskq_umem_get_ring_size(q);
+
+	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+						      get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	page_frag_free(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..7eb556bf73be
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+struct xsk_queue {
+	struct xdp_umem_props umem_props;
+	u32 ring_mask;
+	u32 nentries;
+	u32 prod_head;
+	u32 prod_tail;
+	u32 cons_head;
+	u32 cons_tail;
+	struct xdp_ring *ring;
+	u64 invalid_descs;
+};
+
+struct xsk_queue *xskq_create(u32 nentries);
+void xskq_destroy(struct xsk_queue *q);
+
+#endif /* _LINUX_XSK_QUEUE_H */
-- 
2.14.1

^ permalink raw reply related

* [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Björn Töpel @ 2018-04-23 13:56 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, alexander.h.duyck, alexander.duyck,
	john.fastabend, ast, brouer, willemdebruijn.kernel, daniel, mst,
	netdev
  Cc: Björn Töpel, michael.lundkvist, jesse.brandeburg,
	anjali.singhai, qi.z.zhang
In-Reply-To: <20180423135619.7179-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

In this commit the base structure of the AF_XDP address family is set
up. Further, we introduce the ability to register a window of user
memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The
memory window is viewed by an AF_XDP socket as a set of equally large
frames. After a user memory registration all frames are "owned" by the
user application, and not the kernel.

Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/uapi/linux/if_xdp.h |  34 +++++++
 net/Makefile                |   1 +
 net/xdp/Makefile            |   2 +
 net/xdp/xdp_umem.c          | 237 ++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  42 ++++++++
 net/xdp/xdp_umem_props.h    |  23 +++++
 net/xdp/xsk.c               | 223 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 562 insertions(+)
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xdp_umem.c
 create mode 100644 net/xdp/xdp_umem.h
 create mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 net/xdp/xsk.c

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index 000000000000..41252135a0fe
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+ *
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* XDP socket options */
+#define XDP_UMEM_REG			3
+
+struct xdp_umem_reg {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 frame_size; /* Frame size */
+	__u32 frame_headroom; /* Frame head room */
+};
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..77aaddedbd29 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -85,3 +85,4 @@ obj-y				+= l3mdev/
 endif
 obj-$(CONFIG_QRTR)		+= qrtr/
 obj-$(CONFIG_NET_NCSI)		+= ncsi/
+obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..a5d736640a0f
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..bff058f5a769
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+
+#include "xdp_umem.h"
+
+#define XDP_UMEM_MIN_FRAME_SIZE 2048
+
+int xdp_umem_create(struct xdp_umem **umem)
+{
+	*umem = kzalloc(sizeof(**umem), GFP_KERNEL);
+
+	if (!(*umem))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+	unsigned int i;
+
+	if (umem->pgs) {
+		for (i = 0; i < umem->npgs; i++)
+			put_page(umem->pgs[i]);
+
+		kfree(umem->pgs);
+		umem->pgs = NULL;
+	}
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+	if (umem->user) {
+		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+		free_uid(umem->user);
+	}
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned long diff;
+
+	if (umem->pgs) {
+		xdp_umem_unpin_pages(umem);
+
+		task = get_pid_task(umem->pid, PIDTYPE_PID);
+		put_pid(umem->pid);
+		if (!task)
+			goto out;
+		mm = get_task_mm(task);
+		put_task_struct(task);
+		if (!mm)
+			goto out;
+
+		diff = umem->size >> PAGE_SHIFT;
+
+		down_write(&mm->mmap_sem);
+		mm->pinned_vm -= diff;
+		up_write(&mm->mmap_sem);
+		mmput(mm);
+		umem->pgs = NULL;
+	}
+
+	xdp_umem_unaccount_pages(umem);
+out:
+	kfree(umem);
+}
+
+void xdp_put_umem(struct xdp_umem *umem)
+{
+	if (!umem)
+		return;
+
+	if (atomic_dec_and_test(&umem->users))
+		xdp_umem_release(umem);
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			umem->npgs = npgs;
+			err = -ENOMEM;
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	xdp_umem_unpin_pages(umem);
+out_pgs:
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+	return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+	unsigned long lock_limit, new_npgs, old_npgs;
+
+	if (capable(CAP_IPC_LOCK))
+		return 0;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	umem->user = get_uid(current_user());
+
+	do {
+		old_npgs = atomic_long_read(&umem->user->locked_vm);
+		new_npgs = old_npgs + umem->npgs;
+		if (new_npgs > lock_limit) {
+			free_uid(umem->user);
+			umem->user = NULL;
+			return -ENOBUFS;
+		}
+	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+				     new_npgs) != old_npgs);
+	return 0;
+}
+
+static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u64 addr = mr->addr, size = mr->len;
+	unsigned int nframes;
+	int size_chk, err;
+
+	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or*
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(frame_size))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return -EINVAL;
+	}
+
+	if ((addr + size) < addr)
+		return -EINVAL;
+
+	nframes = size / frame_size;
+	if (nframes == 0 || nframes > UINT_MAX)
+		return -EINVAL;
+
+	frame_headroom = ALIGN(frame_headroom, 64);
+
+	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	if (size_chk < 0)
+		return -EINVAL;
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = (size_t)size;
+	umem->address = (unsigned long)addr;
+	umem->props.frame_size = frame_size;
+	umem->props.nframes = nframes;
+	umem->frame_headroom = frame_headroom;
+	umem->npgs = size / PAGE_SIZE;
+	umem->pgs = NULL;
+	umem->user = NULL;
+
+	umem->frame_size_log2 = ilog2(frame_size);
+	umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1;
+	umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
+	atomic_set(&umem->users, 1);
+
+	err = xdp_umem_account_pages(umem);
+	if (err)
+		goto out;
+
+	err = xdp_umem_pin_pages(umem);
+	if (err)
+		goto out;
+	return 0;
+
+out:
+	put_pid(umem->pid);
+	return err;
+}
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	int err;
+
+	if (!umem)
+		return -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+
+	err = __xdp_umem_reg(umem, mr);
+
+	up_write(&current->mm->mmap_sem);
+	return err;
+}
+
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..58714f4f7f25
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <linux/mm.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+struct xdp_umem {
+	struct page **pgs;
+	struct xdp_umem_props props;
+	u32 npgs;
+	u32 frame_headroom;
+	u32 nfpp_mask;
+	u32 nfpplog2;
+	u32 frame_size_log2;
+	struct user_struct *user;
+	struct pid *pid;
+	unsigned long address;
+	size_t size;
+	atomic_t users;
+};
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
+void xdp_put_umem(struct xdp_umem *umem);
+int xdp_umem_create(struct xdp_umem **umem);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..77fb5daf29f3
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_PROPS_H_
+#define XDP_UMEM_PROPS_H_
+
+struct xdp_umem_props {
+	u32 frame_size;
+	u32 nframes;
+};
+
+#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..19fc719cbe0d
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+
+#include "xdp_umem.h"
+
+struct xdp_sock {
+	/* struct sock must be the first member of struct xdp_sock */
+	struct sock sk;
+	struct xdp_umem *umem;
+	/* Protects multiple processes in the control path */
+	struct mutex mutex;
+};
+
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return (struct xdp_sock *)sk;
+}
+
+static int xsk_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net;
+
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+	local_bh_enable();
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	sk_refcnt_debug_release(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_UMEM_REG:
+	{
+		struct xdp_umem_reg mr;
+		struct xdp_umem *umem;
+
+		if (xs->umem)
+			return -EBUSY;
+
+		if (copy_from_user(&mr, optval, sizeof(mr)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		err = xdp_umem_create(&umem);
+
+		err = xdp_umem_reg(umem, &mr);
+		if (err) {
+			kfree(umem);
+			mutex_unlock(&xs->mutex);
+			return err;
+		}
+
+		/* Make sure umem is ready before it can be seen by others */
+		smp_wmb();
+
+		xs->umem = umem;
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -ENOPROTOOPT;
+}
+
+static struct proto xsk_proto = {
+	.name =		"XDP",
+	.owner =	THIS_MODULE,
+	.obj_size =	sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+	.family =	PF_XDP,
+	.owner =	THIS_MODULE,
+	.release =	xsk_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		sock_no_poll,
+	.ioctl =	sock_no_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	xsk_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	sock_no_sendmsg,
+	.recvmsg =	sock_no_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	xdp_put_umem(xs->umem);
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	struct xdp_sock *xs;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (!sk)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+
+	sk->sk_destruct = xsk_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	xs = xdp_sk(sk);
+	mutex_init(&xs->mutex);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+	local_bh_enable();
+
+	return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+	.family = PF_XDP,
+	.create = xsk_create,
+	.owner	= THIS_MODULE,
+};
+
+static int __init xsk_init(void)
+{
+	int err;
+
+	err = proto_register(&xsk_proto, 0 /* no slab */);
+	if (err)
+		goto out;
+
+	err = sock_register(&xsk_family_ops);
+	if (err)
+		goto out_proto;
+
+	return 0;
+
+out_proto:
+	proto_unregister(&xsk_proto);
+out:
+	return err;
+}
+
+fs_initcall(xsk_init);
-- 
2.14.1

^ permalink raw reply related

* [PATCH bpf-next 01/15] net: initial AF_XDP skeleton
From: Björn Töpel @ 2018-04-23 13:56 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, alexander.h.duyck, alexander.duyck,
	john.fastabend, ast, brouer, willemdebruijn.kernel, daniel, mst,
	netdev
  Cc: Björn Töpel, michael.lundkvist, jesse.brandeburg,
	anjali.singhai, qi.z.zhang
In-Reply-To: <20180423135619.7179-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Buildable skeleton of AF_XDP without any functionality. Just what it
takes to register a new address family.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 MAINTAINERS                         |  8 ++++++++
 include/linux/socket.h              |  5 ++++-
 net/Kconfig                         |  1 +
 net/core/sock.c                     | 12 ++++++++----
 net/xdp/Kconfig                     |  7 +++++++
 security/selinux/hooks.c            |  4 +++-
 security/selinux/include/classmap.h |  4 +++-
 7 files changed, 34 insertions(+), 7 deletions(-)
 create mode 100644 net/xdp/Kconfig

diff --git a/MAINTAINERS b/MAINTAINERS
index fc812fb5857a..ff93d024e6c3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15405,6 +15405,14 @@ T:	git git://linuxtv.org/media_tree.git
 S:	Maintained
 F:	drivers/media/tuners/tuner-xc2028.*
 
+XDP SOCKETS (AF_XDP)
+M:	Björn Töpel <bjorn.topel@intel.com>
+M:	Magnus Karlsson <magnus.karlsson@intel.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	kernel/bpf/xskmap.c
+F:	net/xdp/
+
 XEN BLOCK SUBSYSTEM
 M:	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 M:	Roger Pau Monné <roger.pau@citrix.com>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index ea50f4a65816..7ed4713d5337 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -207,8 +207,9 @@ struct ucred {
 				 * PF_SMC protocol family that
 				 * reuses AF_INET address family
 				 */
+#define AF_XDP		44	/* XDP sockets			*/
 
-#define AF_MAX		44	/* For now.. */
+#define AF_MAX		45	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -257,6 +258,7 @@ struct ucred {
 #define PF_KCM		AF_KCM
 #define PF_QIPCRTR	AF_QIPCRTR
 #define PF_SMC		AF_SMC
+#define PF_XDP		AF_XDP
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -338,6 +340,7 @@ struct ucred {
 #define SOL_NFC		280
 #define SOL_KCM		281
 #define SOL_TLS		282
+#define SOL_XDP		283
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/net/Kconfig b/net/Kconfig
index 6fa1a4493b8c..86471a1c1ed4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
+source "net/xdp/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db169ca1..e7d8b6c955c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
-  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
+  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
+  x "AF_MAX"
 
 static const char *const af_family_key_strings[AF_MAX+1] = {
 	_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
-  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
+  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
+  "rlock-AF_MAX"
 };
 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
-  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
+  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
+  "wlock-AF_MAX"
 };
 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
-  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
+  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
+  "elock-AF_MAX"
 };
 
 /*
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
+config XDP_SOCKETS
+	bool "XDP sockets"
+	depends on BPF_SYSCALL
+	default n
+	help
+	  XDP sockets allows a channel between XDP programs and
+	  userspace applications.
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..5c508d26b367 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1471,7 +1471,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
 			return SECCLASS_QIPCRTR_SOCKET;
 		case PF_SMC:
 			return SECCLASS_SMC_SOCKET;
-#if PF_MAX > 44
+		case PF_XDP:
+			return SECCLASS_XDP_SOCKET;
+#if PF_MAX > 45
 #error New address family defined, please update this function.
 #endif
 		}
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 7f0372426494..bd5fe0d3204a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -240,9 +240,11 @@ struct security_class_mapping secclass_map[] = {
 	  { "manage_subnet", NULL } },
 	{ "bpf",
 	  {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
+	{ "xdp_socket",
+	  { COMMON_SOCK_PERMS, NULL } },
 	{ NULL }
   };
 
-#if PF_MAX > 44
+#if PF_MAX > 45
 #error New address family defined, please update secclass_map.
 #endif
-- 
2.14.1

^ permalink raw reply related

* [PATCH bpf-next 00/15] Introducing AF_XDP support
From: Björn Töpel @ 2018-04-23 13:56 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, alexander.h.duyck, alexander.duyck,
	john.fastabend, ast, brouer, willemdebruijn.kernel, daniel, mst,
	netdev
  Cc: Björn Töpel, michael.lundkvist, jesse.brandeburg,
	anjali.singhai, qi.z.zhang

From: Björn Töpel <bjorn.topel@intel.com>

This RFC introduces a new address family called AF_XDP that is
optimized for high performance packet processing and, in upcoming
patch sets, zero-copy semantics. In this v2 version, we have removed
all zero-copy related code in order to make it smaller, simpler and
hopefully more review friendly. This RFC only supports copy-mode for
the generic XDP path (XDP_SKB) for both RX and TX and copy-mode for RX
using the XDP_DRV path. Zero-copy support requires XDP and driver
changes that Jesper Dangaard Brouer is working on. Some of his work
has already been accepted. We will publish our zero-copy support for
RX and TX on top of his patch sets at a later point in time.

An AF_XDP socket (XSK) is created with the normal socket()
syscall. Associated with each XSK are two queues: the RX queue and the
TX queue. A socket can receive packets on the RX queue and it can send
packets on the TX queue. These queues are registered and sized with
the setsockopts XDP_RX_RING and XDP_TX_RING, respectively. It is
mandatory to have at least one of these queues for each socket. In
contrast to AF_PACKET V2/V3 these descriptor queues are separated from
packet buffers. An RX or TX descriptor points to a data buffer in a
memory area called a UMEM. RX and TX can share the same UMEM so that a
packet does not have to be copied between RX and TX. Moreover, if a
packet needs to be kept for a while due to a possible retransmit, the
descriptor that points to that packet can be changed to point to
another and reused right away. This again avoids copying data.

This new dedicated packet buffer area is called a UMEM. It consists of a
number of equally sized frames and each frame has a unique frame id. A
descriptor in one of the queues references a frame by referencing its
frame id. The user space allocates memory for this UMEM using whatever
means it feels is most appropriate (malloc, mmap, huge pages,
etc). This memory area is then registered with the kernel using the new
setsockopt XDP_UMEM_REG. The UMEM also has two queues: the FILL queue
and the COMPLETION queue. The fill queue is used by the application to
send down frame ids for the kernel to fill in with RX packet
data. References to these frames will then appear in the RX queue of
the XSK once they have been received. The completion queue, on the
other hand, contains frame ids that the kernel has transmitted
completely and can now be used again by user space, for either TX or
RX. Thus, the frame ids appearing in the completion queue are ids that
were previously transmitted using the TX queue. In summary, the RX and
FILL queues are used for the RX path and the TX and COMPLETION queues
are used for the TX path.

The socket is then finally bound with a bind() call to a device and a
specific queue id on that device, and it is not until bind is
completed that traffic starts to flow. Note that in this RFC, all
packet data is copied out to user-space.

A new feature in this RFC is that the UMEM can be shared between
processes, if desired. If a process wants to do this, it simply skips
the registration of the UMEM and its corresponding two queues, sets a
flag in the bind call and submits the XSK of the process it would like
to share UMEM with as well as its own newly created XSK socket. The
new process will then receive frame id references in its own RX queue
that point to this shared UMEM. Note that since the queue structures
are single-consumer / single-producer (for performance reasons), the
new process has to create its own socket with associated RX and TX
queues, since it cannot share this with the other process. This is
also the reason that there is only one set of FILL and COMPLETION
queues per UMEM. It is the responsibility of a single process to
handle the UMEM. If multiple-producer / multiple-consumer queues are
implemented in the future, this requirement could be relaxed.

How are packets then distributed between these two XSKs? We have
introduced a new BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in
full). The user-space application can place an XSK at an arbitrary
place in this map. The XDP program can then redirect a packet to a
specific index in this map and at this point XDP validates that the
XSK in that map was indeed bound to that device and queue number. If
not, the packet is dropped. If the map is empty at that index, the
packet is also dropped. This also means that it is currently mandatory
to have an XDP program loaded (and one XSK in the XSKMAP) to be able
to get any traffic to user space through the XSK.

AF_XDP can operate in two different modes: XDP_SKB and XDP_DRV. If the
driver does not have support for XDP, or XDP_SKB is explicitly chosen
when loading the XDP program, XDP_SKB mode is employed that uses SKBs
together with the generic XDP support and copies out the data to user
space. This is a fallback mode that works for any network device. On the other
hand, if the driver has support for XDP, it will be used by the AF_XDP
code to provide better performance, but there is still a copy of the
data into user space.

There is a xdpsock benchmarking/test application included that
demonstrates how to use AF_XDP sockets with both private and shared
UMEMs. Say that you would like your UDP traffic from port 4242 to end
up in queue 16, that we will enable AF_XDP on. Here, we use ethtool
for this:

      ethtool -N p3p2 rx-flow-hash udp4 fn
      ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
          action 16

Running the rxdrop benchmark in XDP_DRV mode can then be done
using:

      samples/bpf/xdpsock -i p3p2 -q 16 -r -N

For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
can be displayed with "-h", as usual.

We have run some benchmarks on a dual socket system with two Broadwell
E5 2660 @ 2.0 GHz with hyperthreading turned off. Each socket has 14
cores which gives a total of 28, but only two cores are used in these
experiments. One for TX/RX and one for the user space application. The
memory is DDR4 @ 2133 MT/s (1067 MHz) and the size of each DIMM is
8192MB and with 8 of those DIMMs in the system we have 64 GB of total
memory. The compiler used is gcc version 5.4.0 20160609. The NIC is an
Intel I40E 40Gbit/s using the i40e driver.

Below are the results in Mpps of the I40E NIC benchmark runs for 64
and 1500 byte packets, generated by commercial packet generator HW that is
generating packets at full 40 Gbit/s line rate.

AF_XDP performance 64 byte packets. Results from RFC V2 in parentheses.
Benchmark   XDP_SKB   XDP_DRV
rxdrop       2.9(3.0)   9.4(9.3)  
txpush       2.5(2.2)   NA*
l2fwd        1.9(1.7)   2.4(2.4) (TX using XDP_SKB in both cases)

AF_XDP performance 1500 byte packets:
Benchmark   XDP_SKB   XDP_DRV
rxdrop       2.1(2.2)   3.3(3.1)  
l2fwd        1.4(1.1)   1.8(1.7) (TX using XDP_SKB in both cases)

* NA since we have no support for TX using the XDP_DRV infrastructure
  in this RFC. This is for a future patch set since it involves
  changes to the XDP NDOs. Some of this has been upstreamed by Jesper
  Dangaard Brouer.

XDP performance on our system as a base line:

64 byte packets:
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      16      32,921,521  0

1500 byte packets:
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      16      3,289,491   0

Changes from RFC V2:

* Optimizations and simplifications to the ring structures inspired by
  ptr_ring.h 
* Renamed XDP_[RX|TX]_QUEUE to XDP_[RX|TX]_RING in the uapi to be
  consistent with AF_PACKET
* Support for only having an RX queue or a TX queue defined
* Some bug fixes and code cleanup

The structure of the patch set is as follows:

Patches 1-2: Basic socket and umem plumbing 
Patches 3-10: RX support together with the new XSKMAP
Patches 11-14: TX support
Patch 15: Sample application

We based this patch set on bpf-next commit fbcf93ebcaef ("bpf: btf:
Clean up btf.h in uapi")

Questions:

* How to deal with cache alignment for uapi when different
  architectures can have different cache line sizes? We have just
  aligned it to 64 bytes for now, which works for many popular
  architectures, but not all. Please advise.

To do:

* Optimize performance

* Kernel selftest

Post-series plan:

* Loadable kernel module support for AF_XDP would be nice. Unclear how to
  achieve this though since our XDP code depends on net/core.

* Support for AF_XDP sockets without an XDP program loaded. In this
  case all the traffic on a queue should go up to the user space socket.

* Daniel Borkmann's suggestion for a "copy to XDP socket, and return
  XDP_PASS" for a tcpdump-like functionality.

* And of course getting to zero-copy support in small increments. 

Thanks: Björn and Magnus

Björn Töpel (8):
  net: initial AF_XDP skeleton
  xsk: add user memory registration support sockopt
  xsk: add Rx queue setup and mmap support
  xdp: introduce xdp_return_buff API
  xsk: add Rx receive functions and poll support
  bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP
  xsk: wire up XDP_DRV side of AF_XDP
  xsk: wire up XDP_SKB side of AF_XDP

Magnus Karlsson (7):
  xsk: add umem fill queue support and mmap
  xsk: add support for bind for Rx
  xsk: add umem completion queue support and mmap
  xsk: add Tx queue setup and mmap support
  xsk: support for Tx
  xsk: statistics support
  samples/bpf: sample application for AF_XDP sockets

 MAINTAINERS                         |   8 +
 include/linux/bpf.h                 |  26 +
 include/linux/bpf_types.h           |   3 +
 include/linux/filter.h              |   2 +-
 include/linux/socket.h              |   5 +-
 include/net/xdp.h                   |   1 +
 include/net/xdp_sock.h              |  46 ++
 include/uapi/linux/bpf.h            |   1 +
 include/uapi/linux/if_xdp.h         |  87 ++++
 kernel/bpf/Makefile                 |   3 +
 kernel/bpf/verifier.c               |   8 +-
 kernel/bpf/xskmap.c                 | 286 +++++++++++
 net/Kconfig                         |   1 +
 net/Makefile                        |   1 +
 net/core/dev.c                      |  34 +-
 net/core/filter.c                   |  40 +-
 net/core/sock.c                     |  12 +-
 net/core/xdp.c                      |  15 +-
 net/xdp/Kconfig                     |   7 +
 net/xdp/Makefile                    |   2 +
 net/xdp/xdp_umem.c                  | 256 ++++++++++
 net/xdp/xdp_umem.h                  |  65 +++
 net/xdp/xdp_umem_props.h            |  23 +
 net/xdp/xsk.c                       | 704 +++++++++++++++++++++++++++
 net/xdp/xsk_queue.c                 |  73 +++
 net/xdp/xsk_queue.h                 | 245 ++++++++++
 samples/bpf/Makefile                |   4 +
 samples/bpf/xdpsock.h               |  11 +
 samples/bpf/xdpsock_kern.c          |  56 +++
 samples/bpf/xdpsock_user.c          | 947 ++++++++++++++++++++++++++++++++++++
 security/selinux/hooks.c            |   4 +-
 security/selinux/include/classmap.h |   4 +-
 32 files changed, 2945 insertions(+), 35 deletions(-)
 create mode 100644 include/net/xdp_sock.h
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 kernel/bpf/xskmap.c
 create mode 100644 net/xdp/Kconfig
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xdp_umem.c
 create mode 100644 net/xdp/xdp_umem.h
 create mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 net/xdp/xsk.c
 create mode 100644 net/xdp/xsk_queue.c
 create mode 100644 net/xdp/xsk_queue.h
 create mode 100644 samples/bpf/xdpsock.h
 create mode 100644 samples/bpf/xdpsock_kern.c
 create mode 100644 samples/bpf/xdpsock_user.c

-- 
2.14.1

^ permalink raw reply

* Re: [PATCH net] tcp: don't read out-of-bounds opsize
From: David Miller @ 2018-04-23 13:52 UTC (permalink / raw)
  To: jannh; +Cc: kuznet, yoshfuji, netdev, linux-kernel
In-Reply-To: <20180420135730.44921-1-jannh@google.com>

From: Jann Horn <jannh@google.com>
Date: Fri, 20 Apr 2018 15:57:30 +0200

> The old code reads the "opsize" variable from out-of-bounds memory (first
> byte behind the segment) if a broken TCP segment ends directly after an
> opcode that is neither EOL nor NOP.
> 
> The result of the read isn't used for anything, so the worst thing that
> could theoretically happen is a pagefault; and since the physmap is usually
> mostly contiguous, even that seems pretty unlikely.
> 
> The following C reproducer triggers the uninitialized read - however, you
> can't actually see anything happen unless you put something like a
> pr_warn() in tcp_parse_md5sig_option() to print the opsize.
 ...
> Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.")
> Signed-off-by: Jann Horn <jannh@google.com>

Applied and queued up for -stable, thank you.

^ permalink raw reply

* Re: [PATCH net-next] net: stmmac: Implement logic to automatically select HW Interface
From: Jose Abreu @ 2018-04-23 13:52 UTC (permalink / raw)
  To: David Miller, Jose.Abreu, Philippe Ombredanne
  Cc: netdev, Joao.Pinto, Vitor.Soares, peppe.cavallaro,
	alexandre.torgue
In-Reply-To: <20180422.205937.1092068986573510341.davem@davemloft.net>

Hi David,

On 23-04-2018 01:59, David Miller wrote:
> From: Jose Abreu <Jose.Abreu@synopsys.com>
> Date: Thu, 19 Apr 2018 16:24:15 +0100
>
>> @@ -0,0 +1,216 @@
>> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
>> +// Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
>> +// stmmac HW Interface Handling
> Please do not use C++ style comments for anything past the
> SPDX identifier line.
>
> Thank you.

I sent a new version but I was looking at some old patches I sent
for media subsystem [1] and I found an email where Philippe told
me to use this style.

Philippe, can you please clarify?

[1] https://patchwork.linuxtv.org/patch/45812/

Thanks and Best Regards,
Jose Miguel Abreu

^ permalink raw reply

* [PATCH] selftests: bpf: update .gitignore with missing file
From: Anders Roxell @ 2018-04-23 13:50 UTC (permalink / raw)
  To: ast, daniel, shuah; +Cc: netdev, linux-kernel, linux-kselftest, Anders Roxell

Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 5e1ab2f0eb79..3e3b3ced3f7c 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -15,3 +15,4 @@ test_libbpf_open
 test_sock
 test_sock_addr
 urandom_read
+test_btf
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next v2 0/2] openvswitch: Support conntrack zone limit
From: David Miller @ 2018-04-23 13:39 UTC (permalink / raw)
  To: yihung.wei; +Cc: netdev, pshelar
In-Reply-To: <1524011429-14500-1-git-send-email-yihung.wei@gmail.com>

From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Tue, 17 Apr 2018 17:30:27 -0700

> Currently, nf_conntrack_max is used to limit the maximum number of
> conntrack entries in the conntrack table for every network namespace.
> For the VMs and containers that reside in the same namespace,
> they share the same conntrack table, and the total # of conntrack entries
> for all the VMs and containers are limited by nf_conntrack_max.  In this
> case, if one of the VM/container abuses the usage the conntrack entries,
> it blocks the others from committing valid conntrack entries into the
> conntrack table.  Even if we can possibly put the VM in different network
> namespace, the current nf_conntrack_max configuration is kind of rigid
> that we cannot limit different VM/container to have different # conntrack
> entries.
> 
> To address the aforementioned issue, this patch proposes to have a
> fine-grained mechanism that could further limit the # of conntrack entries
> per-zone.  For example, we can designate different zone to different VM,
> and set conntrack limit to each zone.  By providing this isolation, a
> mis-behaved VM only consumes the conntrack entries in its own zone, and
> it will not influence other well-behaved VMs.  Moreover, the users can
> set various conntrack limit to different zone based on their preference.
> 
> The proposed implementation utilizes Netfilter's nf_conncount backend
> to count the number of connections in a particular zone.  If the number of
> connection is above a configured limitation, OVS will return ENOMEM to the
> userspace.  If userspace does not configure the zone limit, the limit
> defaults to zero that is no limitation, which is backward compatible to
> the behavior without this patch.
> 
> The first patch defines the conntrack limit netlink definition, and the
> second patch provides the implementation.

Pravin, I need this series reviewed.

Thank you.

^ permalink raw reply

* Re: WARNING: suspicious RCU usage in rt6_check_expired
From: Eric Dumazet @ 2018-04-23 13:31 UTC (permalink / raw)
  To: syzbot, davem, kuznet, linux-kernel, netdev, syzkaller-bugs,
	David Ahern
In-Reply-To: <0000000000009f23ab056a7fc243@google.com>



On 04/23/2018 01:24 AM, syzbot wrote:
> Hello,
> 
> syzbot hit the following crash on net-next commit
> 0638eb573cde5888c0886c7f35da604e5db209a6 (Sat Apr 21 20:06:14 2018 +0000)
> Merge branch 'ipv6-Another-followup-to-the-fib6_info-change'
> syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=2422c9e35796659d2273
> 
> So far this crash happened 3 times on net-next.
> Unfortunately, I don't have any reproducer for this crash yet.
> Raw console output: https://syzkaller.appspot.com/x/log.txt?id=6081013801287680
> Kernel config: https://syzkaller.appspot.com/x/.config?id=-8412024688694752032
> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+2422c9e35796659d2273@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for details.
> If you forward the report, please keep this part and the footer.
> 
> netlink: 'syz-executor4': attribute type 6 has an invalid length.
> netlink: 'syz-executor4': attribute type 1 has an invalid length.
> netlink: 'syz-executor4': attribute type 6 has an invalid length.
> 
> =============================
> WARNING: suspicious RCU usage
> 4.16.0+ #11 Not tainted
> -----------------------------
> net/ipv6/route.c:410 suspicious rcu_dereference_check() usage!
> 
> other info that might help us debug this:
> 
> 
> rcu_scheduler_active = 2, debug_locks = 1
> 1 lock held by syz-executor7/25958:
>  #0: 00000000d1963139 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1469 [inline]
>  #0: 00000000d1963139 (sk_lock-AF_INET6){+.+.}, at: sock_setsockopt+0x19c/0x1fe0 net/core/sock.c:717
> 
> stack backtrace:
> CPU: 1 PID: 25958 Comm: syz-executor7 Not tainted 4.16.0+ #11
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  lockdep_rcu_suspicious+0x14a/0x153 kernel/locking/lockdep.c:4592
>  rt6_check_expired+0x38b/0x3e0 net/ipv6/route.c:410
>  ip6_negative_advice+0x67/0xc0 net/ipv6/route.c:2204
>  dst_negative_advice include/net/sock.h:1786 [inline]
>  sock_setsockopt+0x138f/0x1fe0 net/core/sock.c:1051
>  __sys_setsockopt+0x2df/0x390 net/socket.c:1899
>  SYSC_setsockopt net/socket.c:1914 [inline]
>  SyS_setsockopt+0x34/0x50 net/socket.c:1911
>  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x455389
> RSP: 002b:00007f7556e30c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
> RAX: ffffffffffffffda RBX: 00007f7556e316d4 RCX: 0000000000455389
> RDX: 0000000000000035 RSI: 0000000000000001 RDI: 0000000000000013
> RBP: 000000000072bf58 R08: 0000000000000004 R09: 0000000000000000
> R10: 0000000020000000 R11: 0000000000000246 R12: 00000000ffffffff
> R13: 00000000000005ff R14: 00000000006fc088 R15: 0000000000000001
> netlink: 'syz-executor4': attribute type 4 has an invalid length.
> netlink: 'syz-executor4': attribute type 4 has an invalid length.
> IPVS: set_ctl: invalid protocol: 59 127.0.0.1:20000 lc
> IPVS: set_ctl: invalid protocol: 127 224.0.0.1:20000 rr
> IPVS: sync thread started: state = BACKUP, mcast_ifn = ip6tnl0, syncid = 4, id = 0
> IPVS: set_ctl: invalid protocol: 127 224.0.0.1:20000 rr
> netlink: 72 bytes leftover after parsing attributes in process `syz-executor2'.
> netlink: 72 bytes leftover after parsing attributes in process `syz-executor2'.
> dccp_xmit_packet: Payload too large (65423) for featneg.
> IPVS: set_ctl: invalid protocol: 29 1.0.0.0:2 wlc
> IPVS: set_ctl: invalid protocol: 29 1.0.0.0:2 wlc
> netlink: 32 bytes leftover after parsing attributes in process `syz-executor7'.
> netlink: 12 bytes leftover after parsing attributes in process `syz-executor1'.
> netlink: 12 bytes leftover after parsing attributes in process `syz-executor1'.
> netlink: 'syz-executor1': attribute type 29 has an invalid length.
> netlink: 8 bytes leftover after parsing attributes in process `syz-executor1'.
> IPVS: set_ctl: invalid protocol: 108 224.0.0.1:20004 lblc
> netlink: 'syz-executor1': attribute type 29 has an invalid length.
> netlink: 8 bytes leftover after parsing attributes in process `syz-executor1'.
> IPVS: set_ctl: invalid protocol: 108 224.0.0.1:20004 lblc
> 
> 
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkaller@googlegroups.com.
> 
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
> To mark this as a duplicate of another syzbot report, please reply with:
> #syz dup: exact-subject-of-another-report
> If it's a one-off invalid bug report, please reply with:
> #syz invalid
> Note: if the crash happens again, it will cause creation of a new bug report.
> Note: all commands must start from beginning of the line in the email body.

Added in commit a68886a691804d3f6d479ebf6825480fbafb6a00
("net/ipv6: Make from in rt6_info rcu protected")

^ permalink raw reply

* [PATCH 2/3] net/unix: hook unix_socketpair() into LSM
From: David Herrmann @ 2018-04-23 13:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: James Morris, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, serge, davem, netdev,
	David Herrmann
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

Use the newly created LSM-hook for unix_socketpair(). The default hook
return-value is 0, so behavior stays the same unless LSMs start using
this hook.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 net/unix/af_unix.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 68bb70a62afe..bc9705ace9b1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1371,6 +1371,11 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 static int unix_socketpair(struct socket *socka, struct socket *sockb)
 {
 	struct sock *ska = socka->sk, *skb = sockb->sk;
+	int err;
+
+	err = security_unix_stream_socketpair(ska, skb);
+	if (err)
+		return err;
 
 	/* Join our sockets back to back */
 	sock_hold(ska);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 3/3] selinux: provide unix_stream_socketpair callback
From: David Herrmann @ 2018-04-23 13:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: James Morris, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, serge, davem, netdev,
	David Herrmann
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

Make sure to implement the new unix_stream_socketpair callback so the
SO_PEERSEC call on socketpair(2)s will return correct information.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 security/selinux/hooks.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..828881d9a41d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4905,6 +4905,18 @@ static int selinux_socket_unix_stream_connect(struct sock *sock,
 	return 0;
 }
 
+static int selinux_socket_unix_stream_socketpair(struct sock *socka,
+						 struct sock *sockb)
+{
+	struct sk_security_struct *sksec_a = socka->sk_security;
+	struct sk_security_struct *sksec_b = sockb->sk_security;
+
+	sksec_a->peer_sid = sksec_b->sid;
+	sksec_b->peer_sid = sksec_a->sid;
+
+	return 0;
+}
+
 static int selinux_socket_unix_may_send(struct socket *sock,
 					struct socket *other)
 {
@@ -6995,6 +7007,8 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
 
 	LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
+	LSM_HOOK_INIT(unix_stream_socketpair,
+			selinux_socket_unix_stream_socketpair),
 	LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),
 
 	LSM_HOOK_INIT(socket_create, selinux_socket_create),
-- 
2.17.0

^ permalink raw reply related

* [PATCH 1/3] security: add hook for socketpair(AF_UNIX, ...)
From: David Herrmann @ 2018-04-23 13:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: James Morris, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, serge, davem, netdev,
	David Herrmann
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

Right now the LSM labels for socketpairs are always uninitialized,
since there is no security hook for the socketpair() syscall. This
patch adds the required hooks so LSMs can properly label socketpairs.
This allows SO_PEERSEC to return useful information on those sockets.

Note that the behavior of socketpair() can be emulated by creating a
listener socket, connecting to it, and then discarding the initial
listener socket. With this workaround, SO_PEERSEC would return the
caller's security context. However, with socketpair(), the uninitialized
context is returned unconditionally. This is unexpected and makes
socketpair() less useful in situations where the security context is
crucial to the application.

With the new socketpair-hook this disparity can be solved by making
socketpair() return the expected security context.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 include/linux/lsm_hooks.h | 8 ++++++++
 include/linux/security.h  | 7 +++++++
 security/security.c       | 6 ++++++
 3 files changed, 21 insertions(+)

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9d0b286f3dba..2a23c75c1541 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -717,6 +717,12 @@
  *	@other contains the peer sock structure.
  *	@newsk contains the new sock structure.
  *	Return 0 if permission is granted.
+ * @unix_stream_socketpair:
+ *	Check permissions before establishing a Unix domain stream connection
+ *	for a fresh pair of sockets.
+ *	@socka contains the first sock structure.
+ *	@sockb contains the second sock structure.
+ *	Return 0 if permission is granted and the connection was established.
  * @unix_may_send:
  *	Check permissions before connecting or sending datagrams from @sock to
  *	@other.
@@ -1651,6 +1657,7 @@ union security_list_options {
 #ifdef CONFIG_SECURITY_NETWORK
 	int (*unix_stream_connect)(struct sock *sock, struct sock *other,
 					struct sock *newsk);
+	int (*unix_stream_socketpair)(struct sock *socka, struct sock *sockb);
 	int (*unix_may_send)(struct socket *sock, struct socket *other);
 
 	int (*socket_create)(int family, int type, int protocol, int kern);
@@ -1919,6 +1926,7 @@ struct security_hook_heads {
 	struct hlist_head inode_getsecctx;
 #ifdef CONFIG_SECURITY_NETWORK
 	struct hlist_head unix_stream_connect;
+	struct hlist_head unix_stream_socketpair;
 	struct hlist_head unix_may_send;
 	struct hlist_head socket_create;
 	struct hlist_head socket_post_create;
diff --git a/include/linux/security.h b/include/linux/security.h
index 200920f521a1..be275deeda10 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1187,6 +1187,7 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32
 #ifdef CONFIG_SECURITY_NETWORK
 
 int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk);
+int security_unix_stream_socketpair(struct sock *socka, struct sock *sockb);
 int security_unix_may_send(struct socket *sock,  struct socket *other);
 int security_socket_create(int family, int type, int protocol, int kern);
 int security_socket_post_create(struct socket *sock, int family,
@@ -1242,6 +1243,12 @@ static inline int security_unix_stream_connect(struct sock *sock,
 	return 0;
 }
 
+static inline int security_unix_stream_socketpair(struct sock *socka,
+						  struct sock *sockb)
+{
+	return 0;
+}
+
 static inline int security_unix_may_send(struct socket *sock,
 					 struct socket *other)
 {
diff --git a/security/security.c b/security/security.c
index 7bc2fde023a7..3dfd374e84e5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1340,6 +1340,12 @@ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct s
 }
 EXPORT_SYMBOL(security_unix_stream_connect);
 
+int security_unix_stream_socketpair(struct sock *socka, struct sock *sockb)
+{
+	return call_int_hook(unix_stream_socketpair, 0, socka, sockb);
+}
+EXPORT_SYMBOL(security_unix_stream_socketpair);
+
 int security_unix_may_send(struct socket *sock,  struct socket *other)
 {
 	return call_int_hook(unix_may_send, 0, sock, other);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: David Herrmann @ 2018-04-23 13:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: James Morris, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, serge, davem, netdev,
	David Herrmann

Hi

This series adds a new LSM hook for the socketpair(2) syscall. The idea
is to allow SO_PEERSEC to be called on AF_UNIX sockets created via
socketpair(2), and return the same information as if you emulated
socketpair(2) via a temporary listener socket. Right now SO_PEERSEC
will return the unlabeled credentials for a socketpair, rather than the
actual credentials of the creating process.

A simple call to:

    socketpair(AF_UNIX, SOCK_STREAM, 0, out);

can be emulated via a temporary listener socket bound to a unique,
random name in the abstract namespace. By connecting to this listener
socket, accept(2) will return the second part of the pair. If
SO_PEERSEC is queried on these, the correct credentials of the creating
process are returned. A simple comparison between the behavior of
SO_PEERSEC on socketpair(2) and an emulated socketpair is included in
the dbus-broker test-suite [1].

This patch series tries to close this gap and makes both behave the
same. A new LSM-hook is added which allows LSMs to cache the correct
peer information on newly created socket-pairs.

Apart from fixing this behavioral difference, the dbus-broker project
actually needs to query the credentials of socketpairs, and currently
must resort to racy procfs(2) queries to get the LSM credentials of its
controller socket. Several parts of the dbus-broker project allow you
to pass in a socket during execve(2), which will be used by the child
process to accept control-commands from its parent. One natural way to
create this communication channel is to use socketpair(2). However,
right now SO_PEERSEC does not return any useful information, hence, the
child-process would need other means to retrieve this information. By
avoiding socketpair(2) and using the hacky-emulated version, this is not
an issue.

There was a previous discussion on this matter [2] roughly a year ago.
Back then there was the suspicion that proper SO_PEERSEC would confuse
applications. However, we could not find any evidence backing this
suspicion. Furthermore, we now actually see the contrary. Lack of
SO_PEERSEC makes it a hassle to use socketpairs with LSM credentials.
Hence, we propose to implement full SO_PEERSEC for socketpairs.

This series only adds SELinux backends, since that is what we need for
RHEL. I will gladly extend the other LSMs if needed.

Thanks
David

[1] https://github.com/bus1/dbus-broker/blob/master/src/util/test-peersec.c
[2] https://www.spinics.net/lists/selinux/msg22674.html

David Herrmann (3):
  security: add hook for socketpair(AF_UNIX, ...)
  net/unix: hook unix_socketpair() into LSM
  selinux: provide unix_stream_socketpair callback

 include/linux/lsm_hooks.h |  8 ++++++++
 include/linux/security.h  |  7 +++++++
 net/unix/af_unix.c        |  5 +++++
 security/security.c       |  6 ++++++
 security/selinux/hooks.c  | 14 ++++++++++++++
 5 files changed, 40 insertions(+)

-- 
2.17.0

^ permalink raw reply

* Re: IP_ADD_MEMBERSHIP with imr_ifindex!=0 for multiple processes with different interfaces
From: Eric Dumazet @ 2018-04-23 13:29 UTC (permalink / raw)
  To: Klebsch, Mario, netdev@vger.kernel.org
In-Reply-To: <9208AF0D0E6F444F80E90796B4E51CA07C163C10@EXCHANGE01.muppets.local>



On 04/23/2018 06:22 AM, Klebsch, Mario wrote:
> Hi, 
> 
> I have a problem with multicast reception in the linux kernel and I hope, this is the right place to ask for help or to report a bug.
> 
> I need to receive multicasts on a single interface. I have written a small program, which executes IP_ADD_MEMBERSHIP with imr.imr_ifindex set to the interface index. The program works well, as long as only a single instance of this program is running. If I start a second instance on a different network interface, both programs receive multicast frames from both interfaces. 
> 
> When called without argument, the test program lists the network interfaces. When called with an interface name as argument, it starts receiving multicasts on that interface.
> 
> I am running vanilla Linux kernel 4.12.0.
> 
> # uname -a
> Linux c627 4.12.0 #1 SMP Mon Apr 23 14:08:24 CEST 2018 i686 Intel(R) Core(TM) i5-7400 CPU @ 3.00GHz GenuineIntel GNU/Linux
> # 
> 
> P.S. The program runs fine on MacOSX.
>

It looks like your program needs to use SO_BINDTODEVICE if it really wants this device filtering ?

^ permalink raw reply

* KMSAN: uninit-value in ip_vs_lblcr_check_expire
From: syzbot @ 2018-04-23 13:28 UTC (permalink / raw)
  To: coreteam, davem, fw, horms, ja, kadlec, linux-kernel, lvs-devel,
	netdev, netfilter-devel, pablo, syzkaller-bugs, wensong

Hello,

syzbot hit the following crash on  
https://github.com/google/kmsan.git/master commit
d2d741e5d1898dfde1a75ea3d29a9a3e2edf0617 (Sun Apr 22 15:05:22 2018 +0000)
kmsan: add initialization for shmem pages
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=3dfdea57819073a04f21

So far this crash happened 2 times on  
https://github.com/google/kmsan.git/master.
Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=6285034612850688
Kernel config: https://syzkaller.appspot.com/x/.config?id=328654897048964367
compiler: clang version 7.0.0 (trunk 329391)

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3dfdea57819073a04f21@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014
R13: 00000000000004f3 R14: 00000000006fa768 R15: 0000000000000000
==================================================================
BUG: KMSAN: uninit-value in ip_vs_lblcr_check_expire+0x1551/0x1600  
net/netfilter/ipvs/ip_vs_lblcr.c:479
CPU: 0 PID: 13883 Comm: syz-executor4 Not tainted 4.16.0+ #86
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
  ip_vs_lblcr_check_expire+0x1551/0x1600 net/netfilter/ipvs/ip_vs_lblcr.c:479
  call_timer_fn+0x26a/0x5a0 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0xda7/0x11c0 kernel/time/timer.c:1666
  run_timer_softirq+0x43/0x70 kernel/time/timer.c:1692
  __do_softirq+0x56d/0x93d kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x202/0x240 kernel/softirq.c:405
  exiting_irq+0xe/0x10 arch/x86/include/asm/apic.h:541
  smp_apic_timer_interrupt+0x64/0x90 arch/x86/kernel/apic/apic.c:1055
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:857
  </IRQ>
RIP: 0010:native_restore_fl arch/x86/include/asm/irqflags.h:37 [inline]
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/irqflags.h:78 [inline]
RIP: 0010:dump_stack+0x1af/0x1d0 lib/dump_stack.c:58
RSP: 0018:ffff880156a2ef00 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff12
RAX: ffff8801fddc2590 RBX: ffff88014f62c418 RCX: ffff880000000000
RDX: ffff8801fd9c2590 RSI: aaaaaaaaaaaab000 RDI: ffffea0000000000
RBP: ffff880156a2ef48 R08: 0000000001080000 R09: 0000000000000002
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000cf000109
R13: 0000000000000286 R14: 0000000000000000 R15: 0000000000000000
  fail_dump lib/fault-inject.c:51 [inline]
  should_fail+0x87b/0xab0 lib/fault-inject.c:149
  should_failslab+0x279/0x2a0 mm/failslab.c:32
  slab_pre_alloc_hook mm/slab.h:422 [inline]
  slab_alloc_node mm/slub.c:2663 [inline]
  slab_alloc mm/slub.c:2745 [inline]
  kmem_cache_alloc+0x136/0xb90 mm/slub.c:2750
  dst_alloc+0x295/0x860 net/core/dst.c:104
  __ip6_dst_alloc net/ipv6/route.c:361 [inline]
  ip6_rt_cache_alloc+0x445/0xd00 net/ipv6/route.c:1061
  ip6_pol_route+0x3f19/0x5da0 net/ipv6/route.c:1751
  ip6_pol_route_output+0xe6/0x110 net/ipv6/route.c:1892
  fib6_rule_lookup+0x494/0x720 net/ipv6/fib6_rules.c:87
  ip6_route_output_flags+0x4fa/0x590 net/ipv6/route.c:1920
  ip6_dst_lookup_tail+0x2fe/0x1a60 net/ipv6/ip6_output.c:992
  ip6_dst_lookup_flow+0xfc/0x270 net/ipv6/ip6_output.c:1093
  rawv6_sendmsg+0x1b05/0x4fb0 net/ipv6/raw.c:908
  inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg net/socket.c:640 [inline]
  ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
  __sys_sendmsg net/socket.c:2080 [inline]
  SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091
  SyS_sendmsg+0x54/0x80 net/socket.c:2087
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x455389
RSP: 002b:00007fa5b1000c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fa5b10016d4 RCX: 0000000000455389
RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014
R13: 00000000000004f3 R14: 00000000006fa768 R15: 0000000000000000

Uninit was created at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
  kmsan_alloc_meta_for_pages+0x161/0x3a0 mm/kmsan/kmsan.c:814
  kmsan_alloc_page+0x82/0xe0 mm/kmsan/kmsan.c:868
  __alloc_pages_nodemask+0xf5b/0x5dc0 mm/page_alloc.c:4283
  alloc_pages_current+0x6b5/0x970 mm/mempolicy.c:2055
  alloc_pages include/linux/gfp.h:494 [inline]
  kmalloc_order mm/slab_common.c:1164 [inline]
  kmalloc_order_trace+0xb9/0x390 mm/slab_common.c:1175
  kmalloc_large include/linux/slab.h:446 [inline]
  __kmalloc+0x332/0x350 mm/slub.c:3778
  kmalloc include/linux/slab.h:517 [inline]
  ip_vs_lblcr_init_svc+0x57/0x310 net/netfilter/ipvs/ip_vs_lblcr.c:518
  ip_vs_bind_scheduler+0xa4/0x1e0 net/netfilter/ipvs/ip_vs_sched.c:51
  ip_vs_add_service+0xa91/0x1d70 net/netfilter/ipvs/ip_vs_ctl.c:1265
  do_ip_vs_set_ctl+0x25c8/0x2790 net/netfilter/ipvs/ip_vs_ctl.c:2457
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x476/0x4d0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x24b/0x2b0 net/ipv4/ip_sockglue.c:1261
  dccp_setsockopt+0x1c3/0x1f0 net/dccp/proto.c:576
  sock_common_setsockopt+0x136/0x170 net/core/sock.c:2975
  SYSC_setsockopt+0x4b8/0x570 net/socket.c:1849
  SyS_setsockopt+0x76/0xa0 net/socket.c:1828
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
==================================================================


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* KMSAN: uninit-value in ip_vs_lblc_check_expire
From: syzbot @ 2018-04-23 13:28 UTC (permalink / raw)
  To: coreteam, davem, fw, horms, ja, kadlec, linux-kernel, lvs-devel,
	netdev, netfilter-devel, pablo, syzkaller-bugs, wensong

Hello,

syzbot hit the following crash on  
https://github.com/google/kmsan.git/master commit
d2d741e5d1898dfde1a75ea3d29a9a3e2edf0617 (Sun Apr 22 15:05:22 2018 +0000)
kmsan: add initialization for shmem pages
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=3e9695f147fb529aa9bc

So far this crash happened 3 times on  
https://github.com/google/kmsan.git/master.
Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=5822255644803072
Kernel config: https://syzkaller.appspot.com/x/.config?id=328654897048964367
compiler: clang version 7.0.0 (trunk 329391)

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3e9695f147fb529aa9bc@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

kernel msg: ebtables bug: please report to author: bad policy
==================================================================
BUG: KMSAN: uninit-value in ip_vs_lblc_check_expire+0xe62/0xf10  
net/netfilter/ipvs/ip_vs_lblc.c:315
CPU: 0 PID: 11383 Comm: syz-executor3 Not tainted 4.16.0+ #86
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
  ip_vs_lblc_check_expire+0xe62/0xf10 net/netfilter/ipvs/ip_vs_lblc.c:315
  call_timer_fn+0x26a/0x5a0 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0xda7/0x11c0 kernel/time/timer.c:1666
  run_timer_softirq+0x43/0x70 kernel/time/timer.c:1692
  __do_softirq+0x56d/0x93d kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x202/0x240 kernel/softirq.c:405
  exiting_irq+0xe/0x10 arch/x86/include/asm/apic.h:541
  smp_apic_timer_interrupt+0x64/0x90 arch/x86/kernel/apic/apic.c:1055
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:857
  </IRQ>
RIP: 0010:native_restore_fl arch/x86/include/asm/irqflags.h:37 [inline]
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/irqflags.h:78 [inline]
RIP: 0010:vprintk_emit+0xcb2/0xff0 kernel/printk/printk.c:1899
RSP: 0018:ffff8801c2a1f0d8 EFLAGS: 00000296 ORIG_RAX: ffffffffffffff12
RAX: 0000000000000296 RBX: ffff8801574c4418 RCX: 0000000000040000
RDX: ffffc900033a6000 RSI: 00000000000001bf RDI: 00000000000001c0
RBP: ffff8801c2a1f1f8 R08: 000000219bfd8445 R09: ffff8801fd6d615d
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: ffffffff8b300430 R14: 0000000000000000 R15: 0000000000000000
  vprintk_default+0x90/0xa0 kernel/printk/printk.c:1955
  vprintk_func+0x517/0x700 kernel/printk/printk_safe.c:379
  printk+0x1b6/0x1f0 kernel/printk/printk.c:1991
  translate_table+0x474/0x5e10 net/bridge/netfilter/ebtables.c:846
  do_replace_finish+0x1258/0x2ea0 net/bridge/netfilter/ebtables.c:1002
  do_replace+0x707/0x770 net/bridge/netfilter/ebtables.c:1141
  do_ebt_set_ctl+0x2ab/0x3c0 net/bridge/netfilter/ebtables.c:1518
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x476/0x4d0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x24b/0x2b0 net/ipv4/ip_sockglue.c:1261
  udp_setsockopt+0x108/0x1b0 net/ipv4/udp.c:2406
  ipv6_setsockopt+0x30c/0x340 net/ipv6/ipv6_sockglue.c:917
  udpv6_setsockopt+0x110/0x1c0 net/ipv6/udp.c:1422
  sock_common_setsockopt+0x136/0x170 net/core/sock.c:2975
  SYSC_setsockopt+0x4b8/0x570 net/socket.c:1849
  SyS_setsockopt+0x76/0xa0 net/socket.c:1828
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x455389
RSP: 002b:00007f470c9e3c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00007f470c9e46d4 RCX: 0000000000455389
RDX: 0000000000000080 RSI: 0000000000000000 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000dd0 R09: 0000000000000000
R10: 0000000020000dc0 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000051d R14: 00000000006fab58 R15: 0000000000000000

Uninit was created at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
  kmsan_alloc_meta_for_pages+0x161/0x3a0 mm/kmsan/kmsan.c:814
  kmsan_alloc_page+0x82/0xe0 mm/kmsan/kmsan.c:868
  __alloc_pages_nodemask+0xf5b/0x5dc0 mm/page_alloc.c:4283
  alloc_pages_current+0x6b5/0x970 mm/mempolicy.c:2055
  alloc_pages include/linux/gfp.h:494 [inline]
  kmalloc_order mm/slab_common.c:1164 [inline]
  kmalloc_order_trace+0xb9/0x390 mm/slab_common.c:1175
  kmalloc_large include/linux/slab.h:446 [inline]
  __kmalloc+0x332/0x350 mm/slub.c:3778
  kmalloc include/linux/slab.h:517 [inline]
  ip_vs_lblc_init_svc+0x57/0x310 net/netfilter/ipvs/ip_vs_lblc.c:355
  ip_vs_bind_scheduler+0xa4/0x1e0 net/netfilter/ipvs/ip_vs_sched.c:51
  ip_vs_add_service+0xa91/0x1d70 net/netfilter/ipvs/ip_vs_ctl.c:1265
  do_ip_vs_set_ctl+0x25c8/0x2790 net/netfilter/ipvs/ip_vs_ctl.c:2457
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x476/0x4d0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x24b/0x2b0 net/ipv4/ip_sockglue.c:1261
  raw_setsockopt+0x2e5/0x350 net/ipv4/raw.c:870
  sock_common_setsockopt+0x136/0x170 net/core/sock.c:2975
  SYSC_setsockopt+0x4b8/0x570 net/socket.c:1849
  SyS_setsockopt+0x76/0xa0 net/socket.c:1828
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
==================================================================


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* IP_ADD_MEMBERSHIP with imr_ifindex!=0 for multiple processes with different interfaces
From: Klebsch, Mario @ 2018-04-23 13:22 UTC (permalink / raw)
  To: netdev@vger.kernel.org

Hi, 

I have a problem with multicast reception in the linux kernel and I hope, this is the right place to ask for help or to report a bug.

I need to receive multicasts on a single interface. I have written a small program, which executes IP_ADD_MEMBERSHIP with imr.imr_ifindex set to the interface index. The program works well, as long as only a single instance of this program is running. If I start a second instance on a different network interface, both programs receive multicast frames from both interfaces. 

When called without argument, the test program lists the network interfaces. When called with an interface name as argument, it starts receiving multicasts on that interface.

I am running vanilla Linux kernel 4.12.0.

# uname -a
Linux c627 4.12.0 #1 SMP Mon Apr 23 14:08:24 CEST 2018 i686 Intel(R) Core(TM) i5-7400 CPU @ 3.00GHz GenuineIntel GNU/Linux
# 

P.S. The program runs fine on MacOSX.

73, Mario

----8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<----
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <ifaddrs.h>
#include <net/if.h>

#define MCAST_PORT  6154
#define MCAST_ADDR  "239.255.1.1"

/*
 * Print "<name>: <dotted-quad address>" on stdout for every entry of the
 * given getifaddrs()-style list that is flagged IFF_UP and carries an
 * AF_INET address.  Entries that are down, have no address, or have an
 * address of another family are skipped.
 */
void ListInterfaces(struct ifaddrs *Interfaces)
{
    struct ifaddrs *entry = Interfaces;

    while (entry != NULL)
    {
        int is_up = (entry->ifa_flags & IFF_UP) != 0;
        const struct sockaddr *sa = entry->ifa_addr;

        if (is_up && sa != NULL && sa->sa_family == AF_INET)
        {
            const struct sockaddr_in *in4 = (const struct sockaddr_in *)sa;

            printf("%s: %s\n", entry->ifa_name, inet_ntoa(in4->sin_addr));
        }

        entry = entry->ifa_next;
    }
}

/*
 * Multicast reception test.
 *
 * Without an argument (or when argv[1] does not name an interface that is
 * up and has an IPv4 address) the available interfaces are listed and the
 * program returns 0.
 *
 * With an interface name in argv[1], a UDP socket is bound to
 * MCAST_ADDR:MCAST_PORT, the group is joined on that interface via
 * IP_ADD_MEMBERSHIP with imr_ifindex set, and one line is printed per
 * received datagram until recvfrom() fails or returns 0.
 *
 * Returns 0 after listing interfaces or when the receive loop ends, and -1
 * when getifaddrs(), socket(), bind() or IP_ADD_MEMBERSHIP fails.
 */
int main(int argc, char *argv[])
{
    struct ifaddrs *Interfaces;
    if (getifaddrs(&Interfaces) < 0)
    {
        perror("getifaddrs");
        return -1;
    }

    /* Find the IPv4 address of the interface named on the command line. */
    struct sockaddr_in *MyIfAddr = NULL;
    int                 MyIfIndex = 0;
    if (argc > 1 && (MyIfIndex = (int)if_nametoindex(argv[1])))
    {
        for (struct ifaddrs *a = Interfaces; a; a = a->ifa_next)
        {
            if (!(a->ifa_flags & IFF_UP))
                continue;
            if (!a->ifa_addr || a->ifa_addr->sa_family != AF_INET)
                continue;
            if (strcmp(argv[1], a->ifa_name) != 0)
                continue;
            MyIfAddr = (struct sockaddr_in *)a->ifa_addr;
            break;
        }
    }

    if (!MyIfAddr || !MyIfIndex)
    {
        ListInterfaces(Interfaces);
        freeifaddrs(Interfaces);
        return 0;
    }

    /* Copy the address out so the interface list can be released now;
     * MyIfAddr points into that list and must not be used afterwards. */
    struct in_addr MyIfInAddr = MyIfAddr->sin_addr;
    freeifaddrs(Interfaces);
    MyIfAddr = NULL;

    /* The socket is never closed explicitly: every path below ends the
     * process, which releases the descriptor. */
    int s = socket(PF_INET, SOCK_DGRAM, 0);
    if (s < 0)
    {
        perror("socket");
        return -1;
    }

    /* Socket options are best effort: report a failure and carry on. */
    int off = 0;
    if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &off, sizeof(off)) < 0)
        perror("setsockopt(IP_MULTICAST_LOOP)");

    int on = 1;
    if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
        perror("setsockopt(SO_REUSEADDR)");

    /* Zero the whole structure so the padding bytes are not stack garbage. */
    struct sockaddr_in Addr;
    memset(&Addr, 0, sizeof(Addr));
    Addr.sin_family = AF_INET;
    Addr.sin_port = htons(MCAST_PORT);
    inet_aton(MCAST_ADDR, &Addr.sin_addr);
    if (bind(s, (struct sockaddr *)&Addr, sizeof(Addr)) < 0)
    {
        /* Without a bound socket the receive loop cannot work. */
        perror("bind");
        return -1;
    }

    struct ip_mreqn imr;
    memset(&imr, 0, sizeof(imr));
    inet_aton(MCAST_ADDR, &imr.imr_multiaddr);
    imr.imr_address = MyIfInAddr;
    imr.imr_ifindex = MyIfIndex;
    if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &imr, sizeof(imr)) < 0)
    {
        /* Not joined to the group: anything printed later would mislead. */
        perror("setsockopt(IP_ADD_MEMBERSHIP)");
        return -1;
    }

    for (;;)
    {
        struct sockaddr_in AddrBuffer;
        socklen_t          AddrLen = sizeof(AddrBuffer);
        char               Buffer[2048];

        /* recvfrom() returns ssize_t; a signed type is required for the
         * error check below to be able to see -1. */
        ssize_t BufferLen = recvfrom(s, Buffer, sizeof(Buffer), 0,
                                     (struct sockaddr *)&AddrBuffer, &AddrLen);
        if (BufferLen <= 0)
        {
            if (BufferLen < 0)
                perror("recvfrom");
            break;
        }
        printf("%s: Received %zd bytes from %s\n", argv[1], BufferLen,
               inet_ntoa(AddrBuffer.sin_addr));
    }

    return 0;
}
----8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<--------8<----

-- 
Mario Klebsch				Actia I+ME GmbH
Mario.klebsch@ime-actia.de		Dresdenstrasse 17/18
Fon: +49 531 38 701 716			38124 Braunschweig
Fax: +49 531 38 701 88			Germany



^ permalink raw reply

* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Vlad Yasevich @ 2018-04-23 13:17 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, Michael S. Tsirkin
  Cc: Vladislav Yasevich, netdev, linux-sctp, virtualization, jasowang,
	nhorman
In-Reply-To: <20180420172219.GR4716@localhost.localdomain>

On 04/20/2018 01:22 PM, Marcelo Ricardo Leitner wrote:
> On Wed, Apr 18, 2018 at 05:06:46PM +0300, Michael S. Tsirkin wrote:
>> On Tue, Apr 17, 2018 at 04:35:18PM -0400, Vlad Yasevich wrote:
>>> On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
>>>> On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
>>>>> Now that we have SCTP offload capabilities in the kernel, we can add
>>>>> them to virtio as well.  First step is SCTP checksum.
>>>>
>>>> Thanks.
>>>>
>>>>> As for GSO, the way sctp GSO is currently implemented buys us nothing
>>>>> in added support to virtio.  To add true GSO, would require a lot of
>>>>> re-work inside of SCTP and would require extensions to the virtio
>>>>> net header to carry extra sctp data.
>>>>
>>>> Can you please elaborate more on this? Is this because SCTP GSO relies
>>>> on the gso skb format for knowing how to segment it instead of having
>>>> a list of sizes?
>>>>
>>>
>>> it's mainly because all the true segmentation, placing data into chunks,
>>> has already happened.  All that GSO does is allow for higher bundling
>>> rate between VMs. If that is all SCTP GSO ever going to do, that fine,
>>> but the goal is to do real GSO eventually and potentially reduce the
>>> amount of memory copying we are doing.
>>> If we do that, any current attempt at GSO in virtio would have to be
>>> deprecated and we'd need GSO2 or something like that.
>>
>> Batching helps virtualization *a lot* though.
> 
> Yep. The results posted by Xin in the other email give good insights
> on it.
> 
>> Are there actual plans for GSO2? Is it just for SCTP?
> 
> No plans. In this context, at least, yes, just for SCTP.
> 
> It was a supposition in case we start doing a different GSO for SCTP,
> one more like what we have for TCP.
> 
> Currently, as the SCTP GSO code doesn't leave the system, we can
> update it if we want. But by the moment we add support for it in
> virtio, we will have to be backwards compatible if we end up doing
> SCTP GSO differently.

So, just because the linux code doesn't do it differently doesn't mean
that someone else doesn't.  Since the device has to work across different
possible implementations, it needs to be generic enough.  If we simply
document the current linux practice, that may not be ideal in the future.

I was hesitant to introduce this without studying the feasibility of
doing late segmentation.

-vlad

> 
> But again, I don't think such an approach for SCTP GSO would be either
> feasible or worthwhile. The complexity for it, to work across stream
> schedules and late TSN allocation, would do more harm than good IMO.
> 
>>
>>>
>>> This is why, after doing the GSO support, I decided not to include it.
>>>
>>> -vlad
>>>>   Marcelo
>>>>

^ permalink raw reply

* Re: Page allocator bottleneck
From: Aaron Lu @ 2018-04-23 13:10 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: Linux Kernel Network Developers, linux-mm, Mel Gorman,
	David Miller, Jesper Dangaard Brouer, Eric Dumazet,
	Alexei Starovoitov, Saeed Mahameed, Eran Ben Elisha,
	Andrew Morton, Michal Hocko
In-Reply-To: <0dea4da6-8756-22d4-c586-267217a5fa63@mellanox.com>

On Mon, Apr 23, 2018 at 11:54:57AM +0300, Tariq Toukan wrote:
> Hi,
> 
> I ran my tests with your patches.
> Initial BW numbers are significantly higher than I documented back then in
> this mail-thread.
> For example, in driver #2 (see original mail thread), with 6 rings, I now
> get 92Gbps (slightly less than linerate) in comparison to 64Gbps back then.
> 
> However, there were many kernel changes since then, I need to isolate your
> changes. I am not sure I can finish this today, but I will surely get to it
> next week after I'm back from vacation.
> 
> Still, when I increase the scale (more rings, i.e. more cpus), I see that
> queued_spin_lock_slowpath gets to 60%+ cpu. Still high, but lower than it
> used to be.

I wonder if it is on allocation path or free path?

Also, increasing PCP size through vm.percpu_pagelist_fraction would
still help with my patches since it can avoid touching even more cache
lines on allocation path with a higher PCP->batch(which has an upper
limit of 96 though at the moment).

> 
> This should be root solved by the (orthogonal) changes planned in network
> subsystem, which will change the SKB allocation/free scheme so that SKBs are
> released on the originating cpu.

^ permalink raw reply

* [PATCH net] sfc: ARFS filter IDs
From: Edward Cree @ 2018-04-23 13:08 UTC (permalink / raw)
  To: linux-net-drivers, David Miller; +Cc: netdev

Associate an arbitrary ID with each ARFS filter, allowing to properly query
 for expiry.  The association is maintained in a hash table, which is
 protected by a spinlock.

Fixes: 3af0f34290f6 ("sfc: replace asynchronous filter operations")
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c       |  80 +++++++++++--------
 drivers/net/ethernet/sfc/efx.c        | 143 ++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx.h        |  19 +++++
 drivers/net/ethernet/sfc/farch.c      |  41 ++++++++--
 drivers/net/ethernet/sfc/net_driver.h |  36 +++++++++
 drivers/net/ethernet/sfc/rx.c         |  62 +++++++++++++--
 6 files changed, 335 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 83ce229f4eb7..63036d9bf3e6 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -3999,29 +3999,6 @@ static void efx_ef10_prepare_flr(struct efx_nic *efx)
 	atomic_set(&efx->active_queues, 0);
 }
 
-static bool efx_ef10_filter_equal(const struct efx_filter_spec *left,
-				  const struct efx_filter_spec *right)
-{
-	if ((left->match_flags ^ right->match_flags) |
-	    ((left->flags ^ right->flags) &
-	     (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX)))
-		return false;
-
-	return memcmp(&left->outer_vid, &right->outer_vid,
-		      sizeof(struct efx_filter_spec) -
-		      offsetof(struct efx_filter_spec, outer_vid)) == 0;
-}
-
-static unsigned int efx_ef10_filter_hash(const struct efx_filter_spec *spec)
-{
-	BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3);
-	return jhash2((const u32 *)&spec->outer_vid,
-		      (sizeof(struct efx_filter_spec) -
-		       offsetof(struct efx_filter_spec, outer_vid)) / 4,
-		      0);
-	/* XXX should we randomise the initval? */
-}
-
 /* Decide whether a filter should be exclusive or else should allow
  * delivery to additional recipients.  Currently we decide that
  * filters for specific local unicast MAC and IP addresses are
@@ -4346,7 +4323,7 @@ static s32 efx_ef10_filter_insert(struct efx_nic *efx,
 		goto out_unlock;
 	match_pri = rc;
 
-	hash = efx_ef10_filter_hash(spec);
+	hash = efx_filter_spec_hash(spec);
 	is_mc_recip = efx_filter_is_mc_recipient(spec);
 	if (is_mc_recip)
 		bitmap_zero(mc_rem_map, EFX_EF10_FILTER_SEARCH_LIMIT);
@@ -4378,7 +4355,7 @@ static s32 efx_ef10_filter_insert(struct efx_nic *efx,
 		if (!saved_spec) {
 			if (ins_index < 0)
 				ins_index = i;
-		} else if (efx_ef10_filter_equal(spec, saved_spec)) {
+		} else if (efx_filter_spec_equal(spec, saved_spec)) {
 			if (spec->priority < saved_spec->priority &&
 			    spec->priority != EFX_FILTER_PRI_AUTO) {
 				rc = -EPERM;
@@ -4762,27 +4739,62 @@ static s32 efx_ef10_filter_get_rx_ids(struct efx_nic *efx,
 static bool efx_ef10_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id,
 					   unsigned int filter_idx)
 {
+	struct efx_filter_spec *spec, saved_spec;
 	struct efx_ef10_filter_table *table;
-	struct efx_filter_spec *spec;
-	bool ret;
+	struct efx_arfs_rule *rule = NULL;
+	bool ret = true, force = false;
+	u16 arfs_id;
 
 	down_read(&efx->filter_sem);
 	table = efx->filter_state;
 	down_write(&table->lock);
 	spec = efx_ef10_filter_entry_spec(table, filter_idx);
 
-	if (!spec || spec->priority != EFX_FILTER_PRI_HINT) {
-		ret = true;
+	if (!spec || spec->priority != EFX_FILTER_PRI_HINT)
 		goto out_unlock;
-	}
 
-	if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id, flow_id, 0)) {
-		ret = false;
-		goto out_unlock;
+	spin_lock_bh(&efx->rps_hash_lock);
+	if (!efx->rps_hash_table) {
+		/* In the absence of the table, we always return 0 to ARFS. */
+		arfs_id = 0;
+	} else {
+		rule = efx_rps_hash_find(efx, spec);
+		if (!rule)
+			/* ARFS table doesn't know of this filter, so remove it */
+			goto expire;
+		arfs_id = rule->arfs_id;
+		ret = efx_rps_check_rule(rule, filter_idx, &force);
+		if (force)
+			goto expire;
+		if (!ret) {
+			spin_unlock_bh(&efx->rps_hash_lock);
+			goto out_unlock;
+		}
 	}
-
+	if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id, flow_id, arfs_id))
+		ret = false;
+	else if (rule)
+		rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING;
+expire:
+	saved_spec = *spec; /* remove operation will kfree spec */
+	spin_unlock_bh(&efx->rps_hash_lock);
+	/* At this point (since we dropped the lock), another thread might queue
+	 * up a fresh insertion request (but the actual insertion will be held
+	 * up by our possession of the filter table lock).  In that case, it
+	 * will set rule->filter_id to EFX_ARFS_FILTER_ID_PENDING, meaning that
+	 * the rule is not removed by efx_rps_hash_del() below.
+	 */
 	ret = efx_ef10_filter_remove_internal(efx, 1U << spec->priority,
 					      filter_idx, true) == 0;
+	/* While we can't safely dereference rule (we dropped the lock), we can
+	 * still test it for NULL.
+	 */
+	if (ret && rule) {
+		/* Expiring, so remove entry from ARFS table */
+		spin_lock_bh(&efx->rps_hash_lock);
+		efx_rps_hash_del(efx, &saved_spec);
+		spin_unlock_bh(&efx->rps_hash_lock);
+	}
 out_unlock:
 	up_write(&table->lock);
 	up_read(&efx->filter_sem);
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 692dd729ee2a..a4ebd8715494 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -3027,6 +3027,10 @@ static int efx_init_struct(struct efx_nic *efx,
 	mutex_init(&efx->mac_lock);
 #ifdef CONFIG_RFS_ACCEL
 	mutex_init(&efx->rps_mutex);
+	spin_lock_init(&efx->rps_hash_lock);
+	/* Failure to allocate is not fatal, but may degrade ARFS performance */
+	efx->rps_hash_table = kcalloc(EFX_ARFS_HASH_TABLE_SIZE,
+				      sizeof(*efx->rps_hash_table), GFP_KERNEL);
 #endif
 	efx->phy_op = &efx_dummy_phy_operations;
 	efx->mdio.dev = net_dev;
@@ -3070,6 +3074,10 @@ static void efx_fini_struct(struct efx_nic *efx)
 {
 	int i;
 
+#ifdef CONFIG_RFS_ACCEL
+	kfree(efx->rps_hash_table);
+#endif
+
 	for (i = 0; i < EFX_MAX_CHANNELS; i++)
 		kfree(efx->channel[i]);
 
@@ -3092,6 +3100,141 @@ void efx_update_sw_stats(struct efx_nic *efx, u64 *stats)
 	stats[GENERIC_STAT_rx_noskb_drops] = atomic_read(&efx->n_rx_noskb_drops);
 }
 
+bool efx_filter_spec_equal(const struct efx_filter_spec *left,
+			   const struct efx_filter_spec *right)
+{
+	if ((left->match_flags ^ right->match_flags) |
+	    ((left->flags ^ right->flags) &
+	     (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX)))
+		return false;
+
+	return memcmp(&left->outer_vid, &right->outer_vid,
+		      sizeof(struct efx_filter_spec) -
+		      offsetof(struct efx_filter_spec, outer_vid)) == 0;
+}
+
+u32 efx_filter_spec_hash(const struct efx_filter_spec *spec)
+{
+	BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3);
+	return jhash2((const u32 *)&spec->outer_vid,
+		      (sizeof(struct efx_filter_spec) -
+		       offsetof(struct efx_filter_spec, outer_vid)) / 4,
+		      0);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx,
+			bool *force)
+{
+	if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) {
+		/* ARFS is currently updating this entry, leave it */
+		return false;
+	}
+	if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) {
+		/* ARFS tried and failed to update this, so it's probably out
+		 * of date.  Remove the filter and the ARFS rule entry.
+		 */
+		rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING;
+		*force = true;
+		return true;
+	} else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */
+		/* ARFS has moved on, so old filter is not needed.  Since we did
+		 * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will
+		 * not be removed by efx_rps_hash_del() subsequently.
+		 */
+		*force = true;
+		return true;
+	}
+	/* Remove it iff ARFS wants to. */
+	return true;
+}
+
+struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec)
+{
+	u32 hash = efx_filter_spec_hash(spec);
+
+	WARN_ON(!spin_is_locked(&efx->rps_hash_lock));
+	if (!efx->rps_hash_table)
+		return NULL;
+	return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE];
+}
+
+struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx,
+					const struct efx_filter_spec *spec)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (!head)
+		return NULL;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec))
+			return rule;
+	}
+	return NULL;
+}
+
+struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec,
+				       bool *new)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (!head)
+		return NULL;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec)) {
+			*new = false;
+			return rule;
+		}
+	}
+	rule = kmalloc(sizeof(*rule), GFP_ATOMIC);
+	*new = true;
+	if (rule) {
+		memcpy(&rule->spec, spec, sizeof(rule->spec));
+		hlist_add_head(&rule->node, head);
+	}
+	return rule;
+}
+
+void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec)
+{
+	struct efx_arfs_rule *rule;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = efx_rps_hash_bucket(efx, spec);
+	if (WARN_ON(!head))
+		return;
+	hlist_for_each(node, head) {
+		rule = container_of(node, struct efx_arfs_rule, node);
+		if (efx_filter_spec_equal(spec, &rule->spec)) {
+			/* Someone already reused the entry.  We know that if
+			 * this check doesn't fire (i.e. filter_id == REMOVING)
+			 * then the REMOVING mark was put there by our caller,
+			 * because caller is holding a lock on filter table and
+			 * only holders of that lock set REMOVING.
+			 */
+			if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING)
+				return;
+			hlist_del(node);
+			kfree(rule);
+			return;
+		}
+	}
+	/* We didn't find it. */
+	WARN_ON(1);
+}
+#endif
+
 /* RSS contexts.  We're using linked lists and crappy O(n) algorithms, because
  * (a) this is an infrequent control-plane operation and (b) n is small (max 64)
  */
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index a3140e16fcef..6b4164b6d938 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -186,6 +186,25 @@ static inline void efx_filter_rfs_expire(struct work_struct *data) {}
 #endif
 bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec);
 
+bool efx_filter_spec_equal(const struct efx_filter_spec *left,
+			   const struct efx_filter_spec *right);
+u32 efx_filter_spec_hash(const struct efx_filter_spec *spec);
+
+bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx,
+			bool *force);
+
+struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx,
+					const struct efx_filter_spec *spec);
+
+/* @new is written to indicate if entry was newly added (true) or if an old
+ * entry was found and returned (false).
+ */
+struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx,
+				       const struct efx_filter_spec *spec,
+				       bool *new);
+
+void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec);
+
 /* RSS contexts */
 struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx);
 struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id);
diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c
index 7174ef5e5c5e..ade694c1f9a6 100644
--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -2905,18 +2905,45 @@ bool efx_farch_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id,
 {
 	struct efx_farch_filter_state *state = efx->filter_state;
 	struct efx_farch_filter_table *table;
-	bool ret = false;
+	bool ret = false, force = false;
+	u16 arfs_id;
 
 	down_write(&state->lock);
+	spin_lock_bh(&efx->rps_hash_lock);
 	table = &state->table[EFX_FARCH_FILTER_TABLE_RX_IP];
 	if (test_bit(index, table->used_bitmap) &&
-	    table->spec[index].priority == EFX_FILTER_PRI_HINT &&
-	    rps_may_expire_flow(efx->net_dev, table->spec[index].dmaq_id,
-				flow_id, 0)) {
-		efx_farch_filter_table_clear_entry(efx, table, index);
-		ret = true;
+	    table->spec[index].priority == EFX_FILTER_PRI_HINT) {
+		struct efx_filter_spec spec;
+		struct efx_arfs_rule *rule;
+
+		efx_farch_filter_to_gen_spec(&spec, &table->spec[index]);
+		if (!efx->rps_hash_table) {
+			/* In the absence of the table, we always returned 0 to
+			 * ARFS, so use the same to query it.
+			 */
+			arfs_id = 0;
+		} else {
+			rule = efx_rps_hash_find(efx, &spec);
+			if (!rule) {
+				/* ARFS table doesn't know of this filter, remove it */
+				force = true;
+			} else {
+				arfs_id = rule->arfs_id;
+				if (!efx_rps_check_rule(rule, index, &force))
+					goto out_unlock;
+			}
+		}
+		if (force || rps_may_expire_flow(efx->net_dev, spec.dmaq_id,
+						 flow_id, arfs_id)) {
+			if (rule)
+				rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING;
+			efx_rps_hash_del(efx, &spec);
+			efx_farch_filter_table_clear_entry(efx, table, index);
+			ret = true;
+		}
 	}
-
+out_unlock:
+	spin_unlock_bh(&efx->rps_hash_lock);
 	up_write(&state->lock);
 	return ret;
 }
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index eea3808b3f25..65568925c3ef 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -734,6 +734,35 @@ struct efx_rss_context {
 };
 
 #ifdef CONFIG_RFS_ACCEL
+/* Order of these is important, since filter_id >= %EFX_ARFS_FILTER_ID_PENDING
+ * is used to test if filter does or will exist.
+ */
+#define EFX_ARFS_FILTER_ID_PENDING	-1
+#define EFX_ARFS_FILTER_ID_ERROR	-2
+#define EFX_ARFS_FILTER_ID_REMOVING	-3
+/**
+ * struct efx_arfs_rule - record of an ARFS filter and its IDs
+ * @node: linkage into hash table
+ * @spec: details of the filter (used as key for hash table).  Use efx->type to
+ *	determine which member to use.
+ * @rxq_index: channel to which the filter will steer traffic.
+ * @arfs_id: filter ID which was returned to ARFS
+ * @filter_id: index in software filter table.  May be
+ *	%EFX_ARFS_FILTER_ID_PENDING if filter was not inserted yet,
+ *	%EFX_ARFS_FILTER_ID_ERROR if filter insertion failed, or
+ *	%EFX_ARFS_FILTER_ID_REMOVING if expiry is currently removing the filter.
+ */
+struct efx_arfs_rule {
+	struct hlist_node node;
+	struct efx_filter_spec spec;
+	u16 rxq_index;
+	u16 arfs_id;
+	s32 filter_id;
+};
+
+/* Size chosen so that the table is one page (4kB) */
+#define EFX_ARFS_HASH_TABLE_SIZE	512
+
 /**
  * struct efx_async_filter_insertion - Request to asynchronously insert a filter
  * @net_dev: Reference to the netdevice
@@ -873,6 +902,10 @@ struct efx_async_filter_insertion {
  *	@rps_expire_channel's @rps_flow_id
  * @rps_slot_map: bitmap of in-flight entries in @rps_slot
  * @rps_slot: array of ARFS insertion requests for efx_filter_rfs_work()
+ * @rps_hash_lock: Protects ARFS filter mapping state (@rps_hash_table and
+ *	@rps_next_id).
+ * @rps_hash_table: Mapping between ARFS filters and their various IDs
+ * @rps_next_id: next arfs_id for an ARFS filter
  * @active_queues: Count of RX and TX queues that haven't been flushed and drained.
  * @rxq_flush_pending: Count of number of receive queues that need to be flushed.
  *	Decremented when the efx_flush_rx_queue() is called.
@@ -1029,6 +1062,9 @@ struct efx_nic {
 	unsigned int rps_expire_index;
 	unsigned long rps_slot_map;
 	struct efx_async_filter_insertion rps_slot[EFX_RPS_MAX_IN_FLIGHT];
+	spinlock_t rps_hash_lock;
+	struct hlist_head *rps_hash_table;
+	u32 rps_next_id;
 #endif
 
 	atomic_t active_queues;
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 9c593c661cbf..64a94f242027 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -834,9 +834,29 @@ static void efx_filter_rfs_work(struct work_struct *data)
 	struct efx_nic *efx = netdev_priv(req->net_dev);
 	struct efx_channel *channel = efx_get_channel(efx, req->rxq_index);
 	int slot_idx = req - efx->rps_slot;
+	struct efx_arfs_rule *rule;
+	u16 arfs_id = 0;
 	int rc;
 
 	rc = efx->type->filter_insert(efx, &req->spec, true);
+	if (efx->rps_hash_table) {
+		spin_lock_bh(&efx->rps_hash_lock);
+		rule = efx_rps_hash_find(efx, &req->spec);
+		/* The rule might have already gone, if someone else's request
+		 * for the same spec was already worked and then expired before
+		 * we got around to our work.  In that case we have nothing
+		 * tying us to an arfs_id, meaning that as soon as the filter
+		 * is considered for expiry it will be removed.
+		 */
+		if (rule) {
+			if (rc < 0)
+				rule->filter_id = EFX_ARFS_FILTER_ID_ERROR;
+			else
+				rule->filter_id = rc;
+			arfs_id = rule->arfs_id;
+		}
+		spin_unlock_bh(&efx->rps_hash_lock);
+	}
 	if (rc >= 0) {
 		/* Remember this so we can check whether to expire the filter
 		 * later.
@@ -848,18 +868,18 @@ static void efx_filter_rfs_work(struct work_struct *data)
 
 		if (req->spec.ether_type == htons(ETH_P_IP))
 			netif_info(efx, rx_status, efx->net_dev,
-				   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
+				   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d id %u]\n",
 				   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 				   req->spec.rem_host, ntohs(req->spec.rem_port),
 				   req->spec.loc_host, ntohs(req->spec.loc_port),
-				   req->rxq_index, req->flow_id, rc);
+				   req->rxq_index, req->flow_id, rc, arfs_id);
 		else
 			netif_info(efx, rx_status, efx->net_dev,
-				   "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d]\n",
+				   "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d id %u]\n",
 				   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 				   req->spec.rem_host, ntohs(req->spec.rem_port),
 				   req->spec.loc_host, ntohs(req->spec.loc_port),
-				   req->rxq_index, req->flow_id, rc);
+				   req->rxq_index, req->flow_id, rc, arfs_id);
 	}
 
 	/* Release references */
@@ -872,8 +892,10 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_async_filter_insertion *req;
+	struct efx_arfs_rule *rule;
 	struct flow_keys fk;
 	int slot_idx;
+	bool new;
 	int rc;
 
 	/* find a free slot */
@@ -926,12 +948,42 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	req->spec.rem_port = fk.ports.src;
 	req->spec.loc_port = fk.ports.dst;
 
+	if (efx->rps_hash_table) {
+		/* Add it to ARFS hash table */
+		spin_lock(&efx->rps_hash_lock);
+		rule = efx_rps_hash_add(efx, &req->spec, &new);
+		if (!rule) {
+			rc = -ENOMEM;
+			goto out_unlock;
+		}
+		if (new)
+			rule->arfs_id = efx->rps_next_id++ % RPS_NO_FILTER;
+		rc = rule->arfs_id;
+		/* Skip if existing or pending filter already does the right thing */
+		if (!new && rule->rxq_index == rxq_index &&
+		    rule->filter_id >= EFX_ARFS_FILTER_ID_PENDING)
+			goto out_unlock;
+		rule->rxq_index = rxq_index;
+		rule->filter_id = EFX_ARFS_FILTER_ID_PENDING;
+		spin_unlock(&efx->rps_hash_lock);
+	} else {
+		/* Without an ARFS hash table, we just use arfs_id 0 for all
+		 * filters.  This means if multiple flows hash to the same
+		 * flow_id, all but the most recently touched will be eligible
+		 * for expiry.
+		 */
+		rc = 0;
+	}
+
+	/* Queue the request */
 	dev_hold(req->net_dev = net_dev);
 	INIT_WORK(&req->work, efx_filter_rfs_work);
 	req->rxq_index = rxq_index;
 	req->flow_id = flow_id;
 	schedule_work(&req->work);
-	return 0;
+	return rc;
+out_unlock:
+	spin_unlock(&efx->rps_hash_lock);
 out_clear:
 	clear_bit(slot_idx, &efx->rps_slot_map);
 	return rc;

^ permalink raw reply related

* Repeating "unregister_netdevice: waiting for lo to become free" caused by upstream 76da0704507bb ("ipv6: only call ip6_route_dev_notify() once for NETDEV_UNREGISTER")
From: Rafał Miłecki @ 2018-04-23 13:08 UTC (permalink / raw)
  To: WANG Cong, David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Network Development, jeffy, David Ahern, Khlebnikov
  Cc: Greg Kroah-Hartman, Stable

Hi,

I've just updated my kernel 4.4.x and noticed a regression. Bisecting
pointed me to the commit 2417da3f4d6bc ("ipv6: only call
ip6_route_dev_notify() once for NETDEV_UNREGISTER") [0] which is
a backport of upstream 76da0704507bb. That backported commit
appeared in 4.4.103.

I use OpenWrt/LEDE [1] distribution and LXC [2] 1.1.5. After stopping
a container I start getting these messages:
[  229.419188] unregister_netdevice: waiting for lo to become free.
Usage count = 1
[  239.660408] unregister_netdevice: waiting for lo to become free.
Usage count = 1
[  249.839189] unregister_netdevice: waiting for lo to become free.
Usage count = 1
(...)

Trying to start LXC nevertheless results in lxc-start command hang
around network configuration. Trying to query LXC state afterwards
results in a lxc-info command hang too.

I tried Googling for this issue and found similar reports:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1729637
https://github.com/fnproject/fn/issues/686
https://lime-technology.com/forums/topic/66863-kernelunregister_netdevice-waiting-for-lo-to-become-free-usage-count-1/
all of them related to Docker, which is probably a similar use
case to LXC.

I couldn't find any reference to commit 76da0704507bb that could
suggest fixing the problem I'm seeing.

Does anyone have an idea what the issue I'm seeing is about? Or even
better, how to fix it? Can I provide any additional info that would
help?

[0] https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=linux-4.4.y&id=2417da3f4d6bc4fc6c77f613f0e2264090892aa5
[1] https://openwrt.org/
[2] https://linuxcontainers.org/

-- 
Rafał

^ permalink raw reply

* [PATCH net 1/1] Modify the seq_puts and seq_printf of af_netlink.c file
From: Bo YU @ 2018-04-23 12:51 UTC (permalink / raw)
  To: davem, Wang, Berg, Tkhai, Long, Elena; +Cc: netdev

Modify the format specifiers in the seq_printf call and adjust the spacing
of the seq_puts header line so that the columns line up in the output of
`cat /proc/net/netlink`.

Signed-off-by: Bo YU <tsu.yubo@gmail.com>
---
  net/netlink/af_netlink.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 55342c4d5cec..2e2dd88fc79f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2606,13 +2606,13 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
  {
  	if (v == SEQ_START_TOKEN) {
  		seq_puts(seq,
-			 "sk       Eth Pid    Groups   "
-			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
+			 "sk               Eth Pid        Groups   "
+			 "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
  	} else {
  		struct sock *s = v;
  		struct netlink_sock *nlk = nlk_sk(s);

-		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
+		seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8d %-8lu\n",
  			   s,
  			   s->sk_protocol,
  			   nlk->portid,

^ permalink raw reply related

* [PATCH] dca: make function dca_common_get_tag static
From: Colin King @ 2018-04-23 12:49 UTC (permalink / raw)
  To: netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Function dca_common_get_tag is local to the source and does not need to be
in global scope, so make it static.

Cleans up sparse warning:
drivers/dca/dca-core.c:273:4: warning: symbol 'dca_common_get_tag' was
not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/dca/dca-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 7afbb28d6a0f..1bc5ffb338c8 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -270,7 +270,7 @@ EXPORT_SYMBOL_GPL(dca_remove_requester);
  * @dev - the device that wants dca service
  * @cpu - the cpuid as returned by get_cpu()
  */
-u8 dca_common_get_tag(struct device *dev, int cpu)
+static u8 dca_common_get_tag(struct device *dev, int cpu)
 {
 	struct dca_provider *dca;
 	u8 tag;
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next] lan78xx: Lan7801 Support for Fixed PHY
From: Andrew Lunn @ 2018-04-23 12:42 UTC (permalink / raw)
  To: Raghuram Chary J; +Cc: davem, netdev, unglinuxdriver, woojung.huh
In-Reply-To: <20180423044630.2672-1-raghuramchary.jallipalli@microchip.com>

>  #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
>  #define DRIVER_DESC	"LAN78XX USB 3.0 Gigabit Ethernet Devices"
>  #define DRIVER_NAME	"lan78xx"
> -#define DRIVER_VERSION	"1.0.6"
> +#define DRIVER_VERSION	"1.0.7"

Hi Raghuram

Driver version strings are pretty pointless. You might want to remove
it.

>  
>  #define TX_TIMEOUT_JIFFIES		(5 * HZ)
>  #define THROTTLE_JIFFIES		(HZ / 8)
> @@ -426,6 +426,7 @@ struct lan78xx_net {
>  	struct statstage	stats;
>  
>  	struct irq_domain_data	domain_data;
> +	struct phy_device	*fixedphy;
>  };
>  
>  /* define external phy id */
> @@ -2062,11 +2063,39 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
>  	int ret;
>  	u32 mii_adv;
>  	struct phy_device *phydev;
> +	struct fixed_phy_status fphy_status = {
> +		.link = 1,
> +		.speed = SPEED_1000,
> +		.duplex = DUPLEX_FULL,
> +	};
>  
>  	phydev = phy_find_first(dev->mdiobus);
>  	if (!phydev) {
> -		netdev_err(dev->net, "no PHY found\n");
> -		return -EIO;
> +		if (dev->chipid == ID_REV_CHIP_ID_7801_) {
> +			u32 buf;
> +
> +			netdev_info(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
> +			phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
> +						    NULL);
> +			if (IS_ERR(phydev)) {
> +				netdev_err(dev->net, "No PHY/fixed_PHY found\n");
> +				return -ENODEV;
> +			}
> +			netdev_info(dev->net, "Registered FIXED PHY\n");

There are too many netdev_info() messages here. Maybe make them both
netdev_dbg().

> +			dev->interface = PHY_INTERFACE_MODE_RGMII;
> +			dev->fixedphy = phydev;

You can use 

if (!phy_is_pseudo_fixed_link(phydev))

to determine if a PHY is a fixed PHY. I think you can then do without
dev->fixedphy.

> +			ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
> +						MAC_RGMII_ID_TXC_DELAY_EN_);
> +			ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
> +			ret = lan78xx_read_reg(dev, HW_CFG, &buf);
> +			buf |= HW_CFG_CLK125_EN_;
> +			buf |= HW_CFG_REFCLK25_EN_;
> +			ret = lan78xx_write_reg(dev, HW_CFG, buf);
> +			goto phyinit;

Please don't use a goto like this. Maybe turn this into a switch statement?

> +		} else {
> +			netdev_err(dev->net, "no PHY found\n");
> +			return -EIO;
> +		}
>  	}
>  
>  	if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
> @@ -2105,6 +2134,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
>  		goto error;

Please take a look at what happens at error: It does not look
correct. Probably now is a good time to refactor the whole of lan78xx_phy_init()

	 Andrew

^ permalink raw reply

* Re: [PATCH bpf-next v3 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Edward Cree @ 2018-04-23 12:25 UTC (permalink / raw)
  To: Yonghong Song, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-5-yhs@fb.com>

On 20/04/18 23:18, Yonghong Song wrote:
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 3c8bb92..01c215d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2975,6 +2975,32 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
>  		/* We may learn something more from the var_off */
>  		__update_reg_bounds(dst_reg);
>  		break;
> +	case BPF_ARSH:
> +		if (umax_val >= insn_bitness) {
> +			/* Shifts greater than 31 or 63 are undefined.
> +			 * This includes shifts by a negative number.
> +			 */
> +			mark_reg_unknown(env, regs, insn->dst_reg);
> +			break;
> +		}
> +		if (dst_reg->smin_value < 0)
> +			dst_reg->smin_value >>= umin_val;
> +		else
> +			dst_reg->smin_value >>= umax_val;
> +		if (dst_reg->smax_value < 0)
> +			dst_reg->smax_value >>= umax_val;
> +		else
> +			dst_reg->smax_value >>= umin_val;
> +		if (src_known)
> +			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
> +						       umin_val);
tnum_rshift is an unsigned shift, it won't do what you want here.
I think you could write a tnum_arshift that looks something like this
 (UNTESTED!):

    struct tnum tnum_arshift(struct tnum a, u8 shift)
    {
        return TNUM(((s64)a.value) >> shift, ((s64)a.mask) >> shift);
    }
Theory: if value sign bit is 1 then number is known negative so populate
 upper bits with known 1s.  If mask sign bit is 1 then number might be
 negative so populate upper bits with unknown.  Otherwise, number is
 known positive so populate upper bits with known 0s.

> +		else
> +			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
Applying the above here, tnum_arshift(tnum_unknown, ...) would always just
 return tnum_unknown, so just do "dst_reg->var_off = tnum_unknown;".
The reason for the corresponding logic in the BPF_RSH case is that a right
 logical shift _always_ populates upper bits with zeroes.
In any case these 'else' branches are currently never taken because they
 fall foul of the check Alexei added just before the switch,
    if (!src_known &&
        opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
        __mark_reg_unknown(dst_reg);
        return 0;
    }
So I can guarantee you haven't tested this code :-)

> +		dst_reg->umin_value >>= umax_val;
> +		dst_reg->umax_value >>= umin_val;
FWIW I think the way to handle umin/umax here is to blow them away and
 just rely on inferring new ubounds from the sbounds (i.e. the inverse of
 what we do just above in case BPF_RSH) since BPF_ARSH is essentially an
 operation on the signed value.  I don't think there is a need to support
 cases where the unsigned bounds of a signed shift of a value that may
 cross the sign boundary at (1<<63) are needed to verify a program.
(Unlike in the unsigned shift case, it is at least _possible_ for there to
 be information from the ubounds that we can't get from the sbounds - but
 it's a contrived case that isn't likely to be useful in real programs.)

-Ed
> +		/* We may learn something more from the var_off */
> +		__update_reg_bounds(dst_reg);
> +		break;
>  	default:
>  		mark_reg_unknown(env, regs, insn->dst_reg);
>  		break;

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox