Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-07 17:50 UTC (permalink / raw)
  To: netdev; +Cc: Song Liu, kernel-team, qinteng, tobin
In-Reply-To: <20180507175049.1541963-1-songliubraving@fb.com>

This new test captures stackmap with build_id with hardware event
PERF_COUNT_HW_CPU_CYCLES.

Because we only support one ips-to-build_id lookup per cpu in NMI
context, stack_amap will not be able to do the lookup in this test.
Therefore, we didn't do compare_stack_ips(), as it will always fail.

urandom_read.c is extended to run configurable cycles so that it can be
caught by the perf event.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c   | 134 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/urandom_read.c |  10 ++-
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index aa336f0..5d2d8ef 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1272,6 +1272,139 @@ static void test_stacktrace_build_id(void)
 	return;
 }
 
+static void test_stacktrace_build_id_nmi(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+	const char *file = "./test_stacktrace_build_id.o";
+	int err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {
+		.sample_freq = 5000,
+		.freq = 1,
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	__u32 key, previous_key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+	int i, j;
+	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+	int build_id_matches = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		return;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open",
+		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+	       == 0);
+	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = extract_build_id(buf, 256);
+
+	if (CHECK(err, "get build_id with readelf",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+	if (CHECK(err, "get_next_key from stackmap",
+		  "err %d, errno %d\n", err, errno))
+		goto disable_pmu;
+
+	do {
+		char build_id[64];
+
+		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+		if (CHECK(err, "lookup_elem from stackmap",
+			  "err %d, errno %d\n", err, errno))
+			goto disable_pmu;
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+			    id_offs[i].offset != 0) {
+				for (j = 0; j < 20; ++j)
+					sprintf(build_id + 2 * j, "%02x",
+						id_offs[i].build_id[j] & 0xff);
+				if (strstr(buf, build_id) != NULL)
+					build_id_matches = 1;
+			}
+		previous_key = key;
+	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map\n"))
+		goto disable_pmu;
+
+	/*
+	 * We intentionally skip compare_stack_ips(). This is because we
+	 * only support one in_nmi() ips-to-build_id translation per cpu
+	 * at any time, thus stack_amap here will always fallback to
+	 * BPF_STACK_BUILD_ID_IP;
+	 */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+}
+
 #define MAX_CNT_RAWTP	10ull
 #define MAX_STACK_RAWTP	100
 struct get_stack_trace_t {
@@ -1425,6 +1558,7 @@ int main(void)
 	test_tp_attach_query();
 	test_stacktrace_map();
 	test_stacktrace_build_id();
+	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
 
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
index 4acfdeb..9de8b7c 100644
--- a/tools/testing/selftests/bpf/urandom_read.c
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -6,15 +6,21 @@
 #include <stdlib.h>
 
 #define BUF_SIZE 256
-int main(void)
+
+int main(int argc, char *argv[])
 {
 	int fd = open("/dev/urandom", O_RDONLY);
 	int i;
 	char buf[BUF_SIZE];
+	int count = 4;
 
 	if (fd < 0)
 		return 1;
-	for (i = 0; i < 4; ++i)
+
+	if (argc == 2)
+		count = atoi(argv[1]);
+
+	for (i = 0; i < count; ++i)
 		read(fd, buf, BUF_SIZE);
 
 	close(fd);
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 1/2] bpf: enable stackmap with build_id in nmi context
From: Song Liu @ 2018-05-07 17:50 UTC (permalink / raw)
  To: netdev
  Cc: Song Liu, kernel-team, qinteng, tobin, Alexei Starovoitov,
	Daniel Borkmann, Peter Zijlstra
In-Reply-To: <20180507175049.1541963-1-songliubraving@fb.com>

Currently, we cannot parse build_id in nmi context because of
up_read(&current->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in use, we
fall back to reporting only ips.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 init/Kconfig          |  1 +
 kernel/bpf/stackmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index f013afc..480a4f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1391,6 +1391,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select ANON_INODES
 	select BPF
+	select IRQ_WORK
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate eBPF
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b..b59ace0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work irq_work;
+	struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work;
+
+	work = container_of(entry, struct stack_map_irq_work, irq_work);
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
+	bool in_nmi_ctx = in_nmi();
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work;
+
+	if (in_nmi_ctx) {
+		work = this_cpu_ptr(&up_read_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!in_nmi_ctx) {
+		up_read(&current->mm->mmap_sem);
+	} else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->irq_work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&up_read_work, cpu);
+		init_irq_work(&work->irq_work, do_up_read);
+	}
+	return 0;
+}
+subsys_initcall(stack_map_init);
-- 
2.9.5

^ permalink raw reply related

* Re: [net-next PATCH v2 0/8] UDP GSO Segmentation clean-ups
From: Alexander Duyck @ 2018-05-07 18:02 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-LAF8s+GXbGQxAHx1ptw73Aj3pr3ipMzRhmsnmwSczZ5Q@mail.gmail.com>

On Sat, May 5, 2018 at 3:06 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Fri, May 4, 2018 at 8:28 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> This patch set addresses a number of issues I found while sorting out
>> enabling UDP GSO Segmentation support for ixgbe/ixgbevf. Specifically there
>> were a number of issues related to the checksum and such that seemed to
>> cause either minor irregularities or kernel panics in the case of the
>> offload request being allowed to traverse between name spaces.
>
> Were you able to traverse GSO packets between network namespace before
> adding to NETIF_F_GSO_SOFTWARE? It does appear that veth includes
> NETIF_F_GSO_ENCAP_ALL, which also allows GSO.

Without that change the tunnel wouldn't pass the requests between
namespaces. However with it I was able to easily test the software
checksum code as otherwise the socket was returning EIO when the
hardware checksum was disabled.

> In either case, it should not be possible for GSO packets to arrive on a veth
> device, as that can result in queuing the GSO packet to a recipient socket.
> In this regard veth is like loopback and must exclude GSO support.
>
> I'll take a look.

I suspect it was probably sending veth UDP segmentation offload
requests. For now I can probably drop the patch that was adding it and
it can be added later to individual drivers if needed.

Thanks.

- Alex

^ permalink raw reply

* [net-next PATCH v3 0/6] Series short description
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem

This patch set addresses a number of issues I found while sorting out
enabling UDP GSO Segmentation support for ixgbe/ixgbevf. Specifically there
were a number of issues related to the checksum and such that seemed to
cause either minor irregularities or kernel panics in the case of the
offload request being allowed to traverse between name spaces.

With this set applied I was able to get UDP GSO traffic to pass over
vxlan tunnels in both offloaded modes and non-offloaded modes for ixgbe and
ixgbevf.

I submitted the driver specific patches earlier as an RFC:
https://patchwork.ozlabs.org/project/netdev/list/?series=42477&archive=both&state=*

v2: Updated patches based on feedback from Eric Dumazet
    Split first patch into several patches based on feedback from Eric
v3: Drop patch that was calling pskb_may_pull as it was redundant.
    Added code to use MANGLED_0 in case of UDP checksum
    Drop patch adding NETIF_F_GSO_UDP_L4 to list of GSO software offloads
    Added Acked-by for patches reviewed by Willem and not changed

---

Alexander Duyck (6):
      udp: Record gso_segs when supporting UDP segmentation offload
      udp: Do not pass MSS as parameter to GSO segmentation
      udp: Do not pass checksum as a parameter to GSO segmentation
      udp: Partially unroll handling of first segment and last segment
      udp: Add support for software checksum and GSO_PARTIAL with GSO offload
      udp: Do not copy destructor if one is not present


 include/net/udp.h      |    3 +-
 net/ipv4/udp.c         |    2 +
 net/ipv4/udp_offload.c |   94 +++++++++++++++++++++++++++++++-----------------
 net/ipv6/udp_offload.c |   16 +-------
 4 files changed, 64 insertions(+), 51 deletions(-)

^ permalink raw reply

* [net-next PATCH v3 1/6] udp: Record gso_segs when supporting UDP segmentation offload
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

We need to record the number of segments that will be generated when this
frame is segmented. The expectation is that if gso_size is set then
gso_segs is set as well. Without this some drivers such as ixgbe get
confused if they attempt to offload this as they record 0 segments for the
entire packet instead of the correct value.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dd3102a37ef9..e07db83b311e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -793,6 +793,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
+							 cork->gso_size);
 		goto csum_partial;
 	}
 

^ permalink raw reply related

* [net-next PATCH v3 2/6] udp: Do not pass MSS as parameter to GSO segmentation
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

There is no point in passing MSS as a parameter for the GSO
segmentation call as it is already available via the shared info for the
skb itself.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 include/net/udp.h      |    2 +-
 net/ipv4/udp_offload.c |    6 ++++--
 net/ipv6/udp_offload.c |    2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 05990746810e..8bd83b044ecd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -176,7 +176,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check);
+				  __sum16 check);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 006257092f06..c1afcd2f1a76 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -189,14 +189,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check)
+				  __sum16 check)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
 	unsigned int hdrlen;
 	struct udphdr *uh;
+	unsigned int mss;
 
+	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
@@ -244,7 +246,7 @@ static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v4_check(sizeof(struct udphdr) + mss,
 					      iph->saddr, iph->daddr, 0));
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index f7b85b1e6b3e..dea03ec09715 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -26,7 +26,7 @@ static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v6_check(sizeof(struct udphdr) + mss,
 					      &ip6h->saddr, &ip6h->daddr, 0));
 }

^ permalink raw reply related

* [net-next PATCH v3 3/6] udp: Do not pass checksum as a parameter to GSO segmentation
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch is meant to allow us to avoid having to recompute the checksum
from scratch and have it passed as a parameter.

Instead of taking that approach we can take advantage of the fact that the
length that was used to compute the existing checksum is included in the
UDP header.

Finally to avoid the need to invert the result we can just call csum16_add
and csum16_sub directly. By doing this we can avoid a number of
instructions in the loop that is handling segmentation.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 include/net/udp.h      |    3 +--
 net/ipv4/udp_offload.c |   32 ++++++++++++++++++--------------
 net/ipv6/udp_offload.c |    7 +------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8bd83b044ecd..9289b6425032 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -175,8 +175,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check);
+				  netdev_features_t features);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index c1afcd2f1a76..92c182e99ddc 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -188,8 +188,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check)
+				  netdev_features_t features)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
@@ -197,6 +196,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
+	__sum16 check;
+	__be16 newlen;
 
 	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
@@ -215,17 +216,25 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	uh = udp_hdr(segs);
+
+	/* compute checksum adjustment based on old length versus new */
+	newlen = htons(sizeof(*uh) + mss);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
 	for (seg = segs; seg; seg = seg->next) {
 		uh = udp_hdr(seg);
-		uh->len = htons(seg->len - hdrlen);
-		uh->check = check;
 
 		/* last packet can be partial gso_size */
-		if (!seg->next)
-			csum_replace2(&uh->check, htons(mss),
-				      htons(seg->len - hdrlen - sizeof(*uh)));
+		if (!seg->next) {
+			newlen = htons(seg->len - hdrlen);
+			check = csum16_add(csum16_sub(uh->check, uh->len),
+					   newlen);
+		}
+
+		uh->len = newlen;
+		uh->check = check;
 
-		uh->check = ~uh->check;
 		seg->destructor = sock_wfree;
 		seg->sk = sk;
 		sum_truesize += seg->truesize;
@@ -240,15 +249,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct iphdr *iph = ip_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v4_check(sizeof(struct udphdr) + mss,
-					      iph->saddr, iph->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index dea03ec09715..61e34f1d2fa2 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -20,15 +20,10 @@
 static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v6_check(sizeof(struct udphdr) + mss,
-					      &ip6h->saddr, &ip6h->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,

^ permalink raw reply related

* [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch allows us to take care of unrolling the first segment and the
last segment of the loop for processing the segmented skb. Part of the
motivation for this is that it makes it easier to process the fact that the
first frame and all of the frames in between should be mostly identical
in terms of header data, and the last frame has differences in the length
and partial checksum.

In addition I am dropping the header length calculation since we don't
really need it for anything but the last frame and it can be easily
obtained by just pulling the data_len and offset of tail from the transport
header.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92c182e99ddc..b15c78ac3f23 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -193,7 +193,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
-	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
 	__sum16 check;
@@ -203,7 +202,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
-	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
@@ -216,30 +214,37 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
-	uh = udp_hdr(segs);
+	seg = segs;
+	uh = udp_hdr(seg);
 
 	/* compute checksum adjustment based on old length versus new */
 	newlen = htons(sizeof(*uh) + mss);
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
-	for (seg = segs; seg; seg = seg->next) {
-		uh = udp_hdr(seg);
+	for (;;) {
+		seg->destructor = sock_wfree;
+		seg->sk = sk;
+		sum_truesize += seg->truesize;
 
-		/* last packet can be partial gso_size */
-		if (!seg->next) {
-			newlen = htons(seg->len - hdrlen);
-			check = csum16_add(csum16_sub(uh->check, uh->len),
-					   newlen);
-		}
+		if (!seg->next)
+			break;
 
 		uh->len = newlen;
 		uh->check = check;
 
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		seg = seg->next;
+		uh = udp_hdr(seg);
 	}
 
+	/* last packet can be partial gso_size, account for that in checksum */
+	newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+		       seg->data_len);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+	uh->len = newlen;
+	uh->check = check;
+
+	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
 	return segs;

^ permalink raw reply related

* [net-next PATCH v3 5/6] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for a software provided checksum and GSO_PARTIAL
segmentation support. With this we can offload UDP segmentation on devices
that only have partial support for tunnels.

Since we are no longer needing the hardware checksum we can drop the checks
in the segmentation code that were verifying if it was present.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   29 +++++++++++++++++++----------
 net/ipv6/udp_offload.c |   11 +----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b15c78ac3f23..d4f2daca0c33 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -214,6 +214,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	/* GSO partial and frag_list segmentation only requires splitting
+	 * the frame into an MSS multiple and possibly a remainder, both
+	 * cases return a GSO skb. So update the mss now.
+	 */
+	if (skb_is_gso(segs))
+		mss *= skb_shinfo(segs)->gso_segs;
+
 	seg = segs;
 	uh = udp_hdr(seg);
 
@@ -232,6 +239,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->len = newlen;
 		uh->check = check;
 
+		if (seg->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(seg, ~check);
+		else
+			uh->check = gso_make_checksum(seg, ~check) ? :
+				    CSUM_MANGLED_0;
+
 		seg = seg->next;
 		uh = udp_hdr(seg);
 	}
@@ -244,6 +257,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	uh->len = newlen;
 	uh->check = check;
 
+	if (seg->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(seg, ~check);
+	else
+		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
 	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
@@ -251,15 +269,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
 
-static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -283,7 +292,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		goto out;
 
 	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-		return __udp4_gso_segment(skb, features);
+		return __udp_gso_segment(skb, features);
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 61e34f1d2fa2..03a2ff3fe1e6 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,15 +17,6 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -58,7 +49,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 			goto out;
 
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-			return __udp6_gso_segment(skb, features);
+			return __udp_gso_segment(skb, features);
 
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.

^ permalink raw reply related

* [net-next PATCH v3 6/6] udp: Do not copy destructor if one is not present
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch makes it so that if a destructor is not present we avoid trying
to update the skb socket or any reference counting that would be associated
with the NULL socket and/or destructor. By doing this we can support
traffic coming from another namespace without any issues.

Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index d4f2daca0c33..ede2a7305b90 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -195,6 +195,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sk_buff *segs, *seg;
 	struct udphdr *uh;
 	unsigned int mss;
+	bool copy_dtor;
 	__sum16 check;
 	__be16 newlen;
 
@@ -205,12 +206,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
-	WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
-	gso_skb->destructor = NULL;
+	copy_dtor = gso_skb->destructor == sock_wfree;
+	if (copy_dtor)
+		gso_skb->destructor = NULL;
 
 	segs = skb_segment(gso_skb, features);
 	if (unlikely(IS_ERR_OR_NULL(segs))) {
-		gso_skb->destructor = sock_wfree;
+		if (copy_dtor)
+			gso_skb->destructor = sock_wfree;
 		return segs;
 	}
 
@@ -229,9 +232,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
 	for (;;) {
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		if (copy_dtor) {
+			seg->destructor = sock_wfree;
+			seg->sk = sk;
+			sum_truesize += seg->truesize;
+		}
 
 		if (!seg->next)
 			break;
@@ -263,8 +268,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
 
 	/* update refcount for the packet */
-	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
-
+	if (copy_dtor)
+		refcount_add(sum_truesize - gso_skb->truesize,
+			     &sk->sk_wmem_alloc);
 	return segs;
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);

^ permalink raw reply related

* Re: [PATCH v2] net: dsa: drop some VLAs in switch.c
From: Florian Fainelli @ 2018-05-07 18:14 UTC (permalink / raw)
  To: Salvatore Mesoraca, Andrew Lunn
  Cc: linux-kernel, kernel-hardening, netdev, David S. Miller,
	Kees Cook, Vivien Didelot, David Laight
In-Reply-To: <1525706596-13601-1-git-send-email-s.mesoraca16@gmail.com>

On 05/07/2018 08:23 AM, Salvatore Mesoraca wrote:
> We avoid 2 VLAs by using a pre-allocated field in dsa_switch.
> We also try to avoid dynamic allocation whenever possible.
> 
> Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
> Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch
> 
> Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
> ---
>  include/net/dsa.h |  3 +++
>  net/dsa/dsa2.c    | 14 ++++++++++++++
>  net/dsa/switch.c  | 22 ++++++++++------------
>  3 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/dsa.h b/include/net/dsa.h
> index 60fb4ec..576791d 100644
> --- a/include/net/dsa.h
> +++ b/include/net/dsa.h
> @@ -256,6 +256,9 @@ struct dsa_switch {
>  	/* Number of switch port queues */
>  	unsigned int		num_tx_queues;
>  
> +	unsigned long		*bitmap;
> +	unsigned long		_bitmap;
> +
>  	/* Dynamically allocated ports, keep last */
>  	size_t num_ports;
>  	struct dsa_port ports[];
> diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
> index adf50fb..cebf35f0 100644
> --- a/net/dsa/dsa2.c
> +++ b/net/dsa/dsa2.c
> @@ -748,6 +748,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
>  	if (!ds)
>  		return NULL;
>  
> +	/* We avoid allocating memory outside dsa_switch
> +	 * if it is not needed.
> +	 */
> +	if (n <= sizeof(ds->_bitmap) * 8) {
> +		ds->bitmap = &ds->_bitmap;

Should not this be / BITS_PER_BYTE? If the sizeof(unsigned long) is <=
8, then you don't need to allocate it, otherwise, you have to.

I would actually just always dynamically allocate the bitmap, optimizing
for the case where we have 8 or fewer ports is not worth it IMHO.
-- 
Florian

^ permalink raw reply

* Re: [PATCH bpf-next] xsk: fix 64-bit division
From: Randy Dunlap @ 2018-05-07 18:25 UTC (permalink / raw)
  To: Björn Töpel, ast, daniel, netdev
  Cc: Björn Töpel, Stephen Rothwell, magnus.karlsson,
	linux-next, linux-kernel
In-Reply-To: <20180507174350.3534-1-bjorn.topel@gmail.com>

On 05/07/2018 10:43 AM, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> i386 builds report:
>   net/xdp/xdp_umem.o: In function `xdp_umem_reg':
>   xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3'
> 
> This fix uses div_u64 instead of the GCC built-in.
> 
> Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

I don't know why the subject says xsk (instead of xdp), but anyway:

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>

Thanks.

> ---
>  net/xdp/xdp_umem.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 881dfdefe235..2b47a1dd7c6c 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -209,7 +209,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>  	if ((addr + size) < addr)
>  		return -EINVAL;
>  
> -	nframes = size / frame_size;
> +	nframes = (unsigned int)div_u64(size, frame_size);
>  	if (nframes == 0 || nframes > UINT_MAX)
>  		return -EINVAL;
>  
> 


-- 
~Randy

^ permalink raw reply

* Re: [net-next PATCH v3 6/6] udp: Do not copy destructor if one is not present
From: Eric Dumazet @ 2018-05-07 18:25 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem
In-Reply-To: <20180507180852.3486.27748.stgit@localhost.localdomain>



On 05/07/2018 11:08 AM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch makes it so that if a destructor is not present we avoid trying
> to update the skb socket or any reference counting that would be associated
> with the NULL socket and/or descriptor. By doing this we can support
> traffic coming from another namespace without any issues.
> 
> Acked-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Eric Dumazet @ 2018-05-07 18:27 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem
In-Reply-To: <20180507180840.3486.67728.stgit@localhost.localdomain>



On 05/07/2018 11:08 AM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch allows us to take care of unrolling the first segment and the
> last segment of the loop for processing the segmented skb. Part of the
> motivation for this is that it makes it easier to process the fact that the
> first frame and all of the frames in between should be mostly identical
> in terms of header data, and the last frame has differences in the length
> and partial checksum.
> 
> In addition I am dropping the header length calculation since we don't
> really need it for anything but the last frame and it can be easily
> obtained by just pulling the data_len and offset of tail from the transport
> header.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH net-next v8 1/7] sched: Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-05-07 18:37 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <CAM_iQpV4qThp2+_sKC7ZBgUCUu0BSZvOkT0tvf2ATdMFgu+YPg@mail.gmail.com>

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Fri, May 4, 2018 at 12:10 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> Thank you for the review! A few comments below, I'll fix the rest.
>>
>>> [...]
>>>
>>> So sch_cake doesn't accept normal tc filters? Is this intentional?
>>> If so, why?
>>
>> For two reasons:
>>
>> - The two-level scheduling used in CAKE (tins / diffserv classes, and
>>   flow hashing) does not map in an obvious way to the classification
>>   index of tc filters.
>
> Sounds like you need to extend struct tcf_result?

Well, the obvious way to support filters would be to have skb->priority
override the diffserv mapping if set, and have the filter classification
result select the queue within that tier. That would probably be doable,
but see below.

>> - No one has asked for it. We have done our best to accommodate the
>>   features people want in a home router qdisc directly in CAKE, and the
>>   ability to integrate tc filters has never been requested.
>
> It is not hard to integrate, basically you need to call
> tcf_classify(). Although it is not mandatory, it is odd to merge a
> qdisc that doesn't work with existing tc filters (and actions too).

I looked at the fq_codel code to do this. Is it possible to support
filtering without implementing Qdisc_class_ops? If so, I'll give it a
shot; but implementing the class ops is more than I can commit to...

>>>> +static int cake_init(struct Qdisc *sch, struct nlattr *opt,
>>>> +                    struct netlink_ext_ack *extack)
>>>> +{
>>>> +       struct cake_sched_data *q = qdisc_priv(sch);
>>>> +       int i, j;
>>>> +
>>>> +       sch->limit = 10240;
>>>> +       q->tin_mode = CAKE_DIFFSERV_BESTEFFORT;
>>>> +       q->flow_mode  = CAKE_FLOW_TRIPLE;
>>>> +
>>>> +       q->rate_bps = 0; /* unlimited by default */
>>>> +
>>>> +       q->interval = 100000; /* 100ms default */
>>>> +       q->target   =   5000; /* 5ms: codel RFC argues
>>>> +                              * for 5 to 10% of interval
>>>> +                              */
>>>> +
>>>> +       q->cur_tin = 0;
>>>> +       q->cur_flow  = 0;
>>>> +
>>>> +       if (opt) {
>>>> +               int err = cake_change(sch, opt, extack);
>>>> +
>>>> +               if (err)
>>>> +                       return err;
>>>
>>>
>>> Not sure if you really want to reallocate q->tins below for this
>>> case.
>>
>> I'm not sure what you mean here? If there's an error we return it and
>> the qdisc is not created. If there's not, we allocate and on subsequent
>> changes cake_change() will be called directly, or? Can the init function
>> ever be called again during the lifetime of the qdisc?
>>
>
> In non-error case, you call cake_change() first and then allocate
> ->tins with kvzalloc() below. For me it looks like you don't need to
> allocate it again when ->tins!=NULL.

No, we definitely don't. It's just not clear to me how cake_init() could
ever be called with q->tins already allocated?

I can add a check in any case, though, I see that there is one in
fq_codel as well...

-Toke

^ permalink raw reply

* Re: [PATCH v2 net-next 1/4] umh: introduce fork_usermode_blob() helper
From: Luis R. Rodriguez @ 2018-05-07 18:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Paris, Matthew Auld, Josh Triplett,
	Kirill A. Shutemov, Joonas Lahtinen, Chris Wilson,
	Stephen Smalley, Eric W. Biederman, Mimi Zohar
  Cc: Luis R. Rodriguez, Alexei Starovoitov, davem, daniel, torvalds,
	gregkh, luto, netdev, linux-kernel, kernel-team, Al Viro,
	David Howells, Kees Cook, Andrew Morton, Dominik Brodowski,
	Hugh Dickins, Jann Horn, Jani Nikula, Joonas Lahtinen,
	Rodrigo Vivi, David Airlie, Rafael J. Wysocki,
	Linux FS Devel <linux-fsd
In-Reply-To: <20180505013710.2fb2k6e5uotd22kr@ast-mbp>

On Fri, May 04, 2018 at 06:37:11PM -0700, Alexei Starovoitov wrote:
> On Fri, May 04, 2018 at 07:56:43PM +0000, Luis R. Rodriguez wrote:
> > What a mighty short list of reviewers. Adding some more. My review below.
> > I'd appreciate a Cc on future versions of these patches.
> 
> sure.
> 
> > On Wed, May 02, 2018 at 09:36:01PM -0700, Alexei Starovoitov wrote:
> > > Introduce helper:
> > > int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> > > struct umh_info {
> > >        struct file *pipe_to_umh;
> > >        struct file *pipe_from_umh;
> > >        pid_t pid;
> > > };
> > > 
> > > that GPLed kernel modules (signed or unsigned) can use it to execute part
> > > of its own data as swappable user mode process.
> > > 
> > > The kernel will do:
> > > - mount "tmpfs"
> > 
> > Actually it's a *shared* vfsmount tmpfs for all umh blobs.
> 
> yep

OK just note CONFIG_TMPFS can be disabled, and likewise for CONFIG_SHMEM,
in which case tmpfs and shmem are replaced by a simple ramfs code, more
appropriate for systems without swap.

> > > +static struct vfsmount *umh_fs;
> > > +
> > > +static int init_tmpfs(void)
> > 
> > Please use umh_init_tmpfs(). 
> 
> ok
> 
> > Also see init/main.c do_basic_setup() which calls
> > usermodehelper_enable() prior to do_initcalls(). Now, fortunately TMPFS is only
> > bool, saving us from some races and we do call tmpfs's init first shmem_init():
> > 
> > static void __init do_basic_setup(void)
> > {
> > 	cpuset_init_smp();
> > 	shmem_init();
> > 	driver_init();
> > 	init_irq_proc();
> > 	do_ctors();
> > 	usermodehelper_enable();
> > 	do_initcalls();
> > }
> > 
> > But it also means we're enabling your new call fork_usermode_blob() on
> > early init code even if we're not setup. Since this umh tmpfs vfsmount is
> > shared I'd say just call this init right before usermodehelper_enable()
> > on do_basic_setup().
> 
> Not following.
> Why init_tmpfs() should be called by __init function?

Nope, not at all, I was suggesting:

diff --git a/init/main.c b/init/main.c
index 0697284a28ee..67a48fbd96ca 100644
--- a/init/main.c
+++ b/init/main.c
@@ -973,6 +973,7 @@ static void __init do_basic_setup(void)
 	driver_init();
 	init_irq_proc();
 	do_ctors();
+	umh_init_tmpfs();
 	usermodehelper_enable();
 	do_initcalls();
 }

Mainly to avoid the locking situation Jann Horn noted, and also provide
proper kernel ordering expectations.

> Are you saying make 'static struct vfsmount *shm_mnt;'
> global and use it here? so no init_tmpfs() necessary?
> I think that can work, but feels that having two
> tmpfs mounts (one for shmem and one for umh) is cleaner.

No, but now that you mention it... if a shared vfsmount is not used the
/sys/kernel/mm/transparent_hugepage/shmem_enabled knob for using huge pages
would not be followed for umh modules. For the i915 driver this was *why*
they ended up adding shmem_file_setup_with_mnt(), they wanted huge pages to
support huge-gtt-pages. What is the rationale behind umh.c using it for
umh modules?

Users of shmem_kernel_file_setup() spawned later out of the desire to
*avoid* LSMs since it didn't make sense in their case as their inodes
are never exposed to userspace. Such is the case for ipc/shm.c and
security/keys/big_key.c. Refer to commit c7277090927a5 ("security: shmem:
implement kernel private shmem inodes") and then commit e1832f2923ec9
("ipc: use private shmem or hugetlbfs inodes for shm segments").

In this new umh usermode modules case we are:

  a) actually mapping data already extracted by the kernel somehow from
     a file, presumably from somewhere under the /lib/modules/ path, but
     again this is not visible to umh.c, as it just gets called with:

	fork_usermode_blob(void *data, size_t len, struct umh_info *info)

  b) Creating the respective tmpfs file with shmem_file_setup_with_mnt()
     on our own shared struct vfsmount *umh_fs.

  c) Populating the file created and stuffing it with our data passed

  d) Calling do_execve_file() on it.

It's not clear to me why you used shmem_file_setup_with_mnt() in this case. What
are the gains? It would make sense to use shmem_kernel_file_setup() to avoid an
LSM check on step b) *but* only if we already had a proper LSM check on step
a).

I checked how you use fork_usermode_blob() in a) and in this case the kernel
module bpfilter would be loaded first, and for that we already have an LSM
check / hook for that module. From my review then, the magic done on your
second patch to stuff the userspace application into the module should be
irrelevant to us from an LSM perspective.

So, I can see a reason to use shmem_kernel_file_setup() but I cannot
see a reason to be using shmem_file_setup_with_mnt() at the moment.

I Cc'd tmpfs and LSM folks for further feedback.

> > > +{
> > > +	size_t offset = 0;
> > > +	int err;
> > > +
> > > +	do {
> > > +		unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
> > > +		struct page *page;
> > > +		void *pgdata, *vaddr;
> > > +
> > > +		err = pagecache_write_begin(file, file->f_mapping, offset, len,
> > > +					    0, &page, &pgdata);
> > > +		if (err < 0)
> > > +			goto fail;
> > > +
> > > +		vaddr = kmap(page);
> > > +		memcpy(vaddr, data, len);
> > > +		kunmap(page);
> > > +
> > > +		err = pagecache_write_end(file, file->f_mapping, offset, len,
> > > +					  len, page, pgdata);
> > > +		if (err < 0)
> > > +			goto fail;
> > > +
> > > +		size -= len;
> > > +		data += len;
> > > +		offset += len;
> > > +	} while (size);
> > 
> > Character for character, this looks like a wonderful copy and paste from
> > i915_gem_object_create_from_data()'s own loop which does the same exact
> > thing. Perhaps it's time for a helper on mm/filemap.c with an export so
> > if a bug is fixed in one place it's fixed in both places.
> 
> yes, of course, but not right now.
> Once it all lands that will be the time to create common helper.
> It's not a good idea to mess with i915 in one patch set.

Either way works with me, so long as it's done.

> > > +int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
> > 
> > Please use umh_fork_blob()
> 
> sorry, no. fork_usermode_blob() is much more descriptive name.

Prefixing new umh.c symbols with umh_*() makes it very clear this came from
kernel/umh.c functionality. I've been dealing with other places in the kernel
that have conflated their own use of kernel/umh.c functionality with what they
expose in both code and documentation, and correcting this has taken a long
time. Best avoid future confusion and be consistent with new exported symbols
for umh.c functionality.

Also, descriptive as fork_usermode_blob() may seem there is a possible future
clash with a more generic call.

> > Also, can you extend lib/test_kmod.c with a test case for this with its own
> > demo and try to blow it up?
> 
> in what sense? bpfilter is the test and the driving component for it.

That's the thing, it shouldn't be.

We are adding *new* functionality here, I don't want to require enabling
bpfilter or its dependencies to test generic umh user module loading
functionality.  For instance, we have lib/test_module.c to help test generic
module loading, regardless of the functionality or requirements for other
modules.

> I'm expecting that folks who want to use this helper to do usb drivers
> in user space may want to extend this helper further, but that's their job.

I don't even want to test USB, I am just interested in the *very* *very*
basic aspects of it. A simple lib/test_umh_module.c would do with a respective
check that its loaded, given this is a requirement from the API. It also helps
folks understand the core basic knobs without having to look at bpfilter code.

If we're going to get this merged I'd be interested in doing ongoing testing
with 0day with a simple UMH module with and without CONFIG_SHMEM for instance
and check that it works in both cases.

Testing this may seem irrelevant to you but keep in mind we're also already
testing just general kernel module loading. As silly as it may seem, adding new
functionality and a respective test case lets us try to avoid regressions, and
provide small unit tests to help reproduce issues and corner case situations
you may not be considering.

  Luis

^ permalink raw reply related

* [PATCH net-next v3 0/2] socket statistics for ss
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
  To: davem, gerrit, kuznet, yoshfuji; +Cc: netdev, dccp, Stephen Hemminger

The eventual goal is to remove all accesses to slab statistics
by iproute2 ss command. These are preliminary steps to add two
statistics which are not working now, because of slab merging
and changes that were done years ago to allow for common inet_hash
tables.

Stephen Hemminger (2):
  inet: add bound ports statistic
  socket: keep track of the number of sockets allocated

v3
  - fix build :-(
  - add allocated sockets statistic

 include/net/inet_hashtables.h    |  3 +++
 include/net/inet_timewait_sock.h |  2 ++
 net/dccp/proto.c                 |  1 +
 net/ipv4/inet_hashtables.c       | 22 +++++++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |  8 +++++---
 net/ipv4/proc.c                  |  5 +++--
 net/ipv4/tcp.c                   |  1 +
 net/socket.c                     | 21 +++++++++++++++++++--
 8 files changed, 53 insertions(+), 10 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH net-next  v3 1/2] inet: add bound ports statistic
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
  To: davem, gerrit, kuznet, yoshfuji
  Cc: netdev, dccp, Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180507184333.32688-1-sthemmin@microsoft.com>

This adds a count of bound ports, which fixes the socket summary
command.  The ss -s output has been broken since changes to slab info
and this is one way to recover the missing value by adding a
field onto /proc/net/sockstat.

Since this is an informational value only, there is no need
for locking.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 include/net/inet_hashtables.h    |  3 +++
 include/net/inet_timewait_sock.h |  2 ++
 net/dccp/proto.c                 |  1 +
 net/ipv4/inet_hashtables.c       | 22 +++++++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |  8 +++++---
 net/ipv4/proc.c                  |  5 +++--
 net/ipv4/tcp.c                   |  1 +
 7 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..b02524e2571e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -103,6 +103,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
+	unsigned int		count;
 	struct hlist_head	chain;
 };
 
@@ -193,7 +194,9 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 			struct inet_bind_hashbucket *head,
 			const unsigned short snum);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+			      struct inet_bind_hashbucket *head,
 			      struct inet_bind_bucket *tb);
+unsigned int inet_bind_bucket_count(const struct proto *prot);
 
 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 			       const u32 bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..4cdb8034ad80 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -87,7 +87,9 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 void inet_twsk_free(struct inet_timewait_sock *tw);
 void inet_twsk_put(struct inet_timewait_sock *tw);
 
+struct inet_bind_hashbucket;
 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			   struct inet_bind_hashbucket *head,
 			   struct inet_hashinfo *hashinfo);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 84cd4e3fd01b..25f03e62cfea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1208,6 +1208,7 @@ static int __init dccp_init(void)
 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+		dccp_hashinfo.bhash[i].count = 0;
 	}
 
 	rc = dccp_mib_init();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..8ba6b17d95d5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,18 @@ static u32 sk_ehashfn(const struct sock *sk)
 			    sk->sk_daddr, sk->sk_dport);
 }
 
+/* Count how many entries are in the bind hash table */
+unsigned int inet_bind_bucket_count(const struct proto *prot)
+{
+	const struct inet_hashinfo *hinfo = prot->h.hashinfo;
+	unsigned int i, ports = 0;
+
+	for (i = 0; i < hinfo->bhash_size; i++)
+		ports += hinfo->bhash[i].count;
+
+	return ports;
+}
+
 /*
  * Allocate and initialize a new local port bind bucket.
  * The bindhash mutex for snum's hash chain must be held here.
@@ -76,6 +88,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->fastreuseport = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
+		++head->count;
 	}
 	return tb;
 }
@@ -83,10 +96,13 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+			      struct inet_bind_hashbucket *head,
+			      struct inet_bind_bucket *tb)
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
+		--head->count;
 		kmem_cache_free(cachep, tb);
 	}
 }
@@ -115,7 +131,7 @@ static void __inet_put_port(struct sock *sk)
 	__sk_del_bind_node(sk);
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, head, tb);
 	spin_unlock(&head->lock);
 }
 
@@ -756,7 +772,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		inet_ehash_nolisten(sk, (struct sock *)tw);
 	}
 	if (tw)
-		inet_twsk_bind_unhash(tw, hinfo);
+		inet_twsk_bind_unhash(tw, head, hinfo);
 	spin_unlock(&head->lock);
 	if (tw)
 		inet_twsk_deschedule_put(tw);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 88c5069b5d20..dd888c52f958 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -26,7 +26,8 @@
  *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-			  struct inet_hashinfo *hashinfo)
+			   struct inet_bind_hashbucket *head,
+			   struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -35,7 +36,8 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep,
+				 head, tb);
 	__sock_put((struct sock *)tw);
 }
 
@@ -55,7 +57,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
 			hashinfo->bhash_size)];
 
 	spin_lock(&bhead->lock);
-	inet_twsk_bind_unhash(tw, hashinfo);
+	inet_twsk_bind_unhash(tw, bhead, hashinfo);
 	spin_unlock(&bhead->lock);
 
 	atomic_dec(&tw->tw_dr->tw_count);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..83bc9a0f2785 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -60,10 +60,11 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
 
 	socket_seq_show(seq);
-	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld ports %u\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
-		   proto_memory_allocated(&tcp_prot));
+		   proto_memory_allocated(&tcp_prot),
+		   inet_bind_bucket_count(&tcp_prot));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
 		   proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 868ed74a76a8..f62e2fb02fdf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3836,6 +3836,7 @@ void __init tcp_init(void)
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+		tcp_hashinfo.bhash[i].count = 0;
 	}
 
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next v3 2/2] socket: keep track of the number of sockets allocated
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
  To: davem, gerrit, kuznet, yoshfuji
  Cc: netdev, dccp, Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180507184333.32688-1-sthemmin@microsoft.com>

Add a per-cpu counter to keep track of the number of inodes allocated
to sockets to fix incorrect statistics from ss command.

The ss command tries to keep track of the number of sockets
allocated, but it was doing so via the slabinfo statistics, which are
wrong (due to merging) and not available when using slub.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/socket.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index f10f1d947c78..89ec7f41559d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -234,6 +234,18 @@ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
 }
 
 static struct kmem_cache *sock_inode_cachep __ro_after_init;
+static unsigned int __percpu *sock_pcpu_allocated;
+
+static unsigned int sock_allocated(void)
+{
+	unsigned int res = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		res += *per_cpu_ptr(sock_pcpu_allocated, cpu);
+
+	return res;
+}
 
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
@@ -248,6 +260,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 		kmem_cache_free(sock_inode_cachep, ei);
 		return NULL;
 	}
+	this_cpu_inc(*sock_pcpu_allocated);
 	init_waitqueue_head(&wq->wait);
 	wq->fasync_list = NULL;
 	wq->flags = 0;
@@ -270,6 +283,7 @@ static void sock_destroy_inode(struct inode *inode)
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
 	wq = rcu_dereference_protected(ei->socket.wq, 1);
 	kfree_rcu(wq, rcu);
+	this_cpu_dec(*sock_pcpu_allocated);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -290,6 +304,8 @@ static void init_inodecache(void)
 					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
 	BUG_ON(sock_inode_cachep == NULL);
+	sock_pcpu_allocated = alloc_percpu(unsigned int);
+	BUG_ON(sock_pcpu_allocated == NULL);
 }
 
 static const struct super_operations sockfs_ops = {
@@ -2738,8 +2754,9 @@ core_initcall(sock_init);	/* early initcall */
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-	seq_printf(seq, "sockets: used %d\n",
-		   sock_inuse_get(seq->private));
+	seq_printf(seq, "sockets: used %d allocated %u\n",
+		   sock_inuse_get(seq->private),
+		   sock_allocated());
 }
 #endif				/* CONFIG_PROC_FS */
 
-- 
2.17.0

^ permalink raw reply related

* Re: [net-next PATCH v3 5/6] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Willem de Bruijn @ 2018-05-07 18:43 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180846.3486.35452.stgit@localhost.localdomain>

On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch adds support for a software provided checksum and GSO_PARTIAL
> segmentation support. With this we can offload UDP segmentation on devices
> that only have partial support for tunnels.
>
> Since we are no longer needing the hardware checksum we can drop the checks
> in the segmentation code that were verifying if it was present.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Acked-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply

* Re: [PATCH 01/18] docs: can.rst: fix a footnote reference
From: Oliver Hartkopp @ 2018-05-07 18:41 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Linux Doc Mailing List, Robert Schwebel
  Cc: Mauro Carvalho Chehab, linux-kernel, Jonathan Corbet,
	Marc Kleine-Budde, David S. Miller, linux-can, netdev
In-Reply-To: <2a04ab24f302a0802572c4c80c4d416b953f213d.1525684985.git.mchehab+samsung@kernel.org>

+ Robert Schwebel (who thankfully did the txt -> rst conversion for can.txt)

On 05/07/2018 11:35 AM, Mauro Carvalho Chehab wrote:
> As stated at:
> 	http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#footnotes
> 
> A footnote should contain either a number, a reference or
> an auto number, e. g.:
> 	[1], [#f1] or [#].
> 
> While using [*] accidentally works for html, it fails for other
> document outputs. In particular, it causes an error with LaTeX
> output, causing all books after networking to not be built.
> 
> So, replace it by a valid syntax.
> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>

Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>

Thanks Mauro!

Best regards,
Oliver

> ---
>  Documentation/networking/can.rst | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
> index d23c51abf8c6..2fd0b51a8c52 100644
> --- a/Documentation/networking/can.rst
> +++ b/Documentation/networking/can.rst
> @@ -164,7 +164,7 @@ The Linux network devices (by default) just can handle the
>  transmission and reception of media dependent frames. Due to the
>  arbitration on the CAN bus the transmission of a low prio CAN-ID
>  may be delayed by the reception of a high prio CAN frame. To
> -reflect the correct [*]_ traffic on the node the loopback of the sent
> +reflect the correct [#f1]_ traffic on the node the loopback of the sent
>  data has to be performed right after a successful transmission. If
>  the CAN network interface is not capable of performing the loopback for
>  some reason the SocketCAN core can do this task as a fallback solution.
> @@ -175,7 +175,7 @@ networking behaviour for CAN applications. Due to some requests from
>  the RT-SocketCAN group the loopback optionally may be disabled for each
>  separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`.
>  
> -.. [*] you really like to have this when you're running analyser
> +.. [#f1] you really like to have this when you're running analyser
>         tools like 'candump' or 'cansniffer' on the (same) node.
>  
>  
> 

^ permalink raw reply

* Re: [net-next PATCH v3 3/6] udp: Do not pass checksum as a parameter to GSO segmentation
From: Willem de Bruijn @ 2018-05-07 18:49 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180834.3486.87816.stgit@localhost.localdomain>

On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch is meant to allow us to avoid having to recompute the checksum
> from scratch and have it passed as a parameter.
>
> Instead of taking that approach we can take advantage of the fact that the
> length that was used to compute the existing checksum is included in the
> UDP header.
>
> Finally to avoid the need to invert the result we can just call csum16_add
> and csum16_sub directly. By doing this we can avoid a number of
> instructions in the loop that is handling segmentation.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Acked-by: Willem de Bruijn <willemb@google.com>

Small aside: instead of open-coding the csum16 operations,
it might make sense to define a pseudo_csum_replace2 function
and convert csum_replace2 to call that and only do the inversion.

^ permalink raw reply

* Re: [PATCH v2 net-next 2/4] net: add skeleton of bpfilter kernel module
From: Luis R. Rodriguez @ 2018-05-07 18:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: davem, daniel, torvalds, gregkh, luto, netdev, linux-kernel,
	kernel-team, Juergen Gross, Eric Paris, Matthew Auld,
	Josh Triplett, Kirill A. Shutemov, Joonas Lahtinen, Chris Wilson,
	Stephen Smalley, Eric W. Biederman, Mimi Zohar, David Howells,
	Kees Cook, Andrew Morton, Dominik Brodowski
In-Reply-To: <20180503043604.1604587-3-ast@kernel.org>

On Wed, May 02, 2018 at 09:36:02PM -0700, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
> 
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
>   is converted into bpfilter_umh.o object file
>   with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
>   Example:
>   $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
>   0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
>   0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
>   0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
> 
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
> 
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipes provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.

It does this very fast, right away. On a really slow system how are you sure
that this won't race and the execution of the check happens early on prior to
letting the actual setup trigger? After all, we're calling the userspace
process in async mode. We could preempt it now.

> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
> 
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpfilter.h      | 15 +++++++
>  include/uapi/linux/bpfilter.h | 21 ++++++++++
>  net/Kconfig                   |  2 +
>  net/Makefile                  |  1 +
>  net/bpfilter/Kconfig          | 17 ++++++++
>  net/bpfilter/Makefile         | 24 +++++++++++
>  net/bpfilter/bpfilter_kern.c  | 93 +++++++++++++++++++++++++++++++++++++++++++
>  net/bpfilter/main.c           | 63 +++++++++++++++++++++++++++++
>  net/bpfilter/msgfmt.h         | 17 ++++++++
>  net/ipv4/Makefile             |  2 +
>  net/ipv4/bpfilter/Makefile    |  2 +
>  net/ipv4/bpfilter/sockopt.c   | 42 +++++++++++++++++++
>  net/ipv4/ip_sockglue.c        | 17 ++++++++
>  13 files changed, 316 insertions(+)
>  create mode 100644 include/linux/bpfilter.h
>  create mode 100644 include/uapi/linux/bpfilter.h
>  create mode 100644 net/bpfilter/Kconfig
>  create mode 100644 net/bpfilter/Makefile
>  create mode 100644 net/bpfilter/bpfilter_kern.c
>  create mode 100644 net/bpfilter/main.c
>  create mode 100644 net/bpfilter/msgfmt.h
>  create mode 100644 net/ipv4/bpfilter/Makefile
>  create mode 100644 net/ipv4/bpfilter/sockopt.c
> 
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> +			    unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> +			    int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				       char __user *optval,
> +				       unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> +	BPFILTER_IPT_SO_SET_REPLACE = 64,
> +	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> +	BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> +	BPFILTER_IPT_SO_GET_INFO = 64,
> +	BPFILTER_IPT_SO_GET_ENTRIES = 65,
> +	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> +	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> +	BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>  
>  endif
>  
> +source "net/bpfilter/Kconfig"
> +
>  source "net/dccp/Kconfig"
>  source "net/sctp/Kconfig"
>  source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
>  obj-$(CONFIG_XFRM)		+= xfrm/
>  obj-$(CONFIG_UNIX)		+= unix/
>  obj-$(CONFIG_NET)		+= ipv6/
> +obj-$(CONFIG_BPFILTER)		+= bpfilter/
>  obj-$(CONFIG_PACKET)		+= packet/
>  obj-$(CONFIG_NET_KEY)		+= key/
>  obj-$(CONFIG_BRIDGE)		+= bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> +	bool "BPF based packet filtering framework (BPFILTER)"
> +	default n
> +	depends on NET && BPF
> +	help
> +	  This builds experimental bpfilter framework that is aiming to
> +	  provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> +	tristate "bpftiler kernel module with user mode helper"
> +	default m
> +	depends on m
> +	help
> +	  This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> +      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> +      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> +      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> +      --rename-section .data=.init.rodata $< $@

Cool, but so our expectation is that the compiler sets this symbol, how
are we sure it will always be set?

> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> +	$(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> +	struct task_struct *tsk;
> +
> +	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> +	if (tsk)
> +		force_sig(SIGKILL, tsk);
> +	fput(info->pipe_to_umh);
> +	fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> +	if (bpfilter_process_sockopt) {
> +		bpfilter_process_sockopt = NULL;
> +		shutdown_umh(&info);
> +	}
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> +				      char __user *optval,
> +				      unsigned int optlen, bool is_set)
> +{
> +	struct mbox_request req;
> +	struct mbox_reply reply;
> +	loff_t pos;
> +	ssize_t n;
> +
> +	req.is_set = is_set;
> +	req.pid = current->pid;
> +	req.cmd = optname;
> +	req.addr = (long)optval;
> +	req.len = optlen;
> +	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> +	if (n != sizeof(req)) {
> +		pr_err("write fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	pos = 0;
> +	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> +	if (n != sizeof(reply)) {
> +		pr_err("read fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> +	int err;
> +
> +	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> +	if (err)
> +		return err;
> +	pr_info("Loaded umh pid %d\n", info.pid);
> +	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> +	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {

See, here, what if the userspace process gets preempted and we run this
check afterwards? Is that possible?

  Luis

> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> +	stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> +	switch (cmd->cmd) {
> +	case 0:
> +		return 0;
> +	default:
> +		break;
> +	}
> +	return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> +	return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> +	while (1) {
> +		struct mbox_request req;
> +		struct mbox_reply reply;
> +		int n;
> +
> +		n = read(0, &req, sizeof(req));
> +		if (n != sizeof(req)) {
> +			dprintf(debug_fd, "invalid request %d\n", n);
> +			return;
> +		}
> +
> +		reply.status = req.is_set ?
> +			handle_set_cmd(&req) :
> +			handle_get_cmd(&req);
> +
> +		n = write(1, &reply, sizeof(reply));
> +		if (n != sizeof(reply)) {
> +			dprintf(debug_fd, "reply failed %d\n", n);
> +			return;
> +		}
> +	}
> +}
> +
> +int main(void)
> +{
> +	debug_fd = open("/dev/console", 00000002 | 00000100);
> +	dprintf(debug_fd, "Started bpfilter\n");
> +	loop();
> +	close(debug_fd);
> +	return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
> +
> +struct mbox_request {
> +	__u64 addr;
> +	__u32 len;
> +	__u32 is_set;
> +	__u32 cmd;
> +	__u32 pid;
> +};
> +
> +struct mbox_reply {
> +	__u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
>  	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
>  	     metrics.o
>  
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
>  obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
>  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
>  obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				char __user *optval,
> +				unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> +			  unsigned int optlen, bool is_set)
> +{
> +	if (!bpfilter_process_sockopt) {
> +		int err = request_module("bpfilter");
> +
> +		if (err)
> +			return err;
> +		if (!bpfilter_process_sockopt)
> +			return -ECHILD;
> +	}
> +	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    unsigned int optlen)
> +{
> +	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    int __user *optlen)
> +{
> +	int len;
> +
> +	if (get_user(len, optlen))
> +		return -EFAULT;
> +
> +	return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
>  #include <linux/errqueue.h>
>  #include <linux/uaccess.h>
>  
> +#include <linux/bpfilter.h>
> +
>  /*
>   *	SOL_IP control messages.
>   */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
>  		return -ENOPROTOOPT;
>  
>  	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> +	    optname < BPFILTER_IPT_SET_MAX)
> +		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
>  	int err;
>  
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
>  		MSG_CMSG_COMPAT);
>  
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> -- 
> 2.9.5

-- 
Do not panic

^ permalink raw reply

* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Willem de Bruijn @ 2018-05-07 18:57 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180840.3486.67728.stgit@localhost.localdomain>

On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch allows us to take care of unrolling the first segment and the
> last segment of the loop for processing the segmented skb. Part of the
> motivation for this is that it makes it easier to process the fact that the
> first frame and all of the frames in between should be mostly identical
> in terms of header data, and the last frame has differences in the length
> and partial checksum.
>
> In addition I am dropping the header length calculation since we don't
> really need it for anything but the last frame and it can be easily
> obtained by just pulling the data_len and offset of tail from the transport
> header.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

I'm not a fan of the more complicated control flow, as I pointed out
before. It only seems to save one assignment to uh from segs.

Both follow-up patches are now more complex, because they need
to add the same code in two locations.

^ permalink raw reply

* Re: [PATCH v2] net: dsa: drop some VLAs in switch.c
From: Salvatore Mesoraca @ 2018-05-07 19:02 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Andrew Lunn, linux-kernel, Kernel Hardening, netdev,
	David S. Miller, Kees Cook, Vivien Didelot, David Laight
In-Reply-To: <d7fa9cf7-7c7e-b01c-8925-ce6dafc8721c@gmail.com>

2018-05-07 20:14 GMT+02:00 Florian Fainelli <f.fainelli@gmail.com>:
> On 05/07/2018 08:23 AM, Salvatore Mesoraca wrote:
>> We avoid 2 VLAs by using a pre-allocated field in dsa_switch.
>> We also try to avoid dynamic allocation whenever possible.
>>
>> Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
>> Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch
>>
>> Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
>> ---
>>  include/net/dsa.h |  3 +++
>>  net/dsa/dsa2.c    | 14 ++++++++++++++
>>  net/dsa/switch.c  | 22 ++++++++++------------
>>  3 files changed, 27 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/net/dsa.h b/include/net/dsa.h
>> index 60fb4ec..576791d 100644
>> --- a/include/net/dsa.h
>> +++ b/include/net/dsa.h
>> @@ -256,6 +256,9 @@ struct dsa_switch {
>>       /* Number of switch port queues */
>>       unsigned int            num_tx_queues;
>>
>> +     unsigned long           *bitmap;
>> +     unsigned long           _bitmap;
>> +
>>       /* Dynamically allocated ports, keep last */
>>       size_t num_ports;
>>       struct dsa_port ports[];
>> diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
>> index adf50fb..cebf35f0 100644
>> --- a/net/dsa/dsa2.c
>> +++ b/net/dsa/dsa2.c
>> @@ -748,6 +748,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
>>       if (!ds)
>>               return NULL;
>>
>> +     /* We avoid allocating memory outside dsa_switch
>> +      * if it is not needed.
>> +      */
>> +     if (n <= sizeof(ds->_bitmap) * 8) {
>> +             ds->bitmap = &ds->_bitmap;
>
> Should not this be / BITS_PER_BYTE? If the sizeof(unsigned long) is <=
> 8, then you don't need to allocate it, otherwise, you have to.

No.
We need 1 bit per port; of course sizeof() returns the size in bytes,
hence the multiplication to get the number of bits.
I might multiply per BITS_PER_BYTE instead of 8, but I doubt that
Linux supports implementations where a byte is not an octet.

> I would actually just always dynamically allocate the bitmap; optimizing
> for the case where we have 8 or fewer ports is not worth it IMHO.

This optimization will save us an allocation when number of ports is
less than 32 or 64 (depending on arch).
IMHO it's useful, if you consider that, right now, DSA works only with
12-ports switches.

Thank you for your time,

Salvatore

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox