Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Add tests for bpf_map_update_batch().
  $ ./test_maps
  ...
  test_map_update_batch:PASS
  ...
---
 .../bpf/map_tests/map_update_batch.c          | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_update_batch.c b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
new file mode 100644
index 000000000000..67c1e11fc911
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+void test_map_update_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, key, value;
+	const int max_entries = 10;
+	__u32 count, max_count;
+	int err, i;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	/* do not fill in the whole hash table, so we could test
+	 * update with new elements.
+	 */
+	max_count = max_entries - 2;
+
+	for (i = 0; i < max_count; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	/* test 1: count == 0, expect success. */
+	count = 0;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "count = 0", "error:%s\n", strerror(errno));
+
+	/* test 2: update initial map with BPF_NOEXIST, expect success. */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(err, "elem_flags = BPF_NOEXIST",
+	      "error:%s\n", strerror(errno));
+
+	/* use bpf_map_get_next_key to ensure all keys/values are indeed
+	 * covered.
+	 */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+	CHECK(key + 1 != value, "key/value checking",
+	      "error: key %d value %d\n", key, value);
+	i = 1;
+	while (!bpf_map_get_next_key(map_fd, &key, &key)) {
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		CHECK(key + 1 != value,
+		      "key/value checking", "error: key %d value %d\n",
+		      key, value);
+		i++;
+	}
+	CHECK(i != max_count, "checking number of entries",
+	      "err: i %u max_count %u\n", i, max_count);
+
+	/* test 3: elem_flags = BPF_NOEXIST, already exists, expect failure */
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	/* failure to due to flag BPF_NOEXIST, count is set to 0 */
+	CHECK(!err || count, "elem_flags = BPF_NOEXIST again",
+	      "unexpected success\n");
+
+	/* test 4: elem_flags = 0, expect success */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "elem_flags = 0", "error %s\n", strerror(errno));
+
+	/* test 5: keys = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, NULL, values,
+				   &count, 0, 0);
+	CHECK(!err, "keys = NULL", "unexpected success\n");
+
+	/* test 6: values = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, NULL, &count, 0, 0);
+	CHECK(!err, "values = NULL", "unexpected success\n");
+
+	/* test 7: modify the first key to be max_count + 10,
+	 * elem_flags = BPF_NOEXIST,
+	 * expect failure, the return count = 1.
+	 */
+	count = max_count;
+	keys[0] = max_count + 10;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(!err, "keys[0] = max_count + 10", "unexpected success\n");
+	CHECK(count != 1, "keys[0] = max_count + 10",
+	      "error: %s, incorrect count %u\n", strerror(errno), count);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

sync uapi header include/uapi/linux/bpf.h to
tools/include/uapi/linux/bpf.h.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 00/13] bpf: adding map batch processing support
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
map entries per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
    
Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations also useful as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
    BPF_MAP_LOOKUP_BATCH
    BPF_MAP_LOOKUP_AND_DELETE_BATCH
    BPF_MAP_UPDATE_BATCH
    BPF_MAP_DELETE_BATCH

In userspace, application can iterate through the whole map one batch
as a time, e.g., bpf_map_lookup_batch() in the below:
    p_key = NULL;
    p_next_key = &key;
    while (true) {
       err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
                                  &batch_size, elem_flags, flags);
       if (err) ...
       if (p_next_key) break; // done
       if (!p_key) p_key = p_next_key;
    }
Please look at individual patches for details of new syscall subcommands
and examples of user codes.

The testing is also done in a qemu VM environment:
      measure_lookup: max_entries 1000000, batch 10, time 342ms
      measure_lookup: max_entries 1000000, batch 1000, time 295ms
      measure_lookup: max_entries 1000000, batch 1000000, time 270ms
      measure_lookup: max_entries 1000000, no batching, time 1346ms
      measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
      measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
      measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
      measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
      measure_delete: max_entries 1000000, batch, time 220ms
      measure_delete: max_entries 1000000, not batch, time 1289ms
For a 1M entry hash table, batch size of 10 can reduce cpu time
by 70%. Please see patch "tools/bpf: measure map batching perf"
for details of test codes.

Brian Vazquez (1):
  bpf: add bpf_map_value_size and bp_map_copy_value helper functions

Yonghong Song (12):
  bpf: refactor map_update_elem()
  bpf: refactor map_delete_elem()
  bpf: refactor map_get_next_key()
  bpf: adding map batch processing support
  tools/bpf: sync uapi header bpf.h
  tools/bpf: implement libbpf API functions for map batch operations
  tools/bpf: add test for bpf_map_update_batch()
  tools/bpf: add test for bpf_map_lookup_batch()
  tools/bpf: add test for bpf_map_lookup_and_delete_batch()
  tools/bpf: add test for bpf_map_delete_batch()
  tools/bpf: add a multithreaded test for map batch operations
  tools/bpf: measure map batching perf

 include/uapi/linux/bpf.h                      |  27 +
 kernel/bpf/syscall.c                          | 752 ++++++++++++++----
 tools/include/uapi/linux/bpf.h                |  27 +
 tools/lib/bpf/bpf.c                           |  67 ++
 tools/lib/bpf/bpf.h                           |  17 +
 tools/lib/bpf/libbpf.map                      |   4 +
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 +++
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++
 .../bpf/map_tests/map_update_batch.c          | 115 +++
 12 files changed, 1707 insertions(+), 139 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

-- 
2.17.1


^ permalink raw reply

* Re: auto-split of commit. Was: [PATCH bpf-next 04/10] tools/bpf: add libbpf_prog_type_(from|to)_str helpers
From: Greg Kroah-Hartman @ 2019-08-29  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jakub Kicinski, Julia Kartseva, ast, Thomas Gleixner, rdna, bpf,
	daniel, netdev, kernel-team
In-Reply-To: <20190828234626.ltfy3qr2nne4uumy@ast-mbp.dhcp.thefacebook.com>

On Wed, Aug 28, 2019 at 04:46:28PM -0700, Alexei Starovoitov wrote:
> On Wed, Aug 28, 2019 at 04:34:22PM -0700, Jakub Kicinski wrote:
> > 
> > Greg, Thomas, libbpf is extracted from the kernel sources and
> > maintained in a clone repo on GitHub for ease of packaging.
> > 
> > IIUC Alexei's concern is that since we are moving the commits from
> > the kernel repo to the GitHub one we have to preserve the commits
> > exactly as they are, otherwise SOB lines lose their power.
> > 
> > Can you provide some guidance on whether that's a valid concern, 
> > or whether it's perfectly fine to apply a partial patch?
> 
> Right. That's exactly the concern.
> 
> Greg, Thomas,
> could you please put your legal hat on and clarify the following.
> Say some developer does a patch that modifies
> include/uapi/linux/bpf.h
> ..some other kernel code...and
> tools/include/uapi/linux/bpf.h
> 
> That tools/include/uapi/linux/bpf.h is used by perf and by libbpf.
> We have automatic mirror of tools/libbpf into github/libbpf/
> so that external projects and can do git submodule of it,
> can build packages out of it, etc.
> 
> The question is whether it's ok to split tools/* part out of
> original commit, keep Author and SOB, create new commit out of it,
> and automatically push that auto-generated commit into github mirror.

Note, I am not a laywer, and am not _your_ lawyer either, only _your_
lawyer can answer questions as to what is best for you.

That being said, from a "are you keeping the correct authorship info",
yes, it sounds like you are doing the correct thing here.

Look at what I do for stable kernels, I take the original commit and add
it to "another tree" keeping the original author and s-o-b chain intact,
and adding a "this is the original git commit id" type message to the
changelog text so that people can link it back to the original.

If you are doing that here as well, I don't see how anyone can object.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH v2 3/3] arm: Add support for function error injection
From: Russell King - ARM Linux admin @ 2019-08-29  6:57 UTC (permalink / raw)
  To: Leo Yan
  Cc: Oleg Nesterov, Catalin Marinas, Will Deacon,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	x86, Arnd Bergmann, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Naveen N. Rao,
	linux-arm-kernel, linux-kernel, linuxppc-dev, linux-arch, netdev,
	bpf, clang-built-linux, Masami Hiramatsu
In-Reply-To: <20190819091808.GB5599@leoy-ThinkPad-X240s>

I'm sorry, I can't apply this, it produces loads of:

include/linux/error-injection.h:7:10: fatal error: asm/error-injection.h: No such file or directory

Since your patch 1 has been merged by the ARM64 people, I can't take
it until next cycle.

On Mon, Aug 19, 2019 at 05:18:08PM +0800, Leo Yan wrote:
> Hi Russell,
> 
> On Tue, Aug 06, 2019 at 06:00:15PM +0800, Leo Yan wrote:
> > This patch implements arm specific functions regs_set_return_value() and
> > override_function_with_return() to support function error injection.
> > 
> > In the exception flow, it updates pt_regs::ARM_pc with pt_regs::ARM_lr
> > so can override the probed function return.
> 
> Gentle ping ...  Could you review this patch?
> 
> Thanks,
> Leo.
> 
> > Signed-off-by: Leo Yan <leo.yan@linaro.org>
> > ---
> >  arch/arm/Kconfig              |  1 +
> >  arch/arm/include/asm/ptrace.h |  5 +++++
> >  arch/arm/lib/Makefile         |  2 ++
> >  arch/arm/lib/error-inject.c   | 19 +++++++++++++++++++
> >  4 files changed, 27 insertions(+)
> >  create mode 100644 arch/arm/lib/error-inject.c
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index 33b00579beff..2d3d44a037f6 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -77,6 +77,7 @@ config ARM
> >  	select HAVE_EXIT_THREAD
> >  	select HAVE_FAST_GUP if ARM_LPAE
> >  	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
> > +	select HAVE_FUNCTION_ERROR_INJECTION if !THUMB2_KERNEL
> >  	select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG
> >  	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
> >  	select HAVE_GCC_PLUGINS
> > diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
> > index 91d6b7856be4..3b41f37b361a 100644
> > --- a/arch/arm/include/asm/ptrace.h
> > +++ b/arch/arm/include/asm/ptrace.h
> > @@ -89,6 +89,11 @@ static inline long regs_return_value(struct pt_regs *regs)
> >  	return regs->ARM_r0;
> >  }
> >  
> > +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
> > +{
> > +	regs->ARM_r0 = rc;
> > +}
> > +
> >  #define instruction_pointer(regs)	(regs)->ARM_pc
> >  
> >  #ifdef CONFIG_THUMB2_KERNEL
> > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> > index b25c54585048..8f56484a7156 100644
> > --- a/arch/arm/lib/Makefile
> > +++ b/arch/arm/lib/Makefile
> > @@ -42,3 +42,5 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
> >    CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
> >    obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> >  endif
> > +
> > +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> > diff --git a/arch/arm/lib/error-inject.c b/arch/arm/lib/error-inject.c
> > new file mode 100644
> > index 000000000000..2d696dc94893
> > --- /dev/null
> > +++ b/arch/arm/lib/error-inject.c
> > @@ -0,0 +1,19 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/error-injection.h>
> > +#include <linux/kprobes.h>
> > +
> > +void override_function_with_return(struct pt_regs *regs)
> > +{
> > +	/*
> > +	 * 'regs' represents the state on entry of a predefined function in
> > +	 * the kernel/module and which is captured on a kprobe.
> > +	 *
> > +	 * 'regs->ARM_lr' contains the the link register for the probed
> > +	 * function, when kprobe returns back from exception it will override
> > +	 * the end of probed function and directly return to the predefined
> > +	 * function's caller.
> > +	 */
> > +	instruction_pointer_set(regs, regs->ARM_lr);
> > +}
> > +NOKPROBE_SYMBOL(override_function_with_return);
> > -- 
> > 2.17.1
> > 
> 

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

^ permalink raw reply

* Re: [PATCH net-next v4 3/3] dt-bindings: net: ethernet: Update mt7622 docs and dts to reflect the new phylink API
From: Matthias Brugger @ 2019-08-29  7:03 UTC (permalink / raw)
  To: David Miller
  Cc: opensource, john, sean.wang, nelson.chang, netdev,
	linux-arm-kernel, linux-mediatek, linux-mips, linux, frank-w, sr
In-Reply-To: <20190828.125658.1743313522645522716.davem@davemloft.net>



On 28/08/2019 21:56, David Miller wrote:
> From: Matthias Brugger <matthias.bgg@gmail.com>
> Date: Wed, 28 Aug 2019 11:29:45 +0200
> 
>> Thanks for taking this patch. For the next time, please make sure that dts[i]
>> patches are independent from the binding description, as dts[i] should go
>> through my tree. No problem for this round, just saying for the future.
> 
> That's not always possible nor reasonable, to be quite honest.
> 

Right now no case comes to my mind. What would be a case where this is not
reasonable or possible?

Regards,
Matthias

^ permalink raw reply

* Re: [PATCH ipsec-next 7/7] xfrm: add espintcp (RFC 8229)
From: Steffen Klassert @ 2019-08-29  7:04 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: netdev, Herbert Xu
In-Reply-To: <029b59b8f74dbdbdf202fcf41a9a90b41b4821a2.1566395202.git.sd@queasysnail.net>

On Wed, Aug 21, 2019 at 11:46:25PM +0200, Sabrina Dubroca wrote:
> TCP encapsulation of IKE and IPsec messages (RFC 8229) is implemented
> as a TCP ULP, overriding in particular the sendmsg and recvmsg
> operations. A Stream Parser is used to extract messages out of the TCP
> stream using the first 2 bytes as length marker. Received IKE messages
> are put on "ike_queue", waiting to be dequeued by the custom recvmsg
> implementation. Received ESP messages are sent to XFRM, like with UDP
> encapsulation.
> 
> Some of this code is taken from the original submission by Herbert
> Xu. Currently, only IPv4 is supported, like for UDP encapsulation.
> 
> Co-developed-by: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> ---
>  include/net/espintcp.h   |  38 +++
>  include/net/xfrm.h       |   1 +
>  include/uapi/linux/udp.h |   1 +
>  net/ipv4/esp4.c          | 189 ++++++++++++++-
>  net/xfrm/Kconfig         |   9 +
>  net/xfrm/Makefile        |   1 +
>  net/xfrm/espintcp.c      | 505 +++++++++++++++++++++++++++++++++++++++
>  net/xfrm/xfrm_policy.c   |   7 +
>  net/xfrm/xfrm_state.c    |   3 +
>  9 files changed, 751 insertions(+), 3 deletions(-)
>  create mode 100644 include/net/espintcp.h
>  create mode 100644 net/xfrm/espintcp.c
> 

...

> +static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
> +{
> +	struct xfrm_encap_tmpl *encap = x->encap;
> +	struct esp_tcp_sk *esk;
> +	__be16 sport, dport;
> +	struct sock *nsk;
> +	struct sock *sk;
> +
> +	sk = rcu_dereference(x->encap_sk);
> +	if (sk && sk->sk_state == TCP_ESTABLISHED)
> +		return sk;
> +
> +	spin_lock_bh(&x->lock);
> +	sport = encap->encap_sport;
> +	dport = encap->encap_dport;
> +	nsk = rcu_dereference_protected(x->encap_sk,
> +					lockdep_is_held(&x->lock));
> +	if (sk && sk == nsk) {
> +		esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
> +		if (!esk) {
> +			spin_unlock_bh(&x->lock);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +		RCU_INIT_POINTER(x->encap_sk, NULL);
> +		esk->sk = sk;
> +		call_rcu(&esk->rcu, esp_free_tcp_sk);

I don't understand this, can you please explain what you are doing here?


> +	}
> +	spin_unlock_bh(&x->lock);
> +
> +	sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
> +				     dport, x->props.saddr.a4, sport, 0);
> +	if (!sk)
> +		return ERR_PTR(-ENOENT);
> +
> +	if (!tcp_is_ulp_esp(sk)) {
> +		sock_put(sk);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	spin_lock_bh(&x->lock);
> +	nsk = rcu_dereference_protected(x->encap_sk,
> +					lockdep_is_held(&x->lock));
> +	if (encap->encap_sport != sport ||
> +	    encap->encap_dport != dport) {
> +		sock_put(sk);
> +		sk = nsk ?: ERR_PTR(-EREMCHG);
> +	} else if (sk == nsk) {
> +		sock_put(sk);
> +	} else {
> +		rcu_assign_pointer(x->encap_sk, sk);
> +	}
> +	spin_unlock_bh(&x->lock);
> +
> +	return sk;
> +}
> +
> +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
> +{
> +	struct sock *sk;
> +	int err;
> +
> +	rcu_read_lock();
> +
> +	sk = esp_find_tcp_sk(x);
> +	err = PTR_ERR(sk);
> +	if (IS_ERR(sk))

Maybe better 'if (err)'?

> +		goto out;
> +
> +	bh_lock_sock(sk);
> +	if (sock_owned_by_user(sk)) {
> +		err = espintcp_queue_out(sk, skb);
> +		if (err < 0)
> +			goto unlock_sock;

This goto is not needed.

> +	} else {
> +		err = espintcp_push_skb(sk, skb);
> +	}
> +
> +unlock_sock:
> +	bh_unlock_sock(sk);
> +out:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
> +				   struct sk_buff *skb)
> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	struct xfrm_state *x = dst->xfrm;
> +
> +	return esp_output_tcp_finish(x, skb);
> +}
> +
> +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
> +{
> +	int err;
> +
> +	local_bh_disable();
> +	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
> +	local_bh_enable();
> +
> +	/* EINPROGRESS just happens to do the right thing.  It
> +	 * actually means that the skb has been consumed and
> +	 * isn't coming back.
> +	 */
> +	return err ?: -EINPROGRESS;
> +}
> +#endif
> +
>  static void esp_output_done(struct crypto_async_request *base, int err)
>  {
>  	struct sk_buff *skb = base->data;
> @@ -147,7 +272,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)
>  		secpath_reset(skb);
>  		xfrm_dev_resume(skb);
>  	} else {
> -		xfrm_output_resume(skb, err);
> +#ifdef CONFIG_XFRM_ESPINTCP

Do we really need all these ifdef? I guess most of them could
be avoided with some code refactorization.

> +		if (!err &&
> +		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
> +			esp_output_tail_tcp(x, skb);
> +		else
> +#endif
> +			xfrm_output_resume(skb, err);
>  	}
>  }

...

> @@ -296,7 +460,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
>  	struct sk_buff *trailer;
>  	int tailen = esp->tailen;
>  
> -	/* this is non-NULL only with UDP Encapsulation */
> +	/* this is non-NULL only with TCP/UDP Encapsulation */
>  	if (x->encap) {
>  		int err = esp_output_encap(x, skb, esp);
>  
> @@ -491,6 +655,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
>  	if (sg != dsg)
>  		esp_ssg_unref(x, tmp);
>  
> +#ifdef CONFIG_XFRM_ESPINTCP
> +	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
> +		err = esp_output_tail_tcp(x, skb);
> +#endif
> +
>  error_free:
>  	kfree(tmp);
>  error:
> @@ -617,10 +786,16 @@ int esp_input_done2(struct sk_buff *skb, int err)
>  
>  	if (x->encap) {
>  		struct xfrm_encap_tmpl *encap = x->encap;
> +		struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);

This gives a unused variable warning if CONFIG_XFRM_ESPINTCP
is not set.

>  		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
>  		__be16 source;
>  
>  		switch (x->encap->encap_type) {
> +#ifdef CONFIG_XFRM_ESPINTCP
> +		case TCP_ENCAP_ESPINTCP:
> +			source = th->source;
> +			break;
> +#endif
>  		case UDP_ENCAP_ESPINUDP:
>  		case UDP_ENCAP_ESPINUDP_NON_IKE:
>  			source = uh->source;
> @@ -1017,6 +1192,14 @@ static int esp_init_state(struct xfrm_state *x)
>  		case UDP_ENCAP_ESPINUDP_NON_IKE:
>  			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
>  			break;
> +#ifdef CONFIG_XFRM_ESPINTCP
> +		case TCP_ENCAP_ESPINTCP:
> +			/* only the length field, TCP encap is done by
> +			 * the socket
> +			 */
> +			x->props.header_len += 2;
> +			break;
> +#endif
>  		}
>  	}
>  
> diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
> index c967fc3c38c8..ccc012b3ea10 100644
> --- a/net/xfrm/Kconfig
> +++ b/net/xfrm/Kconfig
> @@ -71,6 +71,15 @@ config XFRM_IPCOMP
>  	select CRYPTO
>  	select CRYPTO_DEFLATE
>  
> +config XFRM_ESPINTCP
> +	bool "ESP in TCP encapsulation (RFC 8229)"
> +	depends on XFRM && INET_ESP
> +	select STREAM_PARSER

I'm getting these compile errors:

espintcp.o: In function `espintcp_close':
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:469: undefined reference to `sk_msg_free'
net/xfrm/espintcp.o: In function `espintcp_sendmsg':
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:302: undefined reference to `sk_msg_alloc'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:316: undefined reference to `sk_msg_memcopy_from_iter'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:341: undefined reference to `sk_msg_free'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:321: undefined reference to `sk_msg_memcopy_from_iter'
/home/klassert/git/linux-stk/Makefile:1067: recipe for target 'vmlinux' failed
make[1]: *** [vmlinux] Error 1

I guess you need to select NET_SOCK_MSG.

Btw. I've updated the ipsec-next tree, so patch 1 is not needed anymore.

Everything else looks good!


^ permalink raw reply

* Re: [net-next] net: sched: pie: enable timestamp based delay calculation
From: Gautam Ramakrishnan @ 2019-08-29  7:21 UTC (permalink / raw)
  To: Dave Taht
  Cc: Eric Dumazet, Linux Kernel Network Developers, Jamal Hadi Salim,
	David S. Miller, Cong Wang, Leslie Monis, Mohit P . Tahiliani
In-Reply-To: <CAA93jw6PJXsG++0c+mE8REUb0cD4PU2Xck-J9fD1miuKfxS6BQ@mail.gmail.com>

> > No module parameter is accepted these days.
> >
> > Please add a new attribute instead,
> > so that pie can be used in both mode on the same host.

We have prepared a new patch which sets the queue delay estimator as
an attribute instead of using module parameters

>
> I note that I think (but lack independent data) this improvement to
> the rate estimator in pie should improve its usability and accuracy
> in low rate or hw mq situations, and with some benchmarking to
> show the cpu impact (at high and low rates) of this improvement as
> well as the network
> impact, the old way should probably be dropped and new way adopted without
> needing a new variable to control it.
>
> A commit showing the before/after cpu and network impact with a whole
> bunch of flent benchmarks would be great.

We have tested for network impact using flent. However, we are unaware of
any standard methods to test for cpu impact. It would be helpful if
someone could suggest a method to test for cpu impact.
>
> (I'd also love to know if pie can be run with a lower target - like 5ms -
>  with this mod in place)

We will test it with target latencies of 5 and 10 ms as well.
>
> >
> > For a typical example of attribute addition, please take
> > a look at commit 48872c11b77271ef9b070bdc50afe6655c4eb9aa
> > ("net_sched: sch_fq: add dctcp-like marking")
>
> utilizing ce_threshold in this way is actually independent of whether or
> not you are using dctcp-style or rfc3168 style ecn marking.
>
> It's "I'm self congesting... Dooo something! Anything, to reduce the load!"
>
>
>
>
> --
>
> Dave Täht
> CTO, TekLibre, LLC
> http://www.teklibre.com
> Tel: 1-831-205-9740



-- 
-------------
Gautam |

^ permalink raw reply

* Re: [PATCH v2 3/3] arm: Add support for function error injection
From: Leo Yan @ 2019-08-29  7:23 UTC (permalink / raw)
  To: Russell King - ARM Linux admin
  Cc: Oleg Nesterov, Catalin Marinas, Will Deacon,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	x86, Arnd Bergmann, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Naveen N. Rao,
	linux-arm-kernel, linux-kernel, linuxppc-dev, linux-arch, netdev,
	bpf, clang-built-linux, Masami Hiramatsu
In-Reply-To: <20190829065729.GU13294@shell.armlinux.org.uk>

Hi Russell,

On Thu, Aug 29, 2019 at 07:57:29AM +0100, Russell King - ARM Linux admin wrote:
> I'm sorry, I can't apply this, it produces loads of:
> 
> include/linux/error-injection.h:7:10: fatal error: asm/error-injection.h: No such file or directory
> 
> Since your patch 1 has been merged by the ARM64 people, I can't take
> it until next cycle.

For this case, do you want me to resend this patch in next merge
window?  Or you have picked up this patch but will send PR in next
cycle?

Thanks,
Leo Yan

> On Mon, Aug 19, 2019 at 05:18:08PM +0800, Leo Yan wrote:
> > Hi Russell,
> > 
> > On Tue, Aug 06, 2019 at 06:00:15PM +0800, Leo Yan wrote:
> > > This patch implements arm specific functions regs_set_return_value() and
> > > override_function_with_return() to support function error injection.
> > > 
> > > In the exception flow, it updates pt_regs::ARM_pc with pt_regs::ARM_lr
> > > so can override the probed function return.
> > 
> > Gentle ping ...  Could you review this patch?
> > 
> > Thanks,
> > Leo.
> > 
> > > Signed-off-by: Leo Yan <leo.yan@linaro.org>
> > > ---
> > >  arch/arm/Kconfig              |  1 +
> > >  arch/arm/include/asm/ptrace.h |  5 +++++
> > >  arch/arm/lib/Makefile         |  2 ++
> > >  arch/arm/lib/error-inject.c   | 19 +++++++++++++++++++
> > >  4 files changed, 27 insertions(+)
> > >  create mode 100644 arch/arm/lib/error-inject.c
> > > 
> > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > > index 33b00579beff..2d3d44a037f6 100644
> > > --- a/arch/arm/Kconfig
> > > +++ b/arch/arm/Kconfig
> > > @@ -77,6 +77,7 @@ config ARM
> > >  	select HAVE_EXIT_THREAD
> > >  	select HAVE_FAST_GUP if ARM_LPAE
> > >  	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
> > > +	select HAVE_FUNCTION_ERROR_INJECTION if !THUMB2_KERNEL
> > >  	select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG
> > >  	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
> > >  	select HAVE_GCC_PLUGINS
> > > diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
> > > index 91d6b7856be4..3b41f37b361a 100644
> > > --- a/arch/arm/include/asm/ptrace.h
> > > +++ b/arch/arm/include/asm/ptrace.h
> > > @@ -89,6 +89,11 @@ static inline long regs_return_value(struct pt_regs *regs)
> > >  	return regs->ARM_r0;
> > >  }
> > >  
> > > +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
> > > +{
> > > +	regs->ARM_r0 = rc;
> > > +}
> > > +
> > >  #define instruction_pointer(regs)	(regs)->ARM_pc
> > >  
> > >  #ifdef CONFIG_THUMB2_KERNEL
> > > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> > > index b25c54585048..8f56484a7156 100644
> > > --- a/arch/arm/lib/Makefile
> > > +++ b/arch/arm/lib/Makefile
> > > @@ -42,3 +42,5 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
> > >    CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
> > >    obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> > >  endif
> > > +
> > > +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> > > diff --git a/arch/arm/lib/error-inject.c b/arch/arm/lib/error-inject.c
> > > new file mode 100644
> > > index 000000000000..2d696dc94893
> > > --- /dev/null
> > > +++ b/arch/arm/lib/error-inject.c
> > > @@ -0,0 +1,19 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +
> > > +#include <linux/error-injection.h>
> > > +#include <linux/kprobes.h>
> > > +
> > > +void override_function_with_return(struct pt_regs *regs)
> > > +{
> > > +	/*
> > > +	 * 'regs' represents the state on entry of a predefined function in
> > > +	 * the kernel/module and which is captured on a kprobe.
> > > +	 *
> > > +	 * 'regs->ARM_lr' contains the the link register for the probed
> > > +	 * function, when kprobe returns back from exception it will override
> > > +	 * the end of probed function and directly return to the predefined
> > > +	 * function's caller.
> > > +	 */
> > > +	instruction_pointer_set(regs, regs->ARM_lr);
> > > +}
> > > +NOKPROBE_SYMBOL(override_function_with_return);
> > > -- 
> > > 2.17.1
> > > 
> > 
> 
> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
> According to speedtest.net: 11.9Mbps down 500kbps up

^ permalink raw reply

* RE: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self wakeup
From: Joakim Zhang @ 2019-08-29  7:30 UTC (permalink / raw)
  To: Sean Nyekjaer, mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
	Martin Hundebøll
In-Reply-To: <6a9bc081-334a-df91-3a23-b74a6cdd3633@geanix.com>


> -----Original Message-----
> From: Sean Nyekjaer <sean@geanix.com>
> Sent: 2019年8月28日 21:25
> To: Joakim Zhang <qiangqing.zhang@nxp.com>; mkl@pengutronix.de;
> linux-can@vger.kernel.org
> Cc: wg@grandegger.com; netdev@vger.kernel.org; dl-linux-imx
> <linux-imx@nxp.com>; Martin Hundebøll <martin@geanix.com>
> Subject: Re: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self
> wakeup
> 
> 
> 
> On 20/08/2019 13.55, Sean Nyekjaer wrote:
> >
> > I have added some more debug, same test setup:
> >
> https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgist.gith
> ub.com%2Fsknsean%2F81208714de23aa3639d3e31dccb2f3e0&amp;data=02%
> 7C01%7Cqiangqing.zhang%40nxp.com%7C1dae7ec4f191462be43d08d72bbb1
> e82%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C63702595498952
> 9813&amp;sdata=u2t747L4mi87ejuaLwl4WWJNJIFIzvlEfNE%2BSLq2Kbc%3D&
> amp;reserved=0
> >
> > root@iwg26:~# systemctl suspend
> >
> > ...
> >
> https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgist.gith
> ub.com%2Fsknsean%2F2a786f1543305056d4de03d387872403&amp;data=02
> %7C01%7Cqiangqing.zhang%40nxp.com%7C1dae7ec4f191462be43d08d72bbb
> 1e82%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C6370259549895
> 29813&amp;sdata=WbI2eeIlrNC6HUSviagaiAyi4YpK%2B2bC3al4Yn9EXZM%3D
> &amp;reserved=0
> >
> > /Sean
> 
> Any luck reproducing this?

Hi Sean,

I'm sorry that I can't get the debug log as the site can't be reached. And I connect two boards to do test at my side, this issue can't be reproduced.

Best Regards,
Joakim Zhang
> /Sean

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/3] capability: introduce CAP_BPF and CAP_TRACING
From: Toke Høiland-Jørgensen @ 2019-08-29  7:44 UTC (permalink / raw)
  To: Alexei Starovoitov, luto
  Cc: davem, peterz, rostedt, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190829051253.1927291-1-ast@kernel.org>

Alexei Starovoitov <ast@kernel.org> writes:

> CAP_BPF allows the following BPF operations:
> - Loading all types of BPF programs
> - Creating all types of BPF maps except:
>    - stackmap that needs CAP_TRACING
>    - devmap that needs CAP_NET_ADMIN
>    - cpumap that needs CAP_SYS_ADMIN

Why CAP_SYS_ADMIN instead of CAP_NET_ADMIN for cpumap?

-Toke

^ permalink raw reply

* Re: [PATCH 1/2] rtnetlink: gate MAC address with an LSM hook
From: Michal Kubecek @ 2019-08-29  7:45 UTC (permalink / raw)
  To: netdev; +Cc: Paul Moore, Jeffrey Vander Stoep, David Miller, LSM List, selinux
In-Reply-To: <CAHC9VhRmmEp_nFtOFy_YRa9NwZA4qPnjw7D3JQvqED-tO4Ha1g@mail.gmail.com>

On Tue, Aug 27, 2019 at 04:47:04PM -0400, Paul Moore wrote:
> 
> I'm also not a big fan of inserting the hook in rtnl_fill_ifinfo(); as
> presented it is way too specific for a LSM hook for me to be happy.
> However, I do agree that giving the LSMs some control over netlink
> messages makes sense.  As others have pointed out, it's all a matter
> of where to place the hook.
> 
> If we only care about netlink messages which leverage nlattrs I
> suppose one option that I haven't seen mentioned would be to place a
> hook in nla_put().  While it is a bit of an odd place for a hook, it
> would allow the LSM easy access to the skb and attribute type to make
> decisions, and all of the callers should already be checking the
> return code (although we would need to verify this).  One notable
> drawback (not the only one) is that the hook is going to get hit
> multiple times for each message.

For most messages, "multiple times" would mean tens, for many even
hundreds of calls. For each, you would have to check corresponding
socket (and possibly also genetlink header) to see which netlink based
protocol it is and often even parse existing part of the message to get
the context (because the same numeric attribute type can mean something
completely different if it appears in a nested attribute).

Also, nla_put() (or rather __nla_put()) is not used for all attributes,
one may also use nla_reserve() and then compose the attribute date in
place.

Michal Kubecek

^ permalink raw reply

* Re: [v1] net_sched: act_police: add 2 new attributes to support police 64bit rate and peakrate
From: Eric Dumazet @ 2019-08-29  8:32 UTC (permalink / raw)
  To: David Dai, jhs, xiyou.wangcong, jiri, davem, netdev, linux-kernel; +Cc: zdai
In-Reply-To: <1567032687-973-1-git-send-email-zdai@linux.vnet.ibm.com>



On 8/29/19 12:51 AM, David Dai wrote:
> For high speed adapter like Mellanox CX-5 card, it can reach upto
> 100 Gbits per second bandwidth. Currently htb already supports 64bit rate
> in tc utility. However police action rate and peakrate are still limited
> to 32bit value (upto 32 Gbits per second). Add 2 new attributes
> TCA_POLICE_RATE64 and TCA_POLICE_RATE64 in kernel for 64bit support
> so that tc utility can use them for 64bit rate and peakrate value to
> break the 32bit limit, and still keep the backward binary compatibility.
> 
> Tested-by: David Dai <zdai@linux.vnet.ibm.com>
> Signed-off-by: David Dai <zdai@linux.vnet.ibm.com>
> ---
>  include/uapi/linux/pkt_cls.h |    2 ++
>  net/sched/act_police.c       |   27 +++++++++++++++++++++++----
>  2 files changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
> index b057aee..eb4ea4d 100644
> --- a/include/uapi/linux/pkt_cls.h
> +++ b/include/uapi/linux/pkt_cls.h
> @@ -159,6 +159,8 @@ enum {
>  	TCA_POLICE_AVRATE,
>  	TCA_POLICE_RESULT,
>  	TCA_POLICE_TM,
> +	TCA_POLICE_RATE64,
> +	TCA_POLICE_PEAKRATE64,
>  	TCA_POLICE_PAD,
>  	__TCA_POLICE_MAX
>  #define TCA_POLICE_RESULT TCA_POLICE_RESULT

Never insert new attributes, as this breaks compatibility with old binaries (including
old kernels)

Keep TCA_POLICE_PAD value the same, thanks.

^ permalink raw reply

* Re: [PATCH net-next 06/15] net: sgi: ioc3-eth: get rid of ioc3_clean_rx_ring()
From: Thomas Bogendoerfer @ 2019-08-29  8:52 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190828160246.7b211f8a@cakuba.netronome.com>

On Wed, 28 Aug 2019 16:02:46 -0700
Jakub Kicinski <jakub.kicinski@netronome.com> wrote:

> On Wed, 28 Aug 2019 16:03:05 +0200, Thomas Bogendoerfer wrote:
> > Clean rx ring is just called once after a new ring is allocated, which
> > is per definition clean. So there is not need for this function.
> > 
> > Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
> > ---
> >  drivers/net/ethernet/sgi/ioc3-eth.c | 21 ---------------------
> >  1 file changed, 21 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
> > index 6ca560d4ab79..39631e067b71 100644
> > --- a/drivers/net/ethernet/sgi/ioc3-eth.c
> > +++ b/drivers/net/ethernet/sgi/ioc3-eth.c
> > @@ -761,26 +761,6 @@ static void ioc3_mii_start(struct ioc3_private *ip)
> >  	add_timer(&ip->ioc3_timer);
> >  }
> >  
> > -static inline void ioc3_clean_rx_ring(struct ioc3_private *ip)
> > -{
> > -	struct ioc3_erxbuf *rxb;
> > -	struct sk_buff *skb;
> > -	int i;
> > -
> > -	for (i = ip->rx_ci; i & 15; i++) {
> > -		ip->rx_skbs[ip->rx_pi] = ip->rx_skbs[ip->rx_ci];
> > -		ip->rxr[ip->rx_pi++] = ip->rxr[ip->rx_ci++];
> > -	}
> > -	ip->rx_pi &= RX_RING_MASK;
> > -	ip->rx_ci &= RX_RING_MASK;
> > -
> > -	for (i = ip->rx_ci; i != ip->rx_pi; i = (i + 1) & RX_RING_MASK) {
> > -		skb = ip->rx_skbs[i];
> > -		rxb = (struct ioc3_erxbuf *)(skb->data - RX_OFFSET);
> > -		rxb->w0 = 0;
> 
> There's gotta be some purpose to setting this w0 word to zero no?
> ioc3_rx() uses that to see if the descriptor is done, and dutifully
> clears it after..

you are right. I thought this is already done in alloc_rx_bufs, but it isn't.
I'll add it there and put it into this patch. /me wonders why testing
didn't show this...

Thomas.

-- 
SUSE Software Solutions Germany GmbH
HRB 247165 (AG München)
Geschäftsführer: Felix Imendörffer

^ permalink raw reply

* RE: [PATCH v1 1/5] mdev: Introduce sha1 based mdev alias
From: Parav Pandit @ 2019-08-29  9:06 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jiri Pirko, kwankhede@nvidia.com, cohuck@redhat.com,
	davem@davemloft.net, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190828152544.16ba2617@x1.home>



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, August 29, 2019 2:56 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>; kwankhede@nvidia.com;
> cohuck@redhat.com; davem@davemloft.net; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH v1 1/5] mdev: Introduce sha1 based mdev alias
> 

> > +	/* Allocate and init descriptor */
> > +	hash_desc = kvzalloc(sizeof(*hash_desc) +
> > +			     crypto_shash_descsize(alias_hash),
> > +			     GFP_KERNEL);
> > +	if (!hash_desc)
> > +		goto desc_err;
> > +
> > +	hash_desc->tfm = alias_hash;
> > +
> > +	digest_size = crypto_shash_digestsize(alias_hash);
> > +
> > +	digest = kzalloc(digest_size, GFP_KERNEL);
> > +	if (!digest) {
> > +		ret = -ENOMEM;
> > +		goto digest_err;
> > +	}
> > +	crypto_shash_init(hash_desc);
> > +	crypto_shash_update(hash_desc, uuid, UUID_STRING_LEN);
> > +	crypto_shash_final(hash_desc, digest);
> 
> All of these can fail and many, if not most, of the callers appear that they might
> test the return value.  Thanks,
Right. Changing the signature and honoring return value in v2.

> 
> Alex

^ permalink raw reply

* RE: [PATCH v1 1/5] mdev: Introduce sha1 based mdev alias
From: Parav Pandit @ 2019-08-29  9:07 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jiri Pirko, kwankhede@nvidia.com, cohuck@redhat.com,
	davem@davemloft.net, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190828153431.36c37232@x1.home>



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, August 29, 2019 3:05 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>; kwankhede@nvidia.com;
> cohuck@redhat.com; davem@davemloft.net; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH v1 1/5] mdev: Introduce sha1 based mdev alias
> 
> On Wed, 28 Aug 2019 15:25:44 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Tue, 27 Aug 2019 14:16:50 -0500
> > Parav Pandit <parav@mellanox.com> wrote:
> > >  module_init(mdev_init)
> > > diff --git a/drivers/vfio/mdev/mdev_private.h
> b/drivers/vfio/mdev/mdev_private.h
> > > index 7d922950caaf..cf1c0d9842c6 100644
> > > --- a/drivers/vfio/mdev/mdev_private.h
> > > +++ b/drivers/vfio/mdev/mdev_private.h
> > > @@ -33,6 +33,7 @@ struct mdev_device {
> > >  	struct kobject *type_kobj;
> > >  	struct device *iommu_device;
> > >  	bool active;
> > > +	const char *alias;
> 
> Nit, put this above active to avoid creating a hole in the structure.
> Thanks,
> 
Ack.

> Alex

^ permalink raw reply

* RE: [PATCH v1 2/5] mdev: Make mdev alias unique among all mdevs
From: Parav Pandit @ 2019-08-29  9:07 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jiri Pirko, kwankhede@nvidia.com, cohuck@redhat.com,
	davem@davemloft.net, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190828153652.7eb6d6d6@x1.home>



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, August 29, 2019 3:07 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>; kwankhede@nvidia.com;
> cohuck@redhat.com; davem@davemloft.net; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH v1 2/5] mdev: Make mdev alias unique among all mdevs
> 
> On Tue, 27 Aug 2019 14:16:51 -0500
> Parav Pandit <parav@mellanox.com> wrote:
> 
> > Mdev alias should be unique among all the mdevs, so that when such
> > alias is used by the mdev users to derive other objects, there is no
> > collision in a given system.
> >
> > Signed-off-by: Parav Pandit <parav@mellanox.com>
> >
> > ---
> > Changelog:
> > v0->v1:
> >  - Fixed inclusiong of alias for NULL check
> >  - Added ratelimited debug print for sha1 hash collision error
> > ---
> >  drivers/vfio/mdev/mdev_core.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/drivers/vfio/mdev/mdev_core.c
> > b/drivers/vfio/mdev/mdev_core.c index 62d29f57fe0c..4b9899e40665
> > 100644
> > --- a/drivers/vfio/mdev/mdev_core.c
> > +++ b/drivers/vfio/mdev/mdev_core.c
> > @@ -375,6 +375,13 @@ int mdev_device_create(struct kobject *kobj, struct
> device *dev,
> >  			ret = -EEXIST;
> >  			goto mdev_fail;
> >  		}
> > +		if (tmp->alias && alias && strcmp(tmp->alias, alias) == 0) {
> 
> Nit, test if the device we adding has an alias before the device we're testing
> against.  The compiler can better optimize keeping alias hot.
> Thanks,
> 
Ok. will do.

> Alex

^ permalink raw reply

* Re: [RFC PATCH 2/3] reorganize ptp_kvm modules to make it arch-independent.
From: Marc Zyngier @ 2019-08-29  9:09 UTC (permalink / raw)
  To: Jianyong Wu, netdev, pbonzini, sean.j.christopherson,
	richardcochran, Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he
In-Reply-To: <20190829063952.18470-3-jianyong.wu@arm.com>

On 29/08/2019 07:39, Jianyong Wu wrote:
> Currently, ptp_kvm modules implementation is only for x86 which includs
> large part of arch-specific code.  This patch move all of those code
> into related arch directory.
> 
> Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
> ---
>  arch/x86/kvm/arch_ptp_kvm.c          | 92 ++++++++++++++++++++++++++++
>  drivers/ptp/Makefile                 |  1 +
>  drivers/ptp/{ptp_kvm.c => kvm_ptp.c} | 77 ++++++-----------------
>  include/asm-generic/ptp_kvm.h        | 12 ++++
>  4 files changed, 123 insertions(+), 59 deletions(-)
>  create mode 100644 arch/x86/kvm/arch_ptp_kvm.c
>  rename drivers/ptp/{ptp_kvm.c => kvm_ptp.c} (63%)
>  create mode 100644 include/asm-generic/ptp_kvm.h
> 
> diff --git a/arch/x86/kvm/arch_ptp_kvm.c b/arch/x86/kvm/arch_ptp_kvm.c
> new file mode 100644
> index 000000000000..56ea84a86da2
> --- /dev/null
> +++ b/arch/x86/kvm/arch_ptp_kvm.c
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + *  Virtual PTP 1588 clock for use with KVM guests
> + *
> + *  Copyright (C) 2019 ARM Ltd.
> + *  All Rights Reserved

No. This isn't ARM's code, not by a million mile. You've simply
refactored existing code. Please keep the correct attribution (i.e. that
of the original code).

> + */
> +
> +#include <asm/pvclock.h>
> +#include <asm/kvmclock.h>
> +#include <linux/module.h>
> +#include <uapi/asm/kvm_para.h>
> +#include <uapi/linux/kvm_para.h>
> +#include <linux/ptp_clock_kernel.h>
> +
> +phys_addr_t clock_pair_gpa;
> +struct kvm_clock_pairing clock_pair;
> +struct pvclock_vsyscall_time_info *hv_clock;
> +
> +int kvm_arch_ptp_init(void)
> +{
> +	int ret;
> +
> +	if (!kvm_para_available())
> +		return -ENODEV;
> +
> +	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
> +	hv_clock = pvclock_get_pvti_cpu0_va();
> +	if (!hv_clock)
> +		return -ENODEV;
> +
> +	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
> +			     KVM_CLOCK_PAIRING_WALLCLOCK);
> +	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
> +		return -ENODEV;
> +
> +	return 0;
> +}
> +
> +int kvm_arch_ptp_get_clock(struct timespec64 *ts)
> +{
> +	long ret;
> +
> +	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
> +			     clock_pair_gpa,
> +			     KVM_CLOCK_PAIRING_WALLCLOCK);
> +	if (ret != 0)
> +		return -EOPNOTSUPP;
> +
> +	ts->tv_sec = clock_pair.sec;
> +	ts->tv_nsec = clock_pair.nsec;
> +
> +	return 0;
> +}
> +
> +int kvm_arch_ptp_get_clock_fn(long *cycle, struct timespec64 *tspec,
> +			      struct clocksource **cs)
> +{
> +	unsigned long ret;
> +	unsigned int version;
> +	int cpu;
> +	struct pvclock_vcpu_time_info *src;
> +
> +	cpu = smp_processor_id();
> +	src = &hv_clock[cpu].pvti;
> +
> +	do {
> +		/*
> +		 * We are using a TSC value read in the hosts
> +		 * kvm_hc_clock_pairing handling.
> +		 * So any changes to tsc_to_system_mul
> +		 * and tsc_shift or any other pvclock
> +		 * data invalidate that measurement.
> +		 */
> +		version = pvclock_read_begin(src);
> +
> +		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
> +				     clock_pair_gpa,
> +				     KVM_CLOCK_PAIRING_WALLCLOCK);
> +		tspec->tv_sec = clock_pair.sec;
> +		tspec->tv_nsec = clock_pair.nsec;
> +		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
> +	} while (pvclock_read_retry(src, version));
> +
> +	*cs = &kvm_clock;
> +
> +	return 0;
> +}
> +
> +MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
> +MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
> +MODULE_LICENSE("GPL");
> diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
> index 677d1d178a3e..5a8c6462fc0f 100644
> --- a/drivers/ptp/Makefile
> +++ b/drivers/ptp/Makefile
> @@ -4,6 +4,7 @@
>  #
>  
>  ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
> +ptp_kvm-y				:= ../../arch/$(ARCH)/kvm/arch_ptp_kvm.o kvm_ptp.o
>  obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
>  obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
>  obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
> diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/kvm_ptp.c
> similarity index 63%
> rename from drivers/ptp/ptp_kvm.c
> rename to drivers/ptp/kvm_ptp.c
> index fc7d0b77e118..9d07cf872be7 100644
> --- a/drivers/ptp/ptp_kvm.c
> +++ b/drivers/ptp/kvm_ptp.c
> @@ -8,12 +8,12 @@
>  #include <linux/err.h>
>  #include <linux/init.h>
>  #include <linux/kernel.h>
> +#include <linux/slab.h>
>  #include <linux/module.h>
>  #include <uapi/linux/kvm_para.h>
>  #include <asm/kvm_para.h>
> -#include <asm/pvclock.h>
> -#include <asm/kvmclock.h>
>  #include <uapi/asm/kvm_para.h>
> +#include <asm-generic/ptp_kvm.h>
>  
>  #include <linux/ptp_clock_kernel.h>
>  
> @@ -24,56 +24,29 @@ struct kvm_ptp_clock {
>  
>  DEFINE_SPINLOCK(kvm_ptp_lock);
>  
> -static struct pvclock_vsyscall_time_info *hv_clock;
> -
> -static struct kvm_clock_pairing clock_pair;
> -static phys_addr_t clock_pair_gpa;
> -
>  static int ptp_kvm_get_time_fn(ktime_t *device_time,
>  			       struct system_counterval_t *system_counter,
>  			       void *ctx)
>  {
> -	unsigned long ret;
> +	unsigned long ret, cycle;
>  	struct timespec64 tspec;
> -	unsigned version;
> -	int cpu;
> -	struct pvclock_vcpu_time_info *src;
> +	struct clocksource *cs;
>  
>  	spin_lock(&kvm_ptp_lock);
>  
>  	preempt_disable_notrace();
> -	cpu = smp_processor_id();
> -	src = &hv_clock[cpu].pvti;
> -
> -	do {
> -		/*
> -		 * We are using a TSC value read in the hosts
> -		 * kvm_hc_clock_pairing handling.
> -		 * So any changes to tsc_to_system_mul
> -		 * and tsc_shift or any other pvclock
> -		 * data invalidate that measurement.
> -		 */
> -		version = pvclock_read_begin(src);
> -
> -		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
> -				     clock_pair_gpa,
> -				     KVM_CLOCK_PAIRING_WALLCLOCK);
> -		if (ret != 0) {
> -			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
> -			spin_unlock(&kvm_ptp_lock);
> -			preempt_enable_notrace();
> -			return -EOPNOTSUPP;
> -		}
> -
> -		tspec.tv_sec = clock_pair.sec;
> -		tspec.tv_nsec = clock_pair.nsec;
> -		ret = __pvclock_read_cycles(src, clock_pair.tsc);
> -	} while (pvclock_read_retry(src, version));
> +	ret = kvm_arch_ptp_get_clock_fn(&cycle, &tspec, &cs);
> +	if (ret != 0) {
> +		pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
> +		spin_unlock(&kvm_ptp_lock);
> +		preempt_enable_notrace();
> +		return -EOPNOTSUPP;
> +	}
>  
>  	preempt_enable_notrace();
>  
> -	system_counter->cycles = ret;
> -	system_counter->cs = &kvm_clock;
> +	system_counter->cycles = cycle;
> +	system_counter->cs = cs;
>  
>  	*device_time = timespec64_to_ktime(tspec);
>  
> @@ -116,17 +89,13 @@ static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
>  
>  	spin_lock(&kvm_ptp_lock);
>  
> -	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
> -			     clock_pair_gpa,
> -			     KVM_CLOCK_PAIRING_WALLCLOCK);
> +	ret = kvm_arch_ptp_get_clock(&tspec);
>  	if (ret != 0) {
>  		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
>  		spin_unlock(&kvm_ptp_lock);
>  		return -EOPNOTSUPP;
>  	}
>  
> -	tspec.tv_sec = clock_pair.sec;
> -	tspec.tv_nsec = clock_pair.nsec;
>  	spin_unlock(&kvm_ptp_lock);
>  
>  	memcpy(ts, &tspec, sizeof(struct timespec64));
> @@ -166,21 +135,11 @@ static void __exit ptp_kvm_exit(void)
>  
>  static int __init ptp_kvm_init(void)
>  {
> -	long ret;
> -
> -	if (!kvm_para_available())
> -		return -ENODEV;
> -
> -	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
> -	hv_clock = pvclock_get_pvti_cpu0_va();
> -
> -	if (!hv_clock)
> -		return -ENODEV;
> +	int ret;
>  
> -	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
> -			KVM_CLOCK_PAIRING_WALLCLOCK);
> -	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
> -		return -ENODEV;
> +	ret = kvm_arch_ptp_init();
> +	if (IS_ERR(ret))
> +		return ret;
>  
>  	kvm_ptp_clock.caps = ptp_kvm_caps;
>  
> diff --git a/include/asm-generic/ptp_kvm.h b/include/asm-generic/ptp_kvm.h
> new file mode 100644
> index 000000000000..128a9d7af161
> --- /dev/null
> +++ b/include/asm-generic/ptp_kvm.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + *  Virtual PTP 1588 clock for use with KVM guests
> + *
> + *  Copyright (C) 2019 ARM Ltd.
> + *  All Rights Reserved

Same here.

> + */
> +
> +static int kvm_arch_ptp_init(void);
> +static int kvm_arch_ptp_get_clock(struct timespec64 *ts);
> +static int kvm_arch_ptp_get_clock_fn(long *cycle,
> +		struct timespec64 *tspec, void *cs);
> 

	M.
-- 
Jazz is not dead, it just smells funny...

^ permalink raw reply

* [PATCH v3 2/2] net: mscc: Implement promisc mode.
From: Horatiu Vultur @ 2019-08-29  9:22 UTC (permalink / raw)
  To: alexandre.belloni, UNGLinuxDriver, davem, andrew, allan.nielsen,
	jiri, ivecera, f.fainelli, netdev, linux-kernel
  Cc: Horatiu Vultur
In-Reply-To: <1567070549-29255-1-git-send-email-horatiu.vultur@microchip.com>

When a port is added to the bridge, the port is added in promisc mode. But
the HW is capable of switching the frames therefore is not needed for the
port to be added in promisc mode. In case a user space application requires
for the port to enter promisc mode then is it needed to enter the promisc
mode.

Therefore listen in when the promiscuity on a dev is change and when the
port enters or leaves a bridge. Having this information it is possible to
know when to set the port in promisc mode and when not:
If the port is part of bridge and promiscuity > 1 or if the port is not
part of bridge and promiscuity is > 0 then add then add the port in promisc
mode otherwise don't.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 47 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 4d1bce4..292fcc1 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1294,6 +1294,37 @@ static void ocelot_port_attr_mc_set(struct ocelot_port *port, bool mc)
 	ocelot_write_gix(ocelot, val, ANA_PORT_CPU_FWD_CFG, port->chip_port);
 }
 
+static void ocelot_port_attr_promisc_set(struct ocelot_port *port,
+					 unsigned int promisc,
+					 bool is_bridge_port)
+{
+	struct ocelot *ocelot = port->ocelot;
+	u32 val;
+
+	val = ocelot_read_gix(ocelot, ANA_PORT_CPU_FWD_CFG, port->chip_port);
+
+	/* When a port is added to a bridge, the port is added in promisc mode,
+	 * by calling the function 'ndo_set_rx_mode'. But the HW is capable
+	 * of switching the frames therefore is not needed for the port to
+	 * enter in promisc mode.
+	 * But a port needs to be added in promisc mode if an application
+	 * requires it(pcap library). Therefore listen when the
+	 * dev->promiscuity is change and when the port is added or removed from
+	 * the bridge. Using this information, calculate if the promisc mode
+	 * is required in the following way:
+	 */
+	if (!is_bridge_port && promisc > 0) {
+		val |= ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA;
+	} else {
+		if (is_bridge_port && promisc > 1)
+			val |= ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA;
+		else
+			val &= ~(ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA);
+	}
+
+	ocelot_write_gix(ocelot, val, ANA_PORT_CPU_FWD_CFG, port->chip_port);
+}
+
 static int ocelot_port_attr_set(struct net_device *dev,
 				const struct switchdev_attr *attr,
 				struct switchdev_trans *trans)
@@ -1316,6 +1347,10 @@ static int ocelot_port_attr_set(struct net_device *dev,
 	case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
 		ocelot_port_attr_mc_set(ocelot_port, !attr->u.mc_disabled);
 		break;
+	case SWITCHDEV_ATTR_ID_PORT_PROMISCUITY:
+		ocelot_port_attr_promisc_set(ocelot_port, dev->promiscuity,
+					     netif_is_bridge_port(dev));
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -1688,6 +1723,18 @@ static int ocelot_netdevice_port_event(struct net_device *dev,
 
 			ocelot_vlan_port_apply(ocelot_port->ocelot,
 					       ocelot_port);
+
+			/* In case the port is added or removed from the bridge
+			 * is it needed to recaulculate the promiscuity. The
+			 * reason is that when a port leaves the bridge first
+			 * it decrease the promiscuity and then the flag
+			 * IFF_BRIDGE_PORT is removed from dev. Therefor the
+			 * function ocelot_port_attr_promisc is called with
+			 * the wrong arguments.
+			 */
+			ocelot_port_attr_promisc_set(ocelot_port,
+						     dev->promiscuity,
+						     info->linking);
 		}
 		if (netif_is_lag_master(info->upper_dev)) {
 			if (info->linking)
-- 
2.7.4


^ permalink raw reply related

* [PATCH v3 1/2] net: core: Notify on changes to dev->promiscuity.
From: Horatiu Vultur @ 2019-08-29  9:22 UTC (permalink / raw)
  To: alexandre.belloni, UNGLinuxDriver, davem, andrew, allan.nielsen,
	jiri, ivecera, f.fainelli, netdev, linux-kernel
  Cc: Horatiu Vultur
In-Reply-To: <1567070549-29255-1-git-send-email-horatiu.vultur@microchip.com>

Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
used to indicate whenever the dev promiscuity counter is changed.

The notification doesn't use any switchdev_attr attribute because in the
notifier callbacks is it possible to get the dev and read directly
the promiscuity value.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 include/net/switchdev.h | 1 +
 net/core/dev.c          | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index aee86a1..14b1617 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -40,6 +40,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
 	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
 	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+	SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
 };
 
 struct switchdev_attr {
diff --git a/net/core/dev.c b/net/core/dev.c
index 49589ed..40c74f2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -142,6 +142,7 @@
 #include <linux/net_namespace.h>
 #include <linux/indirect_call_wrapper.h>
 #include <net/devlink.h>
+#include <net/switchdev.h>
 
 #include "net-sysfs.h"
 
@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 {
 	unsigned int old_flags = dev->flags;
+	struct switchdev_attr attr = {
+		.orig_dev = dev,
+		.id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
+		.flags = SWITCHDEV_F_DEFER,
+	};
 	kuid_t uid;
 	kgid_t gid;
 
@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 	}
 	if (notify)
 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
+
+	switchdev_port_attr_set(dev, &attr);
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH v3 0/2] net: core: Notify on changes to dev->promiscuity
From: Horatiu Vultur @ 2019-08-29  9:22 UTC (permalink / raw)
  To: alexandre.belloni, UNGLinuxDriver, davem, andrew, allan.nielsen,
	jiri, ivecera, f.fainelli, netdev, linux-kernel
  Cc: Horatiu Vultur

Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
used to indicate whenever the dev promiscuity counter is changed.

HW that has bridge capabilities are not required to set their ports in
promisc mode when they are added to the bridge. But it is required to
enable promisc mode if a user space application requires it. Therefore
if the driver listens for events SWITCHDEV_ATTR_ID_PORT_PROMISCUITY and
NETDEV_CHANGEUPPER it is possible to deduce when to add/remove ports in
promisc mode.

v2->v3:
  - stop using feature NETIF_F_HW_BR_CAP
  - renmae subject from 'Add NETIF_F_BW_BR_CAP feature' because this
    feature is not used anymore.

v1->v2:
  - rename feature to NETIF_F_HW_BR_CAP
  - add better description in the commit message and in the code
  - remove the check that all network drivers have same netdev_ops and
    just check for the feature NETIF_F_HW_BR_CAP when setting the
    network port in promisc mode.

Horatiu Vultur (2):
  net: core: Notify on changes to dev->promiscuity.
  net: mscc: Implement promisc mode.

 drivers/net/ethernet/mscc/ocelot.c | 47 ++++++++++++++++++++++++++++++++++++++
 include/net/switchdev.h            |  1 +
 net/core/dev.c                     |  9 ++++++++
 3 files changed, 57 insertions(+)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next 2/2] net/mlx5e: Move local var definition into ifdef block
From: Sergei Shtylyov @ 2019-08-29  9:29 UTC (permalink / raw)
  To: Vlad Buslov, netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, saeedm, idosch, tanhuazhong
In-Reply-To: <20190828164104.6020-3-vladbu@mellanox.com>

Hello!

On 28.08.2019 19:41, Vlad Buslov wrote:

> New local variable "struct flow_block_offload *f" was added to
> mlx5e_setup_tc() in recent rtnl lock removal patches. The variable is used
> in code that is only compiled when CONFIG_MLX5_ESWITCH is enabled. This
> results compilation warning about unused variable when CONFIG_MLX5_ESWITCH
> is not set. Move the variable definition into eswitch-specific code block
> from the begging of mlx5e_setup_tc() function.

    Beginning?

> Fixes: c9f14470d048 ("net: sched: add API for registering unlocked offload block callbacks")
> Reported-by: tanhuazhong <tanhuazhong@huawei.com>
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [RFC PATCH 1/1] phylink: Set speed to SPEED_UNKNOWN when there is no PHY connected
From: Vladimir Oltean @ 2019-08-29  9:34 UTC (permalink / raw)
  To: Russell King - ARM Linux admin
  Cc: Andrew Lunn, Florian Fainelli, Arseny Solokha, netdev,
	Vinicius Costa Gomes, leandro.maciel.dorileo
In-Reply-To: <20190828171431.GR13294@shell.armlinux.org.uk>

Hi Russell,

On Wed, 28 Aug 2019 at 20:14, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Wed, Aug 28, 2019 at 05:58:02PM +0300, Vladimir Oltean wrote:
> > phylink_ethtool_ksettings_get can be called while the interface may not
> > even be up, which should not be a problem. But there are drivers (e.g.
> > gianfar) which connect to the PHY in .ndo_open and disconnect in
> > .ndo_close. While odd, to my knowledge this is again not illegal and
> > there may be more that do the same. But PHYLINK for example has this
> > check in phylink_ethtool_ksettings_get:
> >
> >       if (pl->phydev) {
> >               phy_ethtool_ksettings_get(pl->phydev, kset);
> >       } else {
> >               kset->base.port = pl->link_port;
> >       }
> >
> > So it will not populate kset->base.speed if there is no PHY connected.
> > The speed will be 0, by way of a previous memset. Not SPEED_UNKNOWN.
> > It is arguable whether that is legal or not. include/uapi/linux/ethtool.h
> > says:
> >
> >       All values 0 to INT_MAX are legal.
> >
> > By that measure it may be. But it sure would make users of the
> > __ethtool_get_link_ksettings API need make more complicated checks
> > (against -1, against 0, 1, etc). So far the kernel community has been ok
> > with just checking for SPEED_UNKNOWN.
> >
> > Take net/sched/sch_taprio.c for example. The check in
> > taprio_set_picos_per_byte is currently not robust enough and will
> > trigger this division by zero, due to PHYLINK not setting SPEED_UNKNOWN:
> >
> >       if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
> >           ecmd.base.speed != SPEED_UNKNOWN)
> >               picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
> >                                          ecmd.base.speed * 1000 * 1000);
>
> The ethtool API says:
>
>  * If it is enabled then they are read-only; if the link
>  * is up they represent the negotiated link mode; if the link is down,
>  * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
>  * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
>
> So, it seems that taprio is not following the API... I'd suggest either
> fixing taprio, or getting agreement to change the ethtool API.
>

How would you suggest rewriting the line above in taprio to make
correct and robust use of the ethtool API?

> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
> According to speedtest.net: 11.9Mbps down 500kbps up

Regards,
-Vladimir

^ permalink raw reply

* Re: [PATCH v3 1/2] net: core: Notify on changes to dev->promiscuity.
From: Jiri Pirko @ 2019-08-29  9:51 UTC (permalink / raw)
  To: Horatiu Vultur
  Cc: alexandre.belloni, UNGLinuxDriver, davem, andrew, allan.nielsen,
	ivecera, f.fainelli, netdev, linux-kernel
In-Reply-To: <1567070549-29255-2-git-send-email-horatiu.vultur@microchip.com>

Thu, Aug 29, 2019 at 11:22:28AM CEST, horatiu.vultur@microchip.com wrote:
>Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
>used to indicate whenever the dev promiscuity counter is changed.
>
>The notification doesn't use any switchdev_attr attribute because in the
>notifier callbacks is it possible to get the dev and read directly
>the promiscuity value.
>
>Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
>---
> include/net/switchdev.h | 1 +
> net/core/dev.c          | 9 +++++++++
> 2 files changed, 10 insertions(+)
>
>diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>index aee86a1..14b1617 100644
>--- a/include/net/switchdev.h
>+++ b/include/net/switchdev.h
>@@ -40,6 +40,7 @@ enum switchdev_attr_id {
> 	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
> 	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
> 	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
>+	SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> };
> 
> struct switchdev_attr {
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 49589ed..40c74f2 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -142,6 +142,7 @@
> #include <linux/net_namespace.h>
> #include <linux/indirect_call_wrapper.h>
> #include <net/devlink.h>
>+#include <net/switchdev.h>
> 
> #include "net-sysfs.h"
> 
>@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
> static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> {
> 	unsigned int old_flags = dev->flags;
>+	struct switchdev_attr attr = {
>+		.orig_dev = dev,
>+		.id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
>+		.flags = SWITCHDEV_F_DEFER,

NACK

This is invalid usecase for switchdev infra. Switchdev is there for
bridge offload purposes only.

For promiscuity changes, the infrastructure is already present in the
code. See __dev_notify_flags(). it calls:
call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info)
and you can actually see the changed flag in ".flags_changed".

You just have to register netdev notifier block in your driver. Grep
for: register_netdevice_notifier


>+	};
> 	kuid_t uid;
> 	kgid_t gid;
> 
>@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> 	}
> 	if (notify)
> 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
>+
>+	switchdev_port_attr_set(dev, &attr);
>+
> 	return 0;
> }
> 
>-- 
>2.7.4
>

^ permalink raw reply

* Re: [PATCH v2 0/2] Update ethernet compatible string for SiFive FU540
From: Yash Shah @ 2019-08-29  9:54 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, devicetree, linux-kernel@vger.kernel.org List,
	linux-riscv, Rob Herring, Mark Rutland, Nicolas Ferre,
	Palmer Dabbelt, Paul Walmsley, Petr Štetiar, Sachin Ghadi
In-Reply-To: <20190828.140644.534529249197568915.davem@davemloft.net>

On Thu, Aug 29, 2019 at 2:36 AM David Miller <davem@davemloft.net> wrote:
>
> From: Yash Shah <yash.shah@sifive.com>
> Date: Tue, 27 Aug 2019 10:36:02 +0530
>
> > This patch series renames the compatible property to a more appropriate
> > string. The patchset is based on Linux-5.3-rc6 and tested on SiFive
> > Unleashed board
>
> You should always base changes off of "net" or "net-next" and be explicitly
> in your Subject lines which of those two trees your changes are for f.e.:
>
>         Subject: [PATCH v2 net-next N/M] ...

I will keep this in mind for future patches.

>
> >
> > Change history:
> > Since v1:
> > - Dropped PATCH3 because it's already merged
> > - Change the reference url in the patch descriptions to point to a
> >   'lore.kernel.org' link instead of 'lkml.org'
>
> Series applied to 'net'.

Thanks!

- Yash

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox