Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER
From: Carlos Eduardo de Paula @ 2019-08-23 22:54 UTC (permalink / raw)
  To: David Abdurachmanov
  Cc: Paul Walmsley, Palmer Dabbelt, Albert Ou, Oleg Nesterov,
	Kees Cook, Andy Lutomirski, Will Drewry, Shuah Khan,
	Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David Abdurachmanov, Thomas Gleixner,
	Allison Randal, Alexios Zavras, Anup Patel, Vincent Chen,
	Alan Kao, linux-riscv, linux-kernel, linux-kselftest, netdev, bpf
In-Reply-To: <20190822205533.4877-1-david.abdurachmanov@sifive.com>

On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
>
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
>
> Fully passes libseccomp regression testing (simulation and live).
>
> There is one failing kernel selftest: global.user_notification_signal
>
> v1 -> v2:
>   - return immediately if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
>
> Cc: keescook@chromium.org
> Cc: me@carlosedp.com
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> ---
>  arch/riscv/Kconfig                            | 14 ++++++++++
>  arch/riscv/include/asm/seccomp.h              | 10 +++++++
>  arch/riscv/include/asm/thread_info.h          |  5 +++-
>  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
>  arch/riscv/kernel/ptrace.c                    | 10 +++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
>         select GENERIC_SMP_IDLE_THREAD
>         select GENERIC_ATOMIC64 if !64BIT
>         select HAVE_ARCH_AUDITSYSCALL
> +       select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_MEMBLOCK_NODE_MAP
>         select HAVE_DMA_CONTIGUOUS
>         select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +       bool "Enable seccomp to safely compute untrusted bytecode"
> +       help
> +         This kernel feature is useful for number crunching applications
> +         that may need to compute untrusted bytecode during their
> +         execution. By using pipes or other transports made available to
> +         the process as file descriptors supporting the read/write
> +         syscalls, it's possible to isolate those applications in
> +         their own address space using seccomp. Once seccomp is
> +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +         and the task is only allowed to execute a few safe syscalls
> +         defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index 000000000000..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include <asm/unistd.h>
> +
> +#include <asm-generic/seccomp.h>
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP            8       /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +        _TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check_syscall_nr:
>         /* Check to make sure we don't jump to a bogus syscall number. */
>         li t0, __NR_syscalls
>         la s0, sys_ni_syscall
> -       /* Syscall number held in a7 */
> -       bgeu a7, t0, 1f
> +       /*
> +        * The tracer can change syscall number to valid/invalid value.
> +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> +        * cannot trust the current value in a7 and have to reload from
> +        * the current task pt_regs.
> +        */
> +       REG_L a7, PT_A7(sp)
> +       /*
> +        * Syscall number held in a7.
> +        * If syscall number is above allowed value, redirect to ni_syscall.
> +        */
> +       bge a7, t0, 1f
> +       /*
> +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +        * If yes, we pretend it was executed.
> +        */
> +       li t1, -1
> +       beq a7, t1, ret_from_syscall_rejected
> +       /* Call syscall */
>         la s0, sys_call_table
>         slli t0, a7, RISCV_LGPTR
>         add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
>         /* Set user a0 to kernel a0 */
>         REG_S a0, PT_A0(sp)
> +       /*
> +        * We didn't execute the actual syscall.
> +        * Seccomp already set return value for the current task pt_regs.
> +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +        */
> +ret_from_syscall_rejected:
>         /* Trace syscalls, but only if requested by the user. */
>         REG_L t0, TASK_TI_FLAGS(tp)
>         andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>                 if (tracehook_report_syscall_entry(regs))
>                         syscall_set_nr(current, regs, -1);
>
> +       /*
> +        * Do the secure computing after ptrace; failures should be fast.
> +        * If this fails we might have return value in a0 from seccomp
> +        * (via SECCOMP_RET_ERRNO/TRACE).
> +        */
> +       if (secure_computing(NULL) == -1) {
> +               syscall_set_nr(current, regs, -1);
> +               return;
> +       }
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS     struct user_pt_regs
>  # define SYSCALL_NUM   regs[8]
>  # define SYSCALL_RET   regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS     struct user_regs_struct
> +# define SYSCALL_NUM   a7
> +# define SYSCALL_RET   a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS     struct user_regs_struct
>  # define SYSCALL_NUM   gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
>         EXPECT_EQ(0, ret) {}
>
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -    defined(__s390__) || defined(__hppa__)
> +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
>         {
>                 regs.SYSCALL_NUM = syscall;
>         }
> --
> 2.21.0
>

Tested-by: Carlos de Paula <me@carlosedp.com>
-- 
________________________________________
Carlos Eduardo de Paula
me@carlosedp.com
http://carlosedp.com
http://twitter.com/carlosedp
Linkedin
________________________________________

^ permalink raw reply

* Re: [PATCHv4 net 2/2] xfrm/xfrm_policy: fix dst dev null pointer dereference in collect_md mode
From: Jonathan Lemon @ 2019-08-23 22:30 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: netdev, Stefano Brivio, wenxu, Alexei Starovoitov,
	David S . Miller, Eric Dumazet, Julian Anastasov
In-Reply-To: <20190822141949.29561-3-liuhangbin@gmail.com>

On 22 Aug 2019, at 7:19, Hangbin Liu wrote:

> In decode_session{4,6} there is a possibility that the skb dst dev is NULL,
> e.g., with tunnel collect_md mode, which will cause a kernel crash.
> Here is what the code path looks like, for GRE:
>
> - ip6gre_tunnel_xmit
>   - ip6gre_xmit_ipv6
>     - __gre6_xmit
>       - ip6_tnl_xmit
>         - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
>     - icmpv6_send
>       - icmpv6_route_lookup
>         - xfrm_decode_session_reverse
>           - decode_session4
>             - oif = skb_dst(skb)->dev->ifindex; <-- here
>           - decode_session6
>             - oif = skb_dst(skb)->dev->ifindex; <-- here
>
> The reason is that __metadata_dst_init() initializes dst->dev to NULL by default.
> We could not fix it in __metadata_dst_init() as there is no dev supplied.
> On the other hand, the skb_dst(skb)->dev is actually not needed as we
> called decode_session{4,6} via xfrm_decode_session_reverse(), so oif is not
> used by: fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
>
> So adding a dst dev check here should be clean and safe.
>
> v4: No changes.
>
> v3: No changes.
>
> v2: fix the issue in decode_session{4,6} instead of updating shared dst dev
> in {ip_md, ip6}_tunnel_xmit.
>
> Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

Tested-by: Jonathan Lemon <jonathan.lemon@gmail.com>

This does resolve a local crash where the dev pointer is NULL.

^ permalink raw reply

* Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER
From: Carlos Eduardo de Paula @ 2019-08-23 23:01 UTC (permalink / raw)
  To: David Abdurachmanov
  Cc: Paul Walmsley, Palmer Dabbelt, Albert Ou, Oleg Nesterov,
	Kees Cook, Andy Lutomirski, Will Drewry, Shuah Khan,
	Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David Abdurachmanov, Thomas Gleixner,
	Allison Randal, Alexios Zavras, Anup Patel, Vincent Chen,
	Alan Kao, linux-riscv, linux-kernel, linux-kselftest, netdev, bpf
In-Reply-To: <20190822205533.4877-1-david.abdurachmanov@sifive.com>

On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
>
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
>
> Fully passes libseccomp regression testing (simulation and live).
>
> There is one failing kernel selftest: global.user_notification_signal
>
> v1 -> v2:
>   - return immediately if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
>
> Cc: keescook@chromium.org
> Cc: me@carlosedp.com
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> Tested-by: Carlos de Paula <me@carlosedp.com>
> ---
>  arch/riscv/Kconfig                            | 14 ++++++++++
>  arch/riscv/include/asm/seccomp.h              | 10 +++++++
>  arch/riscv/include/asm/thread_info.h          |  5 +++-
>  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
>  arch/riscv/kernel/ptrace.c                    | 10 +++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
>         select GENERIC_SMP_IDLE_THREAD
>         select GENERIC_ATOMIC64 if !64BIT
>         select HAVE_ARCH_AUDITSYSCALL
> +       select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_MEMBLOCK_NODE_MAP
>         select HAVE_DMA_CONTIGUOUS
>         select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +       bool "Enable seccomp to safely compute untrusted bytecode"
> +       help
> +         This kernel feature is useful for number crunching applications
> +         that may need to compute untrusted bytecode during their
> +         execution. By using pipes or other transports made available to
> +         the process as file descriptors supporting the read/write
> +         syscalls, it's possible to isolate those applications in
> +         their own address space using seccomp. Once seccomp is
> +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +         and the task is only allowed to execute a few safe syscalls
> +         defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index 000000000000..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include <asm/unistd.h>
> +
> +#include <asm-generic/seccomp.h>
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP            8       /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +        _TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check_syscall_nr:
>         /* Check to make sure we don't jump to a bogus syscall number. */
>         li t0, __NR_syscalls
>         la s0, sys_ni_syscall
> -       /* Syscall number held in a7 */
> -       bgeu a7, t0, 1f
> +       /*
> +        * The tracer can change syscall number to valid/invalid value.
> +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> +        * cannot trust the current value in a7 and have to reload from
> +        * the current task pt_regs.
> +        */
> +       REG_L a7, PT_A7(sp)
> +       /*
> +        * Syscall number held in a7.
> +        * If syscall number is above allowed value, redirect to ni_syscall.
> +        */
> +       bge a7, t0, 1f
> +       /*
> +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +        * If yes, we pretend it was executed.
> +        */
> +       li t1, -1
> +       beq a7, t1, ret_from_syscall_rejected
> +       /* Call syscall */
>         la s0, sys_call_table
>         slli t0, a7, RISCV_LGPTR
>         add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
>         /* Set user a0 to kernel a0 */
>         REG_S a0, PT_A0(sp)
> +       /*
> +        * We didn't execute the actual syscall.
> +        * Seccomp already set return value for the current task pt_regs.
> +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +        */
> +ret_from_syscall_rejected:
>         /* Trace syscalls, but only if requested by the user. */
>         REG_L t0, TASK_TI_FLAGS(tp)
>         andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>                 if (tracehook_report_syscall_entry(regs))
>                         syscall_set_nr(current, regs, -1);
>
> +       /*
> +        * Do the secure computing after ptrace; failures should be fast.
> +        * If this fails we might have return value in a0 from seccomp
> +        * (via SECCOMP_RET_ERRNO/TRACE).
> +        */
> +       if (secure_computing(NULL) == -1) {
> +               syscall_set_nr(current, regs, -1);
> +               return;
> +       }
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS     struct user_pt_regs
>  # define SYSCALL_NUM   regs[8]
>  # define SYSCALL_RET   regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS     struct user_regs_struct
> +# define SYSCALL_NUM   a7
> +# define SYSCALL_RET   a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS     struct user_regs_struct
>  # define SYSCALL_NUM   gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
>         EXPECT_EQ(0, ret) {}
>
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -    defined(__s390__) || defined(__hppa__)
> +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
>         {
>                 regs.SYSCALL_NUM = syscall;
>         }
> --
> 2.21.0
>

Kernel selftests results:

➜ uname -a
Linux fedora-unleashed 5.2.0-rc7-30159-g2d072d4-dirty #3 SMP Thu Jul 4
20:18:21 -03 2019 riscv64 riscv64 riscv64 GNU/Linux

➜ sudo ./seccomp_bpf
[==========] Running 74 tests from 1 test cases.
[ RUN      ] global.mode_strict_support
[       OK ] global.mode_strict_support
[ RUN      ] global.mode_strict_cannot_call_prctl
[       OK ] global.mode_strict_cannot_call_prctl
[ RUN      ] global.no_new_privs_support
[       OK ] global.no_new_privs_support
[ RUN      ] global.mode_filter_support
[       OK ] global.mode_filter_support
[ RUN      ] global.mode_filter_without_nnp
[       OK ] global.mode_filter_without_nnp
[ RUN      ] global.filter_size_limits
[       OK ] global.filter_size_limits
[ RUN      ] global.filter_chain_limits
[       OK ] global.filter_chain_limits
[ RUN      ] global.mode_filter_cannot_move_to_strict
[       OK ] global.mode_filter_cannot_move_to_strict
[ RUN      ] global.mode_filter_get_seccomp
[       OK ] global.mode_filter_get_seccomp
[ RUN      ] global.ALLOW_all
[       OK ] global.ALLOW_all
[ RUN      ] global.empty_prog
[       OK ] global.empty_prog
[ RUN      ] global.log_all
[       OK ] global.log_all
[ RUN      ] global.unknown_ret_is_kill_inside
[       OK ] global.unknown_ret_is_kill_inside
[ RUN      ] global.unknown_ret_is_kill_above_allow
[       OK ] global.unknown_ret_is_kill_above_allow
[ RUN      ] global.KILL_all
[       OK ] global.KILL_all
[ RUN      ] global.KILL_one
[       OK ] global.KILL_one
[ RUN      ] global.KILL_one_arg_one
[       OK ] global.KILL_one_arg_one
[ RUN      ] global.KILL_one_arg_six
[       OK ] global.KILL_one_arg_six
[ RUN      ] global.KILL_thread
[       OK ] global.KILL_thread
[ RUN      ] global.KILL_process
[       OK ] global.KILL_process
[ RUN      ] global.arg_out_of_range
[       OK ] global.arg_out_of_range
[ RUN      ] global.ERRNO_valid
[       OK ] global.ERRNO_valid
[ RUN      ] global.ERRNO_zero
[       OK ] global.ERRNO_zero
[ RUN      ] global.ERRNO_capped
[       OK ] global.ERRNO_capped
[ RUN      ] global.ERRNO_order
[       OK ] global.ERRNO_order
[ RUN      ] TRAP.dfl
[       OK ] TRAP.dfl
[ RUN      ] TRAP.ign
[       OK ] TRAP.ign
[ RUN      ] TRAP.handler
[       OK ] TRAP.handler
[ RUN      ] precedence.allow_ok
[       OK ] precedence.allow_ok
[ RUN      ] precedence.kill_is_highest
[       OK ] precedence.kill_is_highest
[ RUN      ] precedence.kill_is_highest_in_any_order
[       OK ] precedence.kill_is_highest_in_any_order
[ RUN      ] precedence.trap_is_second
[       OK ] precedence.trap_is_second
[ RUN      ] precedence.trap_is_second_in_any_order
[       OK ] precedence.trap_is_second_in_any_order
[ RUN      ] precedence.errno_is_third
[       OK ] precedence.errno_is_third
[ RUN      ] precedence.errno_is_third_in_any_order
[       OK ] precedence.errno_is_third_in_any_order
[ RUN      ] precedence.trace_is_fourth
[       OK ] precedence.trace_is_fourth
[ RUN      ] precedence.trace_is_fourth_in_any_order
[       OK ] precedence.trace_is_fourth_in_any_order
[ RUN      ] precedence.log_is_fifth
[       OK ] precedence.log_is_fifth
[ RUN      ] precedence.log_is_fifth_in_any_order
[       OK ] precedence.log_is_fifth_in_any_order
[ RUN      ] TRACE_poke.read_has_side_effects
[       OK ] TRACE_poke.read_has_side_effects
[ RUN      ] TRACE_poke.getpid_runs_normally
[       OK ] TRACE_poke.getpid_runs_normally
[ RUN      ] TRACE_syscall.ptrace_syscall_redirected
[       OK ] TRACE_syscall.ptrace_syscall_redirected
[ RUN      ] TRACE_syscall.ptrace_syscall_errno
[       OK ] TRACE_syscall.ptrace_syscall_errno
[ RUN      ] TRACE_syscall.ptrace_syscall_faked
[       OK ] TRACE_syscall.ptrace_syscall_faked
[ RUN      ] TRACE_syscall.syscall_allowed
[       OK ] TRACE_syscall.syscall_allowed
[ RUN      ] TRACE_syscall.syscall_redirected
[       OK ] TRACE_syscall.syscall_redirected
[ RUN      ] TRACE_syscall.syscall_errno
[       OK ] TRACE_syscall.syscall_errno
[ RUN      ] TRACE_syscall.syscall_faked
[       OK ] TRACE_syscall.syscall_faked
[ RUN      ] TRACE_syscall.skip_after_RET_TRACE
[       OK ] TRACE_syscall.skip_after_RET_TRACE
[ RUN      ] TRACE_syscall.kill_after_RET_TRACE
[       OK ] TRACE_syscall.kill_after_RET_TRACE
[ RUN      ] TRACE_syscall.skip_after_ptrace
[       OK ] TRACE_syscall.skip_after_ptrace
[ RUN      ] TRACE_syscall.kill_after_ptrace
[       OK ] TRACE_syscall.kill_after_ptrace
[ RUN      ] global.seccomp_syscall
[       OK ] global.seccomp_syscall
[ RUN      ] global.seccomp_syscall_mode_lock
[       OK ] global.seccomp_syscall_mode_lock
[ RUN      ] global.detect_seccomp_filter_flags
[       OK ] global.detect_seccomp_filter_flags
[ RUN      ] global.TSYNC_first
[       OK ] global.TSYNC_first
[ RUN      ] TSYNC.siblings_fail_prctl
[       OK ] TSYNC.siblings_fail_prctl
[ RUN      ] TSYNC.two_siblings_with_ancestor
[       OK ] TSYNC.two_siblings_with_ancestor
[ RUN      ] TSYNC.two_sibling_want_nnp
[       OK ] TSYNC.two_sibling_want_nnp
[ RUN      ] TSYNC.two_siblings_with_no_filter
[       OK ] TSYNC.two_siblings_with_no_filter
[ RUN      ] TSYNC.two_siblings_with_one_divergence
[       OK ] TSYNC.two_siblings_with_one_divergence
[ RUN      ] TSYNC.two_siblings_not_under_filter
[       OK ] TSYNC.two_siblings_not_under_filter
[ RUN      ] global.syscall_restart
[       OK ] global.syscall_restart
[ RUN      ] global.filter_flag_log
[       OK ] global.filter_flag_log
[ RUN      ] global.get_action_avail
[       OK ] global.get_action_avail
[ RUN      ] global.get_metadata
[       OK ] global.get_metadata
[ RUN      ] global.user_notification_basic
[       OK ] global.user_notification_basic
[ RUN      ] global.user_notification_kill_in_middle
[       OK ] global.user_notification_kill_in_middle
[ RUN      ] global.user_notification_signal
[1]    5951 alarm      sudo ./seccomp_bpf

carlosedp in ~ at fedora-unleashed
➜ sudo ./seccomp_benchmark
Calibrating reasonable sample size...
1564584448.964538790 - 1564584448.964529687 = 9103
1564584448.964588859 - 1564584448.964575204 = 13655
1564584448.964631342 - 1564584448.964604790 = 26552
1564584448.964710239 - 1564584448.964644997 = 65242
1564584448.964842239 - 1564584448.964726928 = 115311
1564584448.965072859 - 1564584448.964857411 = 215448
1564584448.965513618 - 1564584448.965089549 = 424069
1564584448.966417894 - 1564584448.965532584 = 885310
1564584448.968286377 - 1564584448.966443687 = 1842690
1564584448.971667549 - 1564584448.968314446 = 3353103
1564584448.978288790 - 1564584448.971694101 = 6594689
1564584448.991803618 - 1564584448.978313066 = 13490552
1564584449.017692308 - 1564584448.991836239 = 25856069
1564584449.069651756 - 1564584449.017713549 = 51938207
1564584449.173110928 - 1564584449.069673756 = 103437172
1564584449.380001204 - 1564584449.173132928 = 206868276
1564584449.793857618 - 1564584449.380041411 = 413816207
1564584450.625367342 - 1564584449.793898584 = 831468758
1564584452.299529411 - 1564584450.625426514 = 1674102897
1564584455.665938307 - 1564584452.299592376 = 3366345931
1564584462.331777479 - 1564584455.665973962 = 6665803517
Benchmarking 33554432 samples...
18.107882743 - 12.075641371 = 6032241372
getpid native: 179 ns
34.720410331 - 18.107978605 = 16612431726
getpid RET_ALLOW: 495 ns
Estimated seccomp overhead per syscall: 316 ns


-- 
________________________________________
Carlos Eduardo de Paula
me@carlosedp.com
http://carlosedp.com
http://twitter.com/carlosedp
Linkedin
________________________________________

^ permalink raw reply

* Re: RFC: very rough draft of a bpf permission model
From: Andy Lutomirski @ 2019-08-23 23:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Daniel Borkmann, Song Liu, Kees Cook, Networking,
	bpf, Alexei Starovoitov, Kernel Team, Lorenz Bauer, Jann Horn,
	Greg KH, Linux API, LSM List, Chenbo Feng
In-Reply-To: <20190822232620.p5tql4rrlzlk35z7@ast-mbp.dhcp.thefacebook.com>

On Thu, Aug 22, 2019 at 4:26 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> You're proposing all of the above in addition to CAP_BPF, right?
> Otherwise I don't see how it addresses the use cases I kept
> explaining for the last few weeks.

None of my proposal is intended to exclude changes like CAP_BPF to
make privileged bpf() operations need less privilege.  But I think
it's very hard to evaluate CAP_BPF without both a full description of
exactly what CAP_BPF would do and what at least one full example of a
user would look like.

I also think that users who want CAP_BPF should look at manipulating
their effective capability set instead.  A daemon that wants to use
bpf() but otherwise minimize the chance of accidentally causing a
problem can use capset() to clear its effective and inheritable masks.
Then, each time it wants to call bpf(), it could re-add CAP_SYS_ADMIN
or CAP_NET_ADMIN to its effective set, call bpf(), and then clear its
effective set again.  This works in current kernels and is generally
good practice.

Aside from this, and depending on exactly what CAP_BPF would be, I
have some further concerns.  Looking at your example in this email:

> Here is another example of use case that CAP_BPF is solving:
> The daemon X is started by pid=1 and currently runs as root.
> It loads a bunch of tracing progs and attaches them to kprobes
> and tracepoints. It also loads cgroup-bpf progs and attaches them
> to cgroups. All progs are collecting data about the system and
> logging it for further analysis.

This needs more than just bpf().  Creating a perf kprobe event
requires CAP_SYS_ADMIN, and without a perf kprobe event, you can't
attach a bpf program.  And the privilege to attach bpf programs to
cgroups without any DAC or MAC checks (which is what the current API
does) is an extremely broad privilege that is not that much weaker
than CAP_SYS_ADMIN or CAP_NET_ADMIN.  Also:

> This tracing bpf is looking into kernel memory
> and using bpf_probe_read. Clearly it's not _secure_. But it's _safe_.
> The system is not going to crash because of BPF,
> but it can easily crash because of simple coding bugs in the user
> space bits of that daemon.

The BPF verifier and interpreter, taken in isolation, may be extremely
safe, but attaching BPF programs to various hooks can easily take down
the system, deliberately or by accident.  A handler, especially if it
can access user memory or otherwise fault, will explode if attached to
an inappropriate kprobe, hw_breakpoint, or function entry trace event.
(I and the other maintainers consider this to be a bug if it happens,
and we'll fix it, but these bugs definitely exist.)  A cgroup-bpf hook
that blocks all network traffic will effectively kill a machine,
especially if it's a server.  A bpf program that runs excessively
slowly attached to a high-frequency hook will kill the system, too.
(I bet a buggy bpf program that calls bpf_probe_read() on an unmapped
address repeatedly could be made extremely slow.  Page faults take
thousands to tens of thousands of cycles.)  A bpf firewall rule that's
wrong can cut a machine off from the network -- I've killed machines
using iptables more than once, and bpf isn't magically safer.

Something finer-grained can mitigate some of this.  CAP_BPF as I think
you're imagining it will not.

I'm wondering if something like CAP_TRACING would make sense.
CAP_TRACING would allow operations that can reveal kernel memory and
other secret kernel state but that do not, by design, allow modifying
system behavior.  So, for example, CAP_TRACING would allow privileged
perf_event_open() operations and privileged bpf verifier usage.  But
it would not allow cgroup-bpf unless further restrictions were added,
and it would not allow the *_BY_ID operations, as those can modify
other users' bpf programs' behavior.

(To get CAP_TRACING to work with cgroup-bpf, there could be a flag to
attach a "tracing" bpf program to a cgroup.  This program would run in
addition to normal or MULTI programs, but it would not be allowed to
return a rejection result.)

^ permalink raw reply

* Re: [PATCH ipsec-next 0/7] ipsec: add TCP encapsulation support (RFC 8229)
From: Carl-Daniel Hailfinger @ 2019-08-23 23:17 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: herbert, netdev, steffen.klassert
In-Reply-To: <cover.1566395202.git.sd@queasysnail.net>

Hi!

On Wed, 21 Aug 2019 23:46:18 +0200, Sabrina Dubroca wrote:
> This patchset introduces support for TCP encapsulation of IKE and ESP
> messages, as defined by RFC 8229 [0]. It is an evolution of what
> Herbert Xu proposed in January 2018 [1] that addresses the main
> criticism against it, by not interfering with the TCP implementation
> at all. The networking stack now has infrastructure for this: TCP ULPs
> and Stream Parsers.
> [...]

Thank you very much for the patchset. Where I live, a substantial number
of free and paid Wifi networks restrict UDP to port 53. TCP ports are
usually unaffected by such restrictions.
Running IKE/ESP over TCP is sometimes the only remaining option, and
this patch makes that option available.

> The main omission in this submission is IPv6 support. ESP
> encapsulation over UDP with IPv6 is currently not supported in the
> kernel either, as UDP encapsulation is aimed at NAT traversal, and NAT
> is not frequently used with IPv6.

Side note: The lack of support for ESP over UDP with IPv6 is the reason
why third-party Android IPsec management apps (e.g. the strongswan app)
can't connect to IPv6-only remote endpoints. AFAIK Android apps do not
have permission to send ESP packets directly, whereas establishing TCP
connections and sending UDP datagrams is permitted. But even without
IPv6 support, this patch is a great step forward.

Regards,
Carl-Daniel

^ permalink raw reply

* Re: [PATCH 0/3] Add NETIF_F_HW_BRIDGE feature
From: Florian Fainelli @ 2019-08-23 23:25 UTC (permalink / raw)
  To: Horatiu Vultur, roopa, nikolay, davem, UNGLinuxDriver,
	alexandre.belloni, allan.nielsen, netdev, linux-kernel, bridge
In-Reply-To: <1566500850-6247-1-git-send-email-horatiu.vultur@microchip.com>

On 8/22/19 12:07 PM, Horatiu Vultur wrote:
> Current implementation of the SW bridge is setting the interfaces in
> promisc mode when they are added to bridge if learning of the frames is
> enabled.
> In case of Ocelot which has HW capabilities to switch frames, it is not
> needed to set the ports in promisc mode because the HW is already capable of
> doing that. Therefore add NETIF_F_HW_BRIDGE feature to indicate that the
> HW has bridge capabilities. Therefore the SW bridge doesn't need to set
> the ports in promisc mode to do the switching.

Then do not do anything when the ndo_set_rx_mode() for the ocelot
network device is called and indicates that IFF_PROMISC is set and that
your network port is a bridge port member. That is what mlxsw does AFAICT.

As others pointed out, the Linux bridge implements a software bridge by
default, and because it needs to operate on a wide variety of network
devices, all with different capabilities, the easiest way to make sure
that all management (IGMP, BPDU, etc. ) as well as non-management
traffic can make it to the bridge ports, is to put the network devices
in promiscuous mode. If this is suboptimal for you, you can take
shortcuts in your driver that do not hinder the overall functionality.

> This optimization takes place only if all the interfaces that are part
> of the bridge have this flag and have the same network driver.
> 
> If the bridge interface is added in promisc mode then also the ports part
> of the bridge are set in promisc mode.
> 
> Horatiu Vultur (3):
>   net: Add HW_BRIDGE offload feature
>   net: mscc: Use NETIF_F_HW_BRIDGE
>   net: mscc: Implement promisc mode.
> 
>  drivers/net/ethernet/mscc/ocelot.c | 26 ++++++++++++++++++++++++--
>  include/linux/netdev_features.h    |  3 +++
>  net/bridge/br_if.c                 | 29 ++++++++++++++++++++++++++++-
>  net/core/ethtool.c                 |  1 +
>  4 files changed, 56 insertions(+), 3 deletions(-)
> 

-- 
Florian

^ permalink raw reply

* Re: [PATCH 1/3] net: Add HW_BRIDGE offload feature
From: Florian Fainelli @ 2019-08-23 23:30 UTC (permalink / raw)
  To: Horatiu Vultur, Andrew Lunn
  Cc: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
In-Reply-To: <20190823123929.ta4ikozz7jwkwbo2@soft-dev3.microsemi.net>

On 8/23/19 5:39 AM, Horatiu Vultur wrote:
> The 08/22/2019 22:08, Andrew Lunn wrote:
>> External E-Mail
>>
>>
>>> +/* Determin if the SW bridge can be offloaded to HW. Return true if all
>>> + * the interfaces of the bridge have the feature NETIF_F_HW_SWITCHDEV set
>>> + * and have the same netdev_ops.
>>> + */
>>
>> Hi Horatiu
>>
>> Why do you need these restrictions. The HW bridge should be able to
>> learn that a destination MAC address can be reached via the SW
>> bridge. The software bridge can then forward it out the correct
>> interface.
>>
>> Or are you saying your hardware cannot learn from frames which come
>> from the CPU?
>>
>> 	Andrew
>>
> Hi Andrew,
> 
> I do not believe that our HW can learn from frames which come from the
> CPU, at least not in the way they are injected today. But in case of Ocelot
> (and the next chip we are working on), we have other issues in mixing with
> foreign interfaces which is why we have the check in
> ocelot_netdevice_dev_check.
> 
> More important, as we responded to Nikolay, we properly introduced this
> restriction for the wrong reasons.
> 
> In SW bridge I will remove all these restrictions and only set ports in
> promisc mode only if NETIF_F_HW_BRIDGE is not set.
> Then in the network driver I can see if a foreign interface is added to
> the bridge, and when that happens I can set the port in promisc mode.
> Then the frames will be flooded to the SW bridge which eventually will
> send to the foreign interface.

Is that really necessary? Is not the skb->fwd_offload_mark as well as
the phys_switch_id supposed to tell that information to the bridge already?
-- 
Florian

^ permalink raw reply

* Re: [PATCH bpf] flow_dissector: Fix potential use-after-free on BPF_PROG_DETACH
From: Daniel Borkmann @ 2019-08-23 23:34 UTC (permalink / raw)
  To: Jakub Sitnicki, bpf
  Cc: netdev, kernel-team, Petar Penkov, Willem de Bruijn, Lorenz Bauer
In-Reply-To: <20190821121720.22009-1-jakub@cloudflare.com>

On 8/21/19 2:17 PM, Jakub Sitnicki wrote:
> Call to bpf_prog_put(), with help of call_rcu(), queues an RCU-callback to
> free the program once a grace period has elapsed. The callback can run
> together with new RCU readers that started after the last grace period.
> New RCU readers can potentially see the "old" to-be-freed or already-freed
> pointer to the program object before the RCU update-side NULLs it.
> 
> Reorder the operations so that the RCU update-side resets the protected
> pointer before the end of the grace period after which the program will be
> freed.
> 
> Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
> Reported-by: Lorenz Bauer <lmb@cloudflare.com>
> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>

Applied, thanks!

^ permalink raw reply

* Re: [PATCH bpf] bpf: fix precision tracking in presence of bpf2bpf calls
From: Daniel Borkmann @ 2019-08-23 23:35 UTC (permalink / raw)
  To: Alexei Starovoitov, davem; +Cc: netdev, bpf, kernel-team
In-Reply-To: <20190821210710.1276117-1-ast@kernel.org>

On 8/21/19 11:07 PM, Alexei Starovoitov wrote:
> While adding extra tests for precision tracking and extra infra
> to adjust verifier heuristics the existing test
> "calls: cross frame pruning - liveness propagation" started to fail.
> The root cause is the same as described in verifier.c comment:
> 
>   * Also if parent's curframe > frame where backtracking started,
>   * the verifier need to mark registers in both frames, otherwise callees
>   * may incorrectly prune callers. This is similar to
>   * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
>   * For now backtracking falls back into conservative marking.
> 
> Turned out though that returning -ENOTSUPP from backtrack_insn() and
> doing mark_all_scalars_precise() in the current parentage chain is not enough.
> Depending on how is_state_visited() heuristic is creating parentage chain
> it's possible that callee will incorrectly prune caller.
> Fix the issue by setting precise=true earlier and more aggressively.
> Before this fix the precision tracking _within_ functions that don't do
> bpf2bpf calls would still work. Whereas now precision tracking is completely
> disabled when bpf2bpf calls are present anywhere in the program.
> 
> No difference in cilium tests (they don't have bpf2bpf calls).
> No difference in test_progs though some of them have bpf2bpf calls,
> but precision tracking wasn't effective there.
> 
> Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks!

^ permalink raw reply

* [net-next 02/14] ice: Account for all states of FW DCBx and LLDP
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Dave Ertman, netdev, nhorman, sassmann, Tony Nguyen,
	Andrew Bowers, Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Dave Ertman <david.m.ertman@intel.com>

Currently, only the DCBx status is taken into account to
determine if FW LLDP is possible.  But there are NVM versions
coming out with DCBx enabled, and FW LLDP disabled.  This
is causing errors where the driver sees that DCBx is not
disabled, and then tries to register for LLDP MIB change
events, and fails.

Change the logic to detect both DCBx and LLDP states in the
FW engine.

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 34 +++++++-------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index bf6cd4760a48..22bdc244c7e0 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -319,6 +319,11 @@ void ice_dcb_rebuild(struct ice_pf *pf)
 	}
 
 	ice_init_dcb(&pf->hw);
+	if (pf->hw.port_info->dcbx_status == ICE_DCBX_STATUS_DIS)
+		pf->hw.port_info->is_sw_lldp = true;
+	else
+		pf->hw.port_info->is_sw_lldp = false;
+
 	if (ice_dcb_need_recfg(pf, prev_cfg, local_dcbx_cfg)) {
 		/* difference in cfg detected - disable DCB till next MIB */
 		dev_err(&pf->pdev->dev, "Set local MIB not accurate\n");
@@ -440,35 +445,17 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked)
 	struct device *dev = &pf->pdev->dev;
 	struct ice_port_info *port_info;
 	struct ice_hw *hw = &pf->hw;
-	int sw_default = 0;
 	int err;
 
 	port_info = hw->port_info;
 
 	err = ice_init_dcb(hw);
 	if (err) {
-		/* FW LLDP is not active, default to SW DCBX/LLDP */
-		dev_info(&pf->pdev->dev, "FW LLDP is not active\n");
-		hw->port_info->dcbx_status = ICE_DCBX_STATUS_NOT_STARTED;
-		hw->port_info->is_sw_lldp = true;
-	}
-
-	if (port_info->dcbx_status == ICE_DCBX_STATUS_DIS)
-		dev_info(&pf->pdev->dev, "DCBX disabled\n");
-
-	/* LLDP disabled in FW */
-	if (port_info->is_sw_lldp) {
-		sw_default = 1;
-		dev_info(&pf->pdev->dev, "DCBx/LLDP in SW mode.\n");
+		/* FW LLDP is disabled, activate SW DCBX/LLDP mode */
+		dev_info(&pf->pdev->dev,
+			 "FW LLDP is disabled, DCBx/LLDP in SW mode.\n");
+		port_info->is_sw_lldp = true;
 		clear_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags);
-	} else {
-		set_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags);
-	}
-
-	if (port_info->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED)
-		dev_info(&pf->pdev->dev, "DCBX not started\n");
-
-	if (sw_default) {
 		err = ice_dcb_sw_dflt_cfg(pf, locked);
 		if (err) {
 			dev_err(&pf->pdev->dev,
@@ -483,6 +470,9 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked)
 		return 0;
 	}
 
+	port_info->is_sw_lldp = false;
+	set_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags);
+
 	/* DCBX in FW and LLDP enabled in FW */
 	pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_IEEE;
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 01/14] ice: Allow egress control packets from PF_VSI
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Dave Ertman, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Dave Ertman <david.m.ertman@intel.com>

For control packets (i.e. LLDP packets) to be able to egress
from the main VSI, a bit has to be set in the TX_descriptor.
This should only be done for the main VSI and only if the
FW LLDP agent is disabled.  A bit to allow this also has to
be set in the VSI context.

Add the logic to add the necessary bits in the VSI context
for the PF_VSI and the TX_descriptors for control packets
egressing the PF_VSI.

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c  |  7 +++++++
 drivers/net/ethernet/intel/ice/ice_txrx.c | 11 ++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 6e34c40e7840..d6279dfe029e 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1010,6 +1010,13 @@ static int ice_vsi_init(struct ice_vsi *vsi)
 			ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF;
 	}
 
+	/* Allow control frames out of main VSI */
+	if (vsi->type == ICE_VSI_PF) {
+		ctxt->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD;
+		ctxt->info.valid_sections |=
+			cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID);
+	}
+
 	ret = ice_add_vsi(hw, vsi->idx, ctxt, NULL);
 	if (ret) {
 		dev_err(&pf->pdev->dev,
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index e5c4c9139e54..5bf5c179a738 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -2106,6 +2106,7 @@ static netdev_tx_t
 ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
 {
 	struct ice_tx_offload_params offload = { 0 };
+	struct ice_vsi *vsi = tx_ring->vsi;
 	struct ice_tx_buf *first;
 	unsigned int count;
 	int tso, csum;
@@ -2153,7 +2154,15 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
 	if (csum < 0)
 		goto out_drop;
 
-	if (tso || offload.cd_tunnel_params) {
+	/* allow CONTROL frames egress from main VSI if FW LLDP disabled */
+	if (unlikely(skb->priority == TC_PRIO_CONTROL &&
+		     vsi->type == ICE_VSI_PF &&
+		     vsi->port_info->is_sw_lldp))
+		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
+					ICE_TX_CTX_DESC_SWTCH_UPLINK <<
+					ICE_TXD_CTX_QW1_CMD_S);
+
+	if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
 		struct ice_tx_ctx_desc *cdesc;
 		int i = tx_ring->next_to_use;
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 12/14] ice: update ethtool stats on-demand
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Users expect ethtool statistics to be updated on-demand when invoking
'ethtool -S <iface>' instead of providing a snapshot of statistics taken
once a second (the frequency of the watchdog task where stats are currently
updated).  Update stats every time 'ethtool -S <iface>' is run.

Also, fix an indentation style issue and an unnecessary local variable
initialization in ice_get_ethtool_stats() discovered while investigating
the subject issue.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h         | 2 ++
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 7 +++++--
 drivers/net/ethernet/intel/ice/ice_main.c    | 6 ++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 99e0febd8e50..97d0f61cf52b 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -447,6 +447,8 @@ ice_find_vsi_by_type(struct ice_pf *pf, enum ice_vsi_type type)
 int ice_vsi_setup_tx_rings(struct ice_vsi *vsi);
 int ice_vsi_setup_rx_rings(struct ice_vsi *vsi);
 void ice_set_ethtool_ops(struct net_device *netdev);
+void ice_update_vsi_stats(struct ice_vsi *vsi);
+void ice_update_pf_stats(struct ice_pf *pf);
 int ice_up(struct ice_vsi *vsi);
 int ice_down(struct ice_vsi *vsi);
 int ice_vsi_cfg(struct ice_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 948a33716290..f7dd0bd03d39 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -1319,14 +1319,17 @@ ice_get_ethtool_stats(struct net_device *netdev,
 	struct ice_vsi *vsi = np->vsi;
 	struct ice_pf *pf = vsi->back;
 	struct ice_ring *ring;
-	unsigned int j = 0;
+	unsigned int j;
 	int i = 0;
 	char *p;
 
+	ice_update_pf_stats(pf);
+	ice_update_vsi_stats(vsi);
+
 	for (j = 0; j < ICE_VSI_STATS_LEN; j++) {
 		p = (char *)vsi + ice_gstrings_vsi_stats[j].stat_offset;
 		data[i++] = (ice_gstrings_vsi_stats[j].sizeof_stat ==
-			    sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
+			     sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 
 	/* populate per queue stats */
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index a0d148f590c2..6dd806b763ea 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -34,8 +34,6 @@ static const struct net_device_ops ice_netdev_ops;
 static void ice_rebuild(struct ice_pf *pf);
 
 static void ice_vsi_release_all(struct ice_pf *pf);
-static void ice_update_vsi_stats(struct ice_vsi *vsi);
-static void ice_update_pf_stats(struct ice_pf *pf);
 
 /**
  * ice_get_tx_pending - returns number of Tx descriptors not processed
@@ -3254,7 +3252,7 @@ static void ice_update_vsi_ring_stats(struct ice_vsi *vsi)
  * ice_update_vsi_stats - Update VSI stats counters
  * @vsi: the VSI to be updated
  */
-static void ice_update_vsi_stats(struct ice_vsi *vsi)
+void ice_update_vsi_stats(struct ice_vsi *vsi)
 {
 	struct rtnl_link_stats64 *cur_ns = &vsi->net_stats;
 	struct ice_eth_stats *cur_es = &vsi->eth_stats;
@@ -3290,7 +3288,7 @@ static void ice_update_vsi_stats(struct ice_vsi *vsi)
  * ice_update_pf_stats - Update PF port stats counters
  * @pf: PF whose stats needs to be updated
  */
-static void ice_update_pf_stats(struct ice_pf *pf)
+void ice_update_pf_stats(struct ice_pf *pf)
 {
 	struct ice_hw_port_stats *prev_ps, *cur_ps;
 	struct ice_hw *hw = &pf->hw;
-- 
2.21.0


^ permalink raw reply related

* [net-next 09/14] ice: Fix flag used for module query
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Chinh T Cao, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Chinh T Cao <chinh.t.cao@intel.com>

When checking the PHY for status, by specification, the driver
should be using "topology" mode when querying the module type.

Signed-off-by: Chinh T Cao <chinh.t.cao@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 5f9dc76699d2..15648d4a8bab 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -2031,7 +2031,7 @@ enum ice_status ice_update_link_info(struct ice_port_info *pi)
 		if (!pcaps)
 			return ICE_ERR_NO_MEMORY;
 
-		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG,
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP,
 					     pcaps, NULL);
 		if (!status)
 			memcpy(li->module_type, &pcaps->module_type,
-- 
2.21.0


^ permalink raw reply related

* [net-next 04/14] ice: Treat DCBx state NOT_STARTED as valid
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Dave Ertman, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Dave Ertman <david.m.ertman@intel.com>

When a port is not cabled, but DCBx is enabled in the
firmware, the status of DCBx will be NOT_STARTED.  This
is a valid state for FW enabled and should not be
treated as a is_fw_lldp true automatically.

Add the code to treat NOT_STARTED as another valid state.

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_dcb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c
index c2002ded65f6..d60c942249e8 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb.c
@@ -954,7 +954,8 @@ enum ice_status ice_init_dcb(struct ice_hw *hw)
 	pi->dcbx_status = ice_get_dcbx_status(hw);
 
 	if (pi->dcbx_status == ICE_DCBX_STATUS_DONE ||
-	    pi->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS) {
+	    pi->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS ||
+	    pi->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) {
 		/* Get current DCBX configuration */
 		ret = ice_get_dcb_cfg(pi);
 		pi->is_sw_lldp = (hw->adminq.sq_last_status == ICE_AQ_RC_EPERM);
-- 
2.21.0


^ permalink raw reply related

* [net-next 14/14] ice: Don't allow VSI to remove unassociated ucast filter
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Akeem G Abodunrin, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>

If a VSI is not using a unicast filter or did not configure that
particular unicast filter, driver should not allow it to be removed
by the rogue VSI.

Signed-off-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_switch.c | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 8271fd651725..99cf527d2b1a 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -2136,6 +2136,38 @@ ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_handle, bool set, u8 direction)
 	return status;
 }
 
+/**
+ * ice_find_ucast_rule_entry - Search for a unicast MAC filter rule entry
+ * @hw: pointer to the hardware structure
+ * @recp_id: lookup type for which the specified rule needs to be searched
+ * @f_info: rule information
+ *
+ * Helper function to search for a unicast rule entry - this is to be used
+ * to remove unicast MAC filter that is not shared with other VSIs on the
+ * PF switch.
+ *
+ * Returns pointer to entry storing the rule if found
+ */
+static struct ice_fltr_mgmt_list_entry *
+ice_find_ucast_rule_entry(struct ice_hw *hw, u8 recp_id,
+			  struct ice_fltr_info *f_info)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	struct ice_fltr_mgmt_list_entry *list_itr;
+	struct list_head *list_head;
+
+	list_head = &sw->recp_list[recp_id].filt_rules;
+	list_for_each_entry(list_itr, list_head, list_entry) {
+		if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data,
+			    sizeof(f_info->l_data)) &&
+		    f_info->fwd_id.hw_vsi_id ==
+		    list_itr->fltr_info.fwd_id.hw_vsi_id &&
+		    f_info->flag == list_itr->fltr_info.flag)
+			return list_itr;
+	}
+	return NULL;
+}
+
 /**
  * ice_remove_mac - remove a MAC address based filter rule
  * @hw: pointer to the hardware structure
@@ -2153,15 +2185,39 @@ enum ice_status
 ice_remove_mac(struct ice_hw *hw, struct list_head *m_list)
 {
 	struct ice_fltr_list_entry *list_itr, *tmp;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
 
 	if (!m_list)
 		return ICE_ERR_PARAM;
 
+	rule_lock = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock;
 	list_for_each_entry_safe(list_itr, tmp, m_list, list_entry) {
 		enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type;
+		u8 *add = &list_itr->fltr_info.l_data.mac.mac_addr[0];
+		u16 vsi_handle;
 
 		if (l_type != ICE_SW_LKUP_MAC)
 			return ICE_ERR_PARAM;
+
+		vsi_handle = list_itr->fltr_info.vsi_handle;
+		if (!ice_is_vsi_valid(hw, vsi_handle))
+			return ICE_ERR_PARAM;
+
+		list_itr->fltr_info.fwd_id.hw_vsi_id =
+					ice_get_hw_vsi_num(hw, vsi_handle);
+		if (is_unicast_ether_addr(add) && !hw->ucast_shared) {
+			/* Don't remove the unicast address that belongs to
+			 * another VSI on the switch, since it is not being
+			 * shared...
+			 */
+			mutex_lock(rule_lock);
+			if (!ice_find_ucast_rule_entry(hw, ICE_SW_LKUP_MAC,
+						       &list_itr->fltr_info)) {
+				mutex_unlock(rule_lock);
+				return ICE_ERR_DOES_NOT_EXIST;
+			}
+			mutex_unlock(rule_lock);
+		}
 		list_itr->status = ice_remove_rule_internal(hw,
 							    ICE_SW_LKUP_MAC,
 							    list_itr);
-- 
2.21.0


^ permalink raw reply related

* [net-next 05/14] ice: Copy dcbx configuration only if mode is correct
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Michal Swiatkowski, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Michal Swiatkowski <michal.swiatkowski@intel.com>

In rebuild DCB, desired_dcbx_cfg was copied to local_dcbx_cfg, but
if DCBX mode is IEEE, desired_dcbx_cfg is not initialized by DCBX
config from FW. Change logic to copy config value only if mode is
set to CEE.

If the driver copies desired_dcbx_cfg to local_dcbx_cfg in IEEE mode,
there is a problem with globr. System is frozen after two or more globr.

Signed-off-by: Michal Swiatkowski <michal.swiatkowski@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index 22bdc244c7e0..4fc9faf5bc71 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -334,8 +334,10 @@ void ice_dcb_rebuild(struct ice_pf *pf)
 	devm_kfree(&pf->pdev->dev, prev_cfg);
 
 	/* Set the local desired config */
-	memset(&pf->hw.port_info->local_dcbx_cfg, 0, sizeof(*local_dcbx_cfg));
-	memcpy(local_dcbx_cfg, desired_dcbx_cfg, sizeof(*local_dcbx_cfg));
+	if (local_dcbx_cfg->dcbx_mode == ICE_DCBX_MODE_CEE)
+		memcpy(local_dcbx_cfg, desired_dcbx_cfg,
+		       sizeof(*local_dcbx_cfg));
+
 	ice_cfg_etsrec_defaults(pf->hw.port_info);
 	ret = ice_set_dcb_cfg(pf->hw.port_info);
 	if (ret) {
-- 
2.21.0


^ permalink raw reply related

* [net-next 11/14] ice: Add input handlers for virtual channel handlers
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Amruth G.P, netdev, nhorman, sassmann, Nitesh B Venkatesh,
	Tony Nguyen, Andrew Bowers, Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: "Amruth G.P" <amruth.gouda.parameshwarappa@intel.com>

Move the assignment to local variables after validation.

Remove unnecessary checks in ice_vc_process_vf_msg() as the respective
functions are now performing the checks.

Signed-off-by: "Amruth G.P" <amruth.gouda.parameshwarappa@intel.com>
Signed-off-by: Nitesh B Venkatesh <nitesh.b.venkatesh@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 64 +++++++++----------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 83e58e71081e..de0a1ef54e83 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -1734,18 +1734,18 @@ static int ice_vc_config_rss_key(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
-	vsi = pf->vsi[vf->lan_vsi_idx];
-	if (!vsi) {
+	if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
 
-	if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) {
+	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
 
-	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
@@ -1781,18 +1781,18 @@ static int ice_vc_config_rss_lut(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
-	vsi = pf->vsi[vf->lan_vsi_idx];
-	if (!vsi) {
+	if (vrl->lut_entries != ICE_VSIQF_HLUT_ARRAY_SIZE) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
 
-	if (vrl->lut_entries != ICE_VSIQF_HLUT_ARRAY_SIZE) {
+	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
 
-	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 		goto error_param;
 	}
@@ -1877,6 +1877,12 @@ static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
+	if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF ||
+	    vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
 	vsi = pf->vsi[vf->lan_vsi_idx];
 	if (!vsi) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
@@ -1932,6 +1938,12 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
+	if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF ||
+	    vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
 	vsi = pf->vsi[vf->lan_vsi_idx];
 	if (!vsi) {
 		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
@@ -1984,12 +1996,6 @@ static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg)
 	irqmap_info = (struct virtchnl_irq_map_info *)msg;
 	num_q_vectors_mapped = irqmap_info->num_vectors;
 
-	vsi = pf->vsi[vf->lan_vsi_idx];
-	if (!vsi) {
-		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
-		goto error_param;
-	}
-
 	/* Check to make sure number of VF vectors mapped is not greater than
 	 * number of VF vectors originally allocated, and check that
 	 * there is actually at least a single VF queue vector mapped
@@ -2001,6 +2007,12 @@ static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
 	for (i = 0; i < num_q_vectors_mapped; i++) {
 		struct ice_q_vector *q_vector;
 
@@ -2092,10 +2104,6 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
-	vsi = pf->vsi[vf->lan_vsi_idx];
-	if (!vsi)
-		goto error_param;
-
 	if (qci->num_queue_pairs > ICE_MAX_BASE_QS_PER_VF) {
 		dev_err(&pf->pdev->dev,
 			"VF-%d requesting more than supported number of queues: %d\n",
@@ -2104,6 +2112,12 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
 	for (i = 0; i < qci->num_queue_pairs; i++) {
 		qpi = &qci->qpair[i];
 		if (qpi->txq.vsi_id != qci->vsi_id ||
@@ -2755,20 +2769,6 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 			err = -EPERM;
 		else
 			err = -EINVAL;
-		goto error_handler;
-	}
-
-	/* Perform additional checks specific to RSS and Virtchnl */
-	if (v_opcode == VIRTCHNL_OP_CONFIG_RSS_KEY) {
-		struct virtchnl_rss_key *vrk = (struct virtchnl_rss_key *)msg;
-
-		if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE)
-			err = -EINVAL;
-	} else if (v_opcode == VIRTCHNL_OP_CONFIG_RSS_LUT) {
-		struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg;
-
-		if (vrl->lut_entries != ICE_VSIQF_HLUT_ARRAY_SIZE)
-			err = -EINVAL;
 	}
 
 error_handler:
-- 
2.21.0


^ permalink raw reply related

* [net-next 00/14][pull request] 100GbE Intel Wired LAN Driver Updates 2019-08-23
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann

This series contains updates to ice driver only.

Dave adds logic for the necessary bits to be set in the VSI context for
the PF_VSI and the TX_descriptors for control packets egressing the
PF_VSI.  Updated the logic to detect both DCBx and LLDP states in the
firmware engine to account for situations where DCBx is enabled and LLDP
is disabled.  Fixed the driver to treat the DCBx state of "NOT_STARTED"
as a valid state and to not assume "is_fw_lldp" is true automatically.
Since "enable-fw-lldp" flag was confusing and cumbersome, change the
flag to "fw-lldp-agent" with a value of on or off to help clarify
whether the LLDP agent is running or not.

Brett fixes an issue where synchronize_irq() was being called from the
host for VFs, which should not be done.

Michal fixed an issue when rebuilding the DCBx configuration while in
IEEE mode versus CEE mode, so add a check before copying the
configuration value to ensure we are only in CEE mode.

Jake fixes the PF to reject any VF request to setup head writeback since
the support has been deprecated.

Mitch adds an additional check to ensure the VF is active before sending
out an error message that a message was unable to be sent to a
particular VF.

Chinh updates the driver to use "topology" mode when checking the PHY
for status, since this mode provides us the current module type that is
available.  Fixes the driver from clearing the auto_fec_enable bit which
was blocking a user from forcing non-spec compliant FEC configurations.

Amruth does a refactor on the code to first check, then assign in the
virtual channel space.

Bruce updates the driver to actually update the stats when a user runs
the ethtool command 'ethtool -S <iface>' instead of providing a snapshot
of the stats that may be from a second ago.

Akeem fixes up the adding/removing of VSI MAC filters for VFs, so that
VFs cannot add/remove a filter from another VSI.  We now track the
number of filters added right from when the VF resources get allocated
and won't run into MAC filter mismatch issues in the switch.

The following are changes since commit 6d24e14140053febc5ac1ce46baca6a4334c5f6c:
  net/ncsi: update response packet length for GCPS/GNS/GNPTS commands
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 100GbE

Akeem G Abodunrin (2):
  ice: Fix issues updating VSI MAC filters
  ice: Don't allow VSI to remove unassociated ucast filter

Amruth G.P (1):
  ice: Add input handlers for virtual channel handlers

Brett Creeley (1):
  ice: Don't call synchronize_irq() for VF's from the host

Bruce Allan (1):
  ice: update ethtool stats on-demand

Chinh T Cao (2):
  ice: Fix flag used for module query
  ice: Don't clear auto_fec bit in ice_cfg_phy_fec()

Dave Ertman (4):
  ice: Allow egress control packets from PF_VSI
  ice: Account for all states of FW DCBx and LLDP
  ice: Treat DCBx state NOT_STARTED as valid
  ice: Rename ethtool private flag for lldp

Jacob Keller (1):
  ice: reject VF attempts to enable head writeback

Michal Swiatkowski (1):
  ice: Copy dcbx configuration only if mode is correct

Mitch Williams (1):
  ice: silence some bogus error messages

 drivers/net/ethernet/intel/ice/ice.h          |   4 +-
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   1 +
 drivers/net/ethernet/intel/ice/ice_common.c   |  11 +-
 drivers/net/ethernet/intel/ice/ice_dcb.c      |   3 +-
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c  |  42 +++----
 drivers/net/ethernet/intel/ice/ice_ethtool.c  |  13 +-
 drivers/net/ethernet/intel/ice/ice_lib.c      |  45 ++++++-
 drivers/net/ethernet/intel/ice/ice_lib.h      |   4 +
 drivers/net/ethernet/intel/ice/ice_main.c     |  70 +++--------
 drivers/net/ethernet/intel/ice/ice_switch.c   |  56 +++++++++
 drivers/net/ethernet/intel/ice/ice_txrx.c     |  11 +-
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 113 +++++++++---------
 12 files changed, 225 insertions(+), 148 deletions(-)

-- 
2.21.0

^ permalink raw reply

* [net-next 03/14] ice: Don't call synchronize_irq() for VF's from the host
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Brett Creeley, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Brett Creeley <brett.creeley@intel.com>

Currently we will call synchronize_irq() from the host for VF's. This is
not correct, so don't allow it.

Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index d6279dfe029e..c067ef6be7f4 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2817,6 +2817,10 @@ void ice_vsi_dis_irq(struct ice_vsi *vsi)
 
 	ice_flush(hw);
 
+	/* don't call synchronize_irq() for VF's from the host */
+	if (vsi->type == ICE_VSI_VF)
+		return;
+
 	ice_for_each_q_vector(vsi, i)
 		synchronize_irq(pf->msix_entries[i + base].vector);
 }
-- 
2.21.0


^ permalink raw reply related

* [net-next 10/14] ice: Don't clear auto_fec bit in ice_cfg_phy_fec()
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Chinh T Cao, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Chinh T Cao <chinh.t.cao@intel.com>

The driver should never clear the auto_fec_enable bit.

Signed-off-by: Chinh T Cao <chinh.t.cao@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_common.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 15648d4a8bab..4b43e6de847b 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -2181,27 +2181,24 @@ ice_cfg_phy_fec(struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec)
 {
 	switch (fec) {
 	case ICE_FEC_BASER:
-		/* Clear auto FEC and RS bits, and AND BASE-R ability
+		/* Clear RS bits, and AND BASE-R ability
 		 * bits and OR request bits.
 		 */
-		cfg->caps &= ~ICE_AQC_PHY_EN_AUTO_FEC;
 		cfg->link_fec_opt &= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN |
 				     ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN;
 		cfg->link_fec_opt |= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ |
 				     ICE_AQC_PHY_FEC_25G_KR_REQ;
 		break;
 	case ICE_FEC_RS:
-		/* Clear auto FEC and BASE-R bits, and AND RS ability
+		/* Clear BASE-R bits, and AND RS ability
 		 * bits and OR request bits.
 		 */
-		cfg->caps &= ~ICE_AQC_PHY_EN_AUTO_FEC;
 		cfg->link_fec_opt &= ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN;
 		cfg->link_fec_opt |= ICE_AQC_PHY_FEC_25G_RS_528_REQ |
 				     ICE_AQC_PHY_FEC_25G_RS_544_REQ;
 		break;
 	case ICE_FEC_NONE:
-		/* Clear auto FEC and all FEC option bits. */
-		cfg->caps &= ~ICE_AQC_PHY_EN_AUTO_FEC;
+		/* Clear all FEC option bits. */
 		cfg->link_fec_opt &= ~ICE_AQC_PHY_FEC_MASK;
 		break;
 	case ICE_FEC_AUTO:
-- 
2.21.0


^ permalink raw reply related

* [net-next 13/14] ice: Fix issues updating VSI MAC filters
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Akeem G Abodunrin, netdev, nhorman, sassmann, Tony Nguyen,
	Andrew Bowers, Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>

A VSI, especially a VF, could request to add or remove a filter for another
VSI; the driver should really guard against such a request and disallow it.
However, instead of returning an error for such a malicious request, the
driver can simply return success.

In addition, we are not tracking the number of MAC filters configured per
VF correctly - and this leads to issues updating VF MAC filters whenever
they are removed and re-configured via bringing the VF interface down and
up. Also, since a VF could send a request to update multiple MAC filters at
once, the driver should program those filters individually in the switch, in
order to determine which action resulted in an error, and communicate
accordingly to the VF.

So, with these changes, we now track the number of filters added right from
when VF resource allocation is done, and can properly add filters for
both trusted and non_trusted VFs, without MAC filter mismatch issues in
the switch.

Also refactor code, so that driver can use new function to add or remove
MAC filters.

Signed-off-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c      | 30 +++++++++
 drivers/net/ethernet/intel/ice/ice_lib.h      |  4 ++
 drivers/net/ethernet/intel/ice/ice_main.c     | 64 +++++--------------
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 42 ++++++------
 4 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 343d0c305423..8d5d6635a123 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -3181,3 +3181,33 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 	return ret;
 }
 #endif /* CONFIG_DCB */
+
+/**
+ * ice_vsi_cfg_mac_fltr - Add or remove a MAC address filter for a VSI
+ * @vsi: the VSI being configured MAC filter
+ * @macaddr: the MAC address to be added.
+ * @set: Add or delete a MAC filter
+ *
+ * Adds or removes MAC address filter entry for VF VSI
+ */
+enum ice_status
+ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set)
+{
+	LIST_HEAD(tmp_add_list);
+	enum ice_status status;
+
+	 /* Update MAC filter list to be added or removed for a VSI */
+	if (ice_add_mac_to_list(vsi, &tmp_add_list, macaddr)) {
+		status = ICE_ERR_NO_MEMORY;
+		goto cfg_mac_fltr_exit;
+	}
+
+	if (set)
+		status = ice_add_mac(&vsi->back->hw, &tmp_add_list);
+	else
+		status = ice_remove_mac(&vsi->back->hw, &tmp_add_list);
+
+cfg_mac_fltr_exit:
+	ice_free_fltr_list(&vsi->back->pdev->dev, &tmp_add_list);
+	return status;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h
index 6e43ef03bfc3..969ba27cba95 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_lib.h
@@ -95,4 +95,8 @@ void ice_vsi_free_tx_rings(struct ice_vsi *vsi);
 int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena);
 
 u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran);
+
+enum ice_status
+ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set);
+
 #endif /* !_ICE_LIB_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 6dd806b763ea..f3923dec32b7 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -116,10 +116,9 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf)
  */
 static int ice_init_mac_fltr(struct ice_pf *pf)
 {
-	LIST_HEAD(tmp_add_list);
+	enum ice_status status;
 	u8 broadcast[ETH_ALEN];
 	struct ice_vsi *vsi;
-	int status;
 
 	vsi = ice_find_vsi_by_type(pf, ICE_VSI_PF);
 	if (!vsi)
@@ -130,8 +129,7 @@ static int ice_init_mac_fltr(struct ice_pf *pf)
 	 */
 
 	 /* Add a unicast MAC filter so the VSI can get its packets */
-	status = ice_add_mac_to_list(vsi, &tmp_add_list,
-				     vsi->port_info->mac.perm_addr);
+	status = ice_vsi_cfg_mac_fltr(vsi, vsi->port_info->mac.perm_addr, true);
 	if (status)
 		goto unregister;
 
@@ -139,18 +137,11 @@ static int ice_init_mac_fltr(struct ice_pf *pf)
 	 * MAC address to the list as well.
 	 */
 	eth_broadcast_addr(broadcast);
-	status = ice_add_mac_to_list(vsi, &tmp_add_list, broadcast);
-	if (status)
-		goto free_mac_list;
-
-	/* Program MAC filters for entries in tmp_add_list */
-	status = ice_add_mac(&pf->hw, &tmp_add_list);
+	status = ice_vsi_cfg_mac_fltr(vsi, broadcast, true);
 	if (status)
-		status = -ENOMEM;
-
-free_mac_list:
-	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
+		goto unregister;
 
+	return 0;
 unregister:
 	/* We aren't useful with no MAC filters, so unregister if we
 	 * had an error
@@ -164,7 +155,7 @@ static int ice_init_mac_fltr(struct ice_pf *pf)
 		vsi->netdev = NULL;
 	}
 
-	return status;
+	return -EIO;
 }
 
 /**
@@ -2834,10 +2825,8 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi)
 	struct ice_hw *hw = &pf->hw;
 	struct sockaddr *addr = pi;
 	enum ice_status status;
-	LIST_HEAD(a_mac_list);
-	LIST_HEAD(r_mac_list);
 	u8 flags = 0;
-	int err;
+	int err = 0;
 	u8 *mac;
 
 	mac = (u8 *)addr->sa_data;
@@ -2860,42 +2849,23 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi)
 	/* When we change the MAC address we also have to change the MAC address
 	 * based filter rules that were created previously for the old MAC
 	 * address. So first, we remove the old filter rule using ice_remove_mac
-	 * and then create a new filter rule using ice_add_mac. Note that for
-	 * both these operations, we first need to form a "list" of MAC
-	 * addresses (even though in this case, we have only 1 MAC address to be
-	 * added/removed) and this done using ice_add_mac_to_list. Depending on
-	 * the ensuing operation this "list" of MAC addresses is either to be
-	 * added or removed from the filter.
+	 * and then create a new filter rule using ice_add_mac via
+	 * ice_vsi_cfg_mac_fltr function call for both add and/or remove
+	 * filters.
 	 */
-	err = ice_add_mac_to_list(vsi, &r_mac_list, netdev->dev_addr);
-	if (err) {
-		err = -EADDRNOTAVAIL;
-		goto free_lists;
-	}
-
-	status = ice_remove_mac(hw, &r_mac_list);
+	status = ice_vsi_cfg_mac_fltr(vsi, netdev->dev_addr, false);
 	if (status) {
 		err = -EADDRNOTAVAIL;
-		goto free_lists;
-	}
-
-	err = ice_add_mac_to_list(vsi, &a_mac_list, mac);
-	if (err) {
-		err = -EADDRNOTAVAIL;
-		goto free_lists;
+		goto err_update_filters;
 	}
 
-	status = ice_add_mac(hw, &a_mac_list);
+	status = ice_vsi_cfg_mac_fltr(vsi, mac, true);
 	if (status) {
 		err = -EADDRNOTAVAIL;
-		goto free_lists;
+		goto err_update_filters;
 	}
 
-free_lists:
-	/* free list entries */
-	ice_free_fltr_list(&pf->pdev->dev, &r_mac_list);
-	ice_free_fltr_list(&pf->pdev->dev, &a_mac_list);
-
+err_update_filters:
 	if (err) {
 		netdev_err(netdev, "can't set MAC %pM. filter update failed\n",
 			   mac);
@@ -2911,8 +2881,8 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi)
 	flags = ICE_AQC_MAN_MAC_UPDATE_LAA_WOL;
 	status = ice_aq_manage_mac_write(hw, mac, flags, NULL);
 	if (status) {
-		netdev_err(netdev, "can't set MAC %pM. write to firmware failed.\n",
-			   mac);
+		netdev_err(netdev, "can't set MAC %pM. write to firmware failed error %d\n",
+			   mac, status);
 	}
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index de0a1ef54e83..86637d99ee77 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -540,7 +540,10 @@ static int ice_alloc_vsi_res(struct ice_vf *vf)
 
 	status = ice_add_mac(&pf->hw, &tmp_add_list);
 	if (status)
-		dev_err(&pf->pdev->dev, "could not add mac filters\n");
+		dev_err(&pf->pdev->dev,
+			"could not add mac filters error %d\n", status);
+	else
+		vf->num_mac = 1;
 
 	/* Clear this bit after VF initialization since we shouldn't reclaim
 	 * and reassign interrupts for synchronous or asynchronous VFR events.
@@ -2208,7 +2211,7 @@ ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set)
 	    (struct virtchnl_ether_addr_list *)msg;
 	struct ice_pf *pf = vf->pf;
 	enum virtchnl_ops vc_op;
-	LIST_HEAD(mac_list);
+	enum ice_status status;
 	struct ice_vsi *vsi;
 	int mac_count = 0;
 	int i;
@@ -2282,33 +2285,32 @@ ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set)
 			goto handle_mac_exit;
 		}
 
-		/* get here if maddr is multicast or if VF can change MAC */
-		if (ice_add_mac_to_list(vsi, &mac_list, al->list[i].addr)) {
-			v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		/* program the updated filter list */
+		status = ice_vsi_cfg_mac_fltr(vsi, maddr, set);
+		if (status == ICE_ERR_DOES_NOT_EXIST ||
+		    status == ICE_ERR_ALREADY_EXISTS) {
+			dev_info(&pf->pdev->dev,
+				 "can't %s MAC filters %pM for VF %d, error %d\n",
+				 set ? "add" : "remove", maddr, vf->vf_id,
+				 status);
+		} else if (status) {
+			dev_err(&pf->pdev->dev,
+				"can't %s MAC filters for VF %d, error %d\n",
+				set ? "add" : "remove", vf->vf_id, status);
+			v_ret = ice_err_to_virt_err(status);
 			goto handle_mac_exit;
 		}
+
 		mac_count++;
 	}
 
-	/* program the updated filter list */
+	/* Track number of MAC filters programmed for the VF VSI */
 	if (set)
-		v_ret = ice_err_to_virt_err(ice_add_mac(&pf->hw, &mac_list));
+		vf->num_mac += mac_count;
 	else
-		v_ret = ice_err_to_virt_err(ice_remove_mac(&pf->hw, &mac_list));
-
-	if (v_ret) {
-		dev_err(&pf->pdev->dev,
-			"can't %s MAC filters for VF %d, error %d\n",
-			set ? "add" : "remove", vf->vf_id, v_ret);
-	} else {
-		if (set)
-			vf->num_mac += mac_count;
-		else
-			vf->num_mac -= mac_count;
-	}
+		vf->num_mac -= mac_count;
 
 handle_mac_exit:
-	ice_free_fltr_list(&pf->pdev->dev, &mac_list);
 	/* send the response to the VF */
 	return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0);
 }
-- 
2.21.0


^ permalink raw reply related

* [net-next 07/14] ice: Rename ethtool private flag for lldp
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Dave Ertman, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Dave Ertman <david.m.ertman@intel.com>

The current flag name of "enable-fw-lldp" is a bit cumbersome.

Change priv-flag name to "fw-lldp-agent" with a value of on or
off.  This is more straight-forward in meaning.

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h         | 2 +-
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 4 ++--
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 6 +++---
 drivers/net/ethernet/intel/ice/ice_lib.c     | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 9f9c30d29eb5..99e0febd8e50 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -329,7 +329,7 @@ enum ice_pf_flags {
 	ICE_FLAG_DCB_ENA,
 	ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA,
 	ICE_FLAG_NO_MEDIA,
-	ICE_FLAG_ENABLE_FW_LLDP,
+	ICE_FLAG_FW_LLDP_AGENT,
 	ICE_FLAG_ETHTOOL_CTXT,		/* set when ethtool holds RTNL lock */
 	ICE_PF_FLAGS_NBITS		/* must be last */
 };
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index 4fc9faf5bc71..734cef8eed9e 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -457,7 +457,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked)
 		dev_info(&pf->pdev->dev,
 			 "FW LLDP is disabled, DCBx/LLDP in SW mode.\n");
 		port_info->is_sw_lldp = true;
-		clear_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags);
+		clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags);
 		err = ice_dcb_sw_dflt_cfg(pf, locked);
 		if (err) {
 			dev_err(&pf->pdev->dev,
@@ -473,7 +473,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked)
 	}
 
 	port_info->is_sw_lldp = false;
-	set_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags);
+	set_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags);
 
 	/* DCBX in FW and LLDP enabled in FW */
 	pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_IEEE;
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 6a97ddbbda76..948a33716290 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -155,7 +155,7 @@ struct ice_priv_flag {
 
 static const struct ice_priv_flag ice_gstrings_priv_flags[] = {
 	ICE_PRIV_FLAG("link-down-on-close", ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA),
-	ICE_PRIV_FLAG("enable-fw-lldp", ICE_FLAG_ENABLE_FW_LLDP),
+	ICE_PRIV_FLAG("fw-lldp-agent", ICE_FLAG_FW_LLDP_AGENT),
 };
 
 #define ICE_PRIV_FLAG_ARRAY_SIZE	ARRAY_SIZE(ice_gstrings_priv_flags)
@@ -1201,8 +1201,8 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
 
 	bitmap_xor(change_flags, pf->flags, orig_flags, ICE_PF_FLAGS_NBITS);
 
-	if (test_bit(ICE_FLAG_ENABLE_FW_LLDP, change_flags)) {
-		if (!test_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags)) {
+	if (test_bit(ICE_FLAG_FW_LLDP_AGENT, change_flags)) {
+		if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) {
 			enum ice_status status;
 
 			/* Disable FW LLDP engine */
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index c067ef6be7f4..343d0c305423 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2541,7 +2541,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
 		ice_cfg_sw_lldp(vsi, true, true);
 
 		/* Rx LLDP packets */
-		if (!test_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags))
+		if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags))
 			ice_cfg_sw_lldp(vsi, false, true);
 	}
 
@@ -2888,7 +2888,7 @@ int ice_vsi_release(struct ice_vsi *vsi)
 		/* The Rx rule will only exist to remove if the LLDP FW
 		 * engine is currently stopped
 		 */
-		if (!test_bit(ICE_FLAG_ENABLE_FW_LLDP, pf->flags))
+		if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags))
 			ice_cfg_sw_lldp(vsi, false, false);
 	}
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 06/14] ice: reject VF attempts to enable head writeback
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, nhorman, sassmann, Andrew Bowers,
	Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

The virtchnl interface provides a mechanism for a VF driver to request
head writeback support. This feature is deprecated as of AVF 1.0, but
older versions of a VF driver may still attempt to request the mode.

Since the ice hardware does not support head writeback, we should not
accept Tx queue configuration which attempts to enable it.

Currently, the driver simply assumes that the headwb_enabled bit will
never be set.

If a VF driver does request head writeback, the configuration will
return successfully, even though head writeback is not enabled. This
leaves the VF driver in a non-functional state since it assumes it is
operating in head writeback mode.

Fix the PF driver to reject any attempt to setup headwb_enabled.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 1b1d1ea0c8f9..73ab6222d29b 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -2109,6 +2109,7 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg)
 		if (qpi->txq.vsi_id != qci->vsi_id ||
 		    qpi->rxq.vsi_id != qci->vsi_id ||
 		    qpi->rxq.queue_id != qpi->txq.queue_id ||
+		    qpi->txq.headwb_enabled ||
 		    !ice_vc_isvalid_q_id(vf, qci->vsi_id, qpi->txq.queue_id)) {
 			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 			goto error_param;
-- 
2.21.0

^ permalink raw reply related

* [net-next 08/14] ice: silence some bogus error messages
From: Jeff Kirsher @ 2019-08-23 23:37 UTC (permalink / raw)
  To: davem
  Cc: Mitch Williams, netdev, nhorman, sassmann, Tony Nguyen,
	Andrew Bowers, Jeff Kirsher
In-Reply-To: <20190823233750.7997-1-jeffrey.t.kirsher@intel.com>

From: Mitch Williams <mitch.a.williams@intel.com>

In some circumstances, VF devices can be deactivated while a message is
in-flight. In that case, a series of scary error messages will be
printed in the log. Since these are actually harmless, check for this
case and suppress them. No harm, no foul.

Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h  | 1 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 765e3c2ed045..bf9aa533a7c6 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1610,6 +1610,7 @@ enum ice_aq_err {
 	ICE_AQ_RC_EBUSY		= 12, /* Device or resource busy */
 	ICE_AQ_RC_EEXIST	= 13, /* Object already exists */
 	ICE_AQ_RC_ENOSPC	= 16, /* No space left or allocation failure */
+	ICE_AQ_RC_ENOSYS	= 17, /* Function not implemented */
 };
 
 /* Admin Queue command opcodes */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 73ab6222d29b..83e58e71081e 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -1512,10 +1512,10 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode,
 
 	aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval,
 				       msg, msglen, NULL);
-	if (aq_ret) {
+	if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) {
 		dev_info(&pf->pdev->dev,
-			 "Unable to send the message to VF %d aq_err %d\n",
-			 vf->vf_id, pf->hw.mailboxq.sq_last_status);
+			 "Unable to send the message to VF %d ret %d aq_err %d\n",
+			 vf->vf_id, aq_ret, pf->hw.mailboxq.sq_last_status);
 		return -EIO;
 	}
 
-- 
2.21.0


^ permalink raw reply related

* [net PATCH] net: route dump netlink NLM_F_MULTI flag missing
From: John Fastabend @ 2019-08-24  0:11 UTC (permalink / raw)
  To: sbrivio, davem, dsahern; +Cc: netdev, john.fastabend

An excerpt from netlink(7) man page,

  In multipart messages (multiple nlmsghdr headers with associated payload
  in one byte stream) the first and all following headers have the
  NLM_F_MULTI flag set, except for the last  header  which  has the type
  NLMSG_DONE.

but, after (ee28906) there is a missing NLM_F_MULTI flag in the middle of a
FIB dump. The result is that user space applications following the above
man page excerpt may get confused and may stop parsing the message,
believing something went wrong.

In the golang netlink lib [0] the library logic stops parsing believing the
message is not a multipart message. Found this running Cilium[1] against
net-next while adding a feature to auto-detect routes. I noticed with
multiple route tables we no longer could detect the default routes on net
tree kernels because the library logic was not returning them.

Fix this by handling the fib_dump_info_fnhe() case the same way the
fib_dump_info() handles it by passing the flags argument through the
call chain and adding a flags argument to rt_fill_info().

Tested with Cilium stack and auto-detection of routes works again. Also
annotated libs to dump netlink msgs and inspected NLM_F_MULTI and
NLMSG_DONE flags look correct after this.

Note: In inet_rtm_getroute() pass rt_fill_info() '0' for flags the same
as is done for fib_dump_info() so this looks correct to me.

[0] https://github.com/vishvananda/netlink/
[1] https://github.com/cilium/

Fixes: ee28906fd7a14 ("ipv4: Dump route exceptions if requested")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/route.h |    2 +-
 net/ipv4/fib_trie.c |    2 +-
 net/ipv4/route.c    |   17 ++++++++++-------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 630a0493f1f3..dfce19c9fa96 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -233,7 +233,7 @@ void rt_del_uncached_list(struct rtable *rt);
 
 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		       u32 table_id, struct fib_info *fi,
-		       int *fa_index, int fa_start);
+		       int *fa_index, int fa_start, unsigned int flags);
 
 static inline void ip_rt_put(struct rtable *rt)
 {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b2b3d291ab0..1ab2fb6bb37d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2145,7 +2145,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 		if (filter->dump_exceptions) {
 			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
-						 &i_fa, s_fa);
+						 &i_fa, s_fa, flags);
 			if (err < 0)
 				goto stop;
 		}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 517300d587a7..b6a6f18c3dd1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2728,7 +2728,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
 /* called with rcu_read_lock held */
 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
-			struct sk_buff *skb, u32 portid, u32 seq)
+			struct sk_buff *skb, u32 portid, u32 seq,
+			unsigned int flags)
 {
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
@@ -2736,7 +2737,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 	u32 error;
 	u32 metrics[RTAX_MAX];
 
-	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
@@ -2860,7 +2861,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 			    struct netlink_callback *cb, u32 table_id,
 			    struct fnhe_hash_bucket *bucket, int genid,
-			    int *fa_index, int fa_start)
+			    int *fa_index, int fa_start, unsigned int flags)
 {
 	int i;
 
@@ -2891,7 +2892,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
 					   table_id, NULL, skb,
 					   NETLINK_CB(cb->skb).portid,
-					   cb->nlh->nlmsg_seq);
+					   cb->nlh->nlmsg_seq, flags);
 			if (err)
 				return err;
 next:
@@ -2904,7 +2905,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 
 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		       u32 table_id, struct fib_info *fi,
-		       int *fa_index, int fa_start)
+		       int *fa_index, int fa_start, unsigned int flags)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	int nhsel, genid = fnhe_genid(net);
@@ -2922,7 +2923,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		err = 0;
 		if (bucket)
 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
-					       genid, fa_index, fa_start);
+					       genid, fa_index, fa_start,
+					       flags);
 		rcu_read_unlock();
 		if (err)
 			return err;
@@ -3183,7 +3185,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 				    fl4.flowi4_tos, res.fi, 0);
 	} else {
 		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
-				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+				   NETLINK_CB(in_skb).portid,
+				   nlh->nlmsg_seq, 0);
 	}
 	if (err < 0)
 		goto errout_rcu;


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.