Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next 4/6] bpf, arm32: save 4 bytes of unneeded stack space
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

The extra skb_copy_bits() buffer is not used anymore, therefore
remove the extra 4 byte stack space requirement.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm/net/bpf_jit_32.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 82689b9..d3ea645 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -234,18 +234,11 @@ static void jit_fill_hole(void *area, unsigned int size)
 #define SCRATCH_SIZE 80
 
 /* total stack size used in JITed code */
-#define _STACK_SIZE \
-	(ctx->prog->aux->stack_depth + \
-	 + SCRATCH_SIZE + \
-	 + 4 /* extra for skb_copy_bits buffer */)
-
-#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+#define _STACK_SIZE	(ctx->prog->aux->stack_depth + SCRATCH_SIZE)
+#define STACK_SIZE	ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
 
 /* Get the offset of eBPF REGISTERs stored on scratch space. */
-#define STACK_VAR(off) (STACK_SIZE-off-4)
-
-/* Offset of skb_copy_bits buffer */
-#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+#define STACK_VAR(off) (STACK_SIZE - off)
 
 #if __LINUX_ARM_ARCH__ < 7
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 5/6] bpf, arm64: save 4 bytes of unneeded stack space
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

Follow-up to 816d9ef32a8b ("bpf, arm64: remove ld_abs/ld_ind") in
that the extra 4 byte JIT scratchpad is not needed anymore since it
was in ld_abs/ld_ind as stack buffer for bpf_load_pointer().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm64/net/bpf_jit_comp.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 0b40c8f..85113ca 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -21,7 +21,6 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/printk.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
 #include <asm/byteorder.h>
@@ -188,7 +187,7 @@ static int build_prologue(struct jit_ctx *ctx)
 	 *                        | ... | BPF prog stack
 	 *                        |     |
 	 *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
-	 *                        |RSVD | JIT scratchpad
+	 *                        |RSVD | padding
 	 * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
 	 *                        |     |
 	 *                        | ... | Function call stack
@@ -220,9 +219,7 @@ static int build_prologue(struct jit_ctx *ctx)
 		return -1;
 	}
 
-	/* 4 byte extra for skb_copy_bits buffer */
-	ctx->stack_size = prog->aux->stack_depth + 4;
-	ctx->stack_size = STACK_ALIGN(ctx->stack_size);
+	ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
 
 	/* Set up function call stack */
 	emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 6/6] bpf, arm64: optimize 32/64 immediate emission
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

Improve the JIT to emit 64 and 32 bit immediates, the current
algorithm is not optimal and we often emit more instructions
than actually needed. arm64 has movz, movn, movk variants but
for the current 64 bit immediates we only use movz with a
series of movk when needed.

For example loading ffffffffffffabab emits the following 4
instructions in the JIT today:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: ffff, shift: 16, result: 00000000ffffabab
  * movk: ffff, shift: 32, result: 0000ffffffffabab
  * movk: ffff, shift: 48, result: ffffffffffffabab

Whereas after the patch the same load only needs a single
instruction:

  * movn: 5454, shift:  0, result: ffffffffffffabab

Another example where two extra instructions can be saved:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: 1f2f, shift: 16, result: 000000001f2fabab
  * movk: ffff, shift: 32, result: 0000ffff1f2fabab
  * movk: ffff, shift: 48, result: ffffffff1f2fabab

After the patch:

  * movn: e0d0, shift: 16, result: ffffffff1f2fffff
  * movk: abab, shift:  0, result: ffffffff1f2fabab

Another example with movz, before:

  * movz: 0000, shift:  0, result: 0000000000000000
  * movk: fea0, shift: 32, result: 0000fea000000000

After:

  * movz: fea0, shift: 32, result: 0000fea000000000

Moreover, reuse emit_a64_mov_i() for 32 bit immediates that
are loaded via emit_a64_mov_i64() which is a similar optimization
as done in 6fe8b9c1f41d ("bpf, x64: save several bytes by using
mov over movabsq when possible"). On arm64, the latter allows to
use a single instruction with movn due to zero extension where
otherwise two would be needed. And last but not least add a
missing optimization in emit_a64_mov_i() where movn is used but
the subsequent movk not needed. With some of the Cilium programs
in use, this shrinks the needed instructions by about three
percent. Tested on Cavium ThunderX CN8890.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm64/net/bpf_jit_comp.c | 86 +++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 85113ca..fb75e9c 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -79,23 +79,67 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
 	ctx->idx++;
 }
 
+static inline void emit_a64_mov_i(const int is64, const int reg,
+				  const s32 val, struct jit_ctx *ctx)
+{
+	u16 hi = val >> 16;
+	u16 lo = val & 0xffff;
+
+	if (hi & 0x8000) {
+		if (hi == 0xffff) {
+			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
+		} else {
+			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
+			if (lo != 0xffff)
+				emit(A64_MOVK(is64, reg, lo, 0), ctx);
+		}
+	} else {
+		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
+		if (hi)
+			emit(A64_MOVK(is64, reg, hi, 16), ctx);
+	}
+}
+
+static int i64_i16_blocks(const u64 val, bool inverse)
+{
+	return (((val >>  0) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 24) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000));
+}
+
 static inline void emit_a64_mov_i64(const int reg, const u64 val,
 				    struct jit_ctx *ctx)
 {
-	u64 tmp = val;
-	int shift = 0;
-
-	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
-	tmp >>= 16;
-	shift += 16;
-	while (tmp) {
-		if (tmp & 0xffff)
-			emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
-		tmp >>= 16;
-		shift += 16;
+	u64 nrm_tmp = val, rev_tmp = ~val;
+	bool inverse;
+	int shift;
+
+	if (!(nrm_tmp >> 32))
+		return emit_a64_mov_i(0, reg, (u32)val, ctx);
+
+	inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false);
+	shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) :
+					  (fls64(nrm_tmp) - 1)), 16), 0);
+	if (inverse)
+		emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx);
+	else
+		emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+	shift -= 16;
+	while (shift >= 0) {
+		if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
+			emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+		shift -= 16;
 	}
 }
 
+/*
+ * This is an unoptimized 64 immediate emission used for BPF to BPF call
+ * addresses. It will always do a full 64 bit decomposition as otherwise
+ * more complexity in the last extra pass is required since we previously
+ * reserved 4 instructions for the address.
+ */
 static inline void emit_addr_mov_i64(const int reg, const u64 val,
 				     struct jit_ctx *ctx)
 {
@@ -110,26 +154,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_a64_mov_i(const int is64, const int reg,
-				  const s32 val, struct jit_ctx *ctx)
-{
-	u16 hi = val >> 16;
-	u16 lo = val & 0xffff;
-
-	if (hi & 0x8000) {
-		if (hi == 0xffff) {
-			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
-		} else {
-			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
-			emit(A64_MOVK(is64, reg, lo, 0), ctx);
-		}
-	} else {
-		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
-		if (hi)
-			emit(A64_MOVK(is64, reg, hi, 16), ctx);
-	}
-}
-
 static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 				 const struct jit_ctx *ctx)
 {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 3/6] bpf, x64: clean up retpoline emission slightly
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

Make the RETPOLINE_{RA,ED}X_BPF_JIT() a bit more readable by
cleaning up the macro, aligning comments and spacing.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/include/asm/nospec-branch.h | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 2cd344d..2f700a1d 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -301,9 +301,9 @@ do {									\
  *    jmp *%edx for x86_32
  */
 #ifdef CONFIG_RETPOLINE
-#ifdef CONFIG_X86_64
-# define RETPOLINE_RAX_BPF_JIT_SIZE	17
-# define RETPOLINE_RAX_BPF_JIT()				\
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	17
+#  define RETPOLINE_RAX_BPF_JIT()				\
 do {								\
 	EMIT1_off32(0xE8, 7);	 /* callq do_rop */		\
 	/* spec_trap: */					\
@@ -314,8 +314,8 @@ do {								\
 	EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */	\
 	EMIT1(0xC3);             /* retq */			\
 } while (0)
-#else
-# define RETPOLINE_EDX_BPF_JIT()				\
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
 do {								\
 	EMIT1_off32(0xE8, 7);	 /* call do_rop */		\
 	/* spec_trap: */					\
@@ -326,17 +326,16 @@ do {								\
 	EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */		\
 	EMIT1(0xC3);             /* ret */			\
 } while (0)
-#endif
+# endif
 #else /* !CONFIG_RETPOLINE */
-
-#ifdef CONFIG_X86_64
-# define RETPOLINE_RAX_BPF_JIT_SIZE	2
-# define RETPOLINE_RAX_BPF_JIT()				\
-	EMIT2(0xFF, 0xE0);	 /* jmp *%rax */
-#else
-# define RETPOLINE_EDX_BPF_JIT()				\
-	EMIT2(0xFF, 0xE2) /* jmp *%edx */
-#endif
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	2
+#  define RETPOLINE_RAX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE0);       /* jmp *%rax */
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE2)        /* jmp *%edx */
+# endif
 #endif
 
 #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 2/6] bpf, sparc: remove unused variable
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

Since fe83963b7c38 ("bpf, sparc64: remove ld_abs/ld_ind") it's not
used anymore therefore remove it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/sparc/net/bpf_jit_comp_64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 9f5918e..222785a 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -894,7 +894,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	const int i = insn - ctx->prog->insnsi;
 	const s16 off = insn->off;
 	const s32 imm = insn->imm;
-	u32 *func;
 
 	if (insn->src_reg == BPF_REG_FP)
 		ctx->saw_frame_pointer = true;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 1/6] bpf, mips: remove unused function
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180514002612.12083-1-daniel@iogearbox.net>

The ool_skb_header_pointer() and size_to_len() is unused same as
tmp_offset, therefore remove all of them.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/mips/net/ebpf_jit.c | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 7ba7df9..aeb7b1b 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -95,7 +95,6 @@ enum reg_val_type {
  * struct jit_ctx - JIT context
  * @skf:		The sk_filter
  * @stack_size:		eBPF stack size
- * @tmp_offset:		eBPF $sp offset to 8-byte temporary memory
  * @idx:		Instruction index
  * @flags:		JIT flags
  * @offsets:		Instruction offsets
@@ -105,7 +104,6 @@ enum reg_val_type {
 struct jit_ctx {
 	const struct bpf_prog *skf;
 	int stack_size;
-	int tmp_offset;
 	u32 idx;
 	u32 flags;
 	u32 *offsets;
@@ -293,7 +291,6 @@ static int gen_int_prologue(struct jit_ctx *ctx)
 	locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0;
 
 	stack_adjust += locals_size;
-	ctx->tmp_offset = locals_size;
 
 	ctx->stack_size = stack_adjust;
 
@@ -399,7 +396,6 @@ static void gen_imm_to_reg(const struct bpf_insn *insn, int reg,
 		emit_instr(ctx, lui, reg, upper >> 16);
 		emit_instr(ctx, addiu, reg, reg, lower);
 	}
-
 }
 
 static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
@@ -547,28 +543,6 @@ static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	return 0;
 }
 
-static void * __must_check
-ool_skb_header_pointer(const struct sk_buff *skb, int offset,
-		       int len, void *buffer)
-{
-	return skb_header_pointer(skb, offset, len, buffer);
-}
-
-static int size_to_len(const struct bpf_insn *insn)
-{
-	switch (BPF_SIZE(insn->code)) {
-	case BPF_B:
-		return 1;
-	case BPF_H:
-		return 2;
-	case BPF_W:
-		return 4;
-	case BPF_DW:
-		return 8;
-	}
-	return 0;
-}
-
 static void emit_const_to_reg(struct jit_ctx *ctx, int dst, u64 value)
 {
 	if (value >= 0xffffffffffff8000ull || value < 0x8000ull) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 0/6] Minor follow-up cleanups in BPF JITs and optimized imm emission
From: Daniel Borkmann @ 2018-05-14  0:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann

This series follows up mostly with with some minor cleanups on top
of 'Move ld_abs/ld_ind to native BPF' as well as implements better
32/64 bit immediate load into register for the arm64 JIT. For details
please see individual patches. Thanks!

Daniel Borkmann (6):
  bpf, mips: remove unused function
  bpf, sparc: remove unused variable
  bpf, x64: clean up retpoline emission slightly
  bpf, arm32: save 4 bytes of unneeded stack space
  bpf, arm64: save 4 bytes of unneeded stack space
  bpf, arm64: optimize 32/64 immediate emission

 arch/arm/net/bpf_jit_32.c            | 13 ++---
 arch/arm64/net/bpf_jit_comp.c        | 93 ++++++++++++++++++++++--------------
 arch/mips/net/ebpf_jit.c             | 26 ----------
 arch/sparc/net/bpf_jit_comp_64.c     |  1 -
 arch/x86/include/asm/nospec-branch.h | 29 ++++++-----
 5 files changed, 74 insertions(+), 88 deletions(-)

-- 
2.9.5

^ permalink raw reply

* Re: [PATCH] 3c59x: convert to generic DMA API
From: David Miller @ 2018-05-14  0:23 UTC (permalink / raw)
  To: hch; +Cc: netdev, linux-pci, linux-kernel, tedheadster
In-Reply-To: <20180512101650.1693-1-hch@lst.de>

From: Christoph Hellwig <hch@lst.de>
Date: Sat, 12 May 2018 12:16:50 +0200

> This driver supports EISA devices in addition to PCI devices, and relied
> on the legacy behavior of the pci_dma* shims to pass on a NULL pointer
> to the DMA API, and the DMA API being able to handle that.  When the
> NULL forwarding broke the EISA support got broken.  Fix this by converting
> to the DMA API instead of the legacy PCI shims.
> 
> Fixes: 4167b2ad ("PCI: Remove NULL device handling from PCI DMA API")
> Reported-by: tedheadster <tedheadster@gmail.com>
> Tested-by: tedheadster <tedheadster@gmail.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net] xfrm6: avoid potential infinite loop in _decode_session6()
From: David Miller @ 2018-05-14  0:23 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, eric.dumazet, steffen.klassert, nicolas.dichtel
In-Reply-To: <20180512094930.77801-1-edumazet@google.com>

From: Eric Dumazet <edumazet@google.com>
Date: Sat, 12 May 2018 02:49:30 -0700

> syzbot found a way to trigger an infinitie loop by overflowing
> @offset variable that has been forced to use u16 for some very
> obscure reason in the past.
> 
> We probably want to look at NEXTHDR_FRAGMENT handling which looks
> wrong, in a separate patch.
> 
> In net-next, we shall try to use skb_header_pointer() instead of
> pskb_may_pull().
 ...
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> Reported-by: syzbot+0053c8...@syzkaller.appspotmail.com

Steffen, I am assuming you will pick this up.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next v2 3/8] sctp: move the flush of ctrl chunks into its own function
From: Marcelo Ricardo Leitner @ 2018-05-14  0:21 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Neil Horman, Vlad Yasevich, Xin Long
In-Reply-To: <629f02a784ff90761adee451a870c7548069287d.1526142784.git.marcelo.leitner@gmail.com>

On Sat, May 12, 2018 at 07:21:01PM -0300, Marcelo Ricardo Leitner wrote:
> Named sctp_outq_flush_ctrl and, with that, keep the contexts contained.

Trinity triggered a panic on this patch. I'm debugging it.

^ permalink raw reply

* Re: [PATCH net] packet: in packet_snd start writing at link layer allocation
From: David Miller @ 2018-05-14  0:20 UTC (permalink / raw)
  To: willemdebruijn.kernel; +Cc: netdev, eric.dumazet, willemb
In-Reply-To: <20180511172425.213901-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Date: Fri, 11 May 2018 13:24:25 -0400

> From: Willem de Bruijn <willemb@google.com>
> 
> Packet sockets allow construction of packets shorter than
> dev->hard_header_len to accommodate protocols with variable length
> link layer headers. These packets are padded to dev->hard_header_len,
> because some device drivers interpret that as a minimum packet size.
> 
> packet_snd reserves dev->hard_header_len bytes on allocation.
> SOCK_DGRAM sockets call skb_push in dev_hard_header() to ensure that
> link layer headers are stored in the reserved range. SOCK_RAW sockets
> do the same in tpacket_snd, but not in packet_snd.
> 
> Syzbot was able to send a zero byte packet to a device with massive
> 116B link layer header, causing padding to cross over into skb_shinfo.
> Fix this by writing from the start of the llheader reserved range also
> in the case of packet_snd/SOCK_RAW.
> 
> Update skb_set_network_header to the new offset. This also corrects
> it for SOCK_DGRAM, where it incorrectly double counted reserve due to
> the skb_push in dev_hard_header.
> 
> Fixes: 9ed988cd5915 ("packet: validate variable length ll headers")
> Reported-by: syzbot+71d74a5406d02057d559@syzkaller.appspotmail.com
> Signed-off-by: Willem de Bruijn <willemb@google.com>

Applied and queued up for -stable, thanks Willem.

^ permalink raw reply

* Re: [PATCH net V2] tun: fix use after free for ptr_ring
From: David Miller @ 2018-05-14  0:18 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, linux-kernel, xiyou.wangcong, eric.dumazet, mst
In-Reply-To: <1526006965-9124-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Fri, 11 May 2018 10:49:25 +0800

> We used to initialize ptr_ring during TUNSETIFF, this is because its
> size depends on the tx_queue_len of netdevice. And we try to clean it
> up when socket were detached from netdevice. A race were spotted when
> trying to do uninit during a read which will lead a use after free for
> pointer ring. Solving this by always initialize a zero size ptr_ring
> in open() and do resizing during TUNSETIFF, and then we can safely do
> cleanup during close(). With this, there's no need for the workaround
> that was introduced by commit 4df0bfc79904 ("tun: fix a memory leak
> for tfile->tx_array").
> 
> Reported-by: syzbot+e8b902c3c3fadf0a9dba@syzkaller.appspotmail.com
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Cong Wang <xiyou.wangcong@gmail.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Fixes: 1576d9860599 ("tun: switch to use skb array for tx")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Jason, please address Cong Wang's concerns.

Thank you.

^ permalink raw reply

* Re: [PATCH] [PATCH net v3] ipv6: remove min MTU check for ipsec tunnels
From: David Miller @ 2018-05-14  0:14 UTC (permalink / raw)
  To: ashwanth; +Cc: netdev, pabeni, dsahern, eric.dumazet
In-Reply-To: <1525974344-2112-1-git-send-email-ashwanth@codeaurora.org>

From: Ashwanth Goli <ashwanth@codeaurora.org>
Date: Thu, 10 May 2018 23:15:44 +0530

> With 749439bfac "fix udpv6 sendmsg crash caused by too small MTU"
> ipsec tunnels that report a MTU less than IPV6_MIN_MTU are broken
> even for packets that are smaller than IPV6_MIN_MTU.
> 
> According to rfc2473#section-7.1
> 
>     if the original IPv6 packet is equal or  smaller  than  the
>     IPv6 minimum link MTU, the tunnel entry-point node
>     encapsulates the original packet, and subsequently
>     fragments the resulting IPv6 tunnel packet into IPv6
>     fragments that do not exceed the Path MTU to the tunnel
>     exit-point.
> 
> Dropping the MTU check for ipsec tunnel destinations.
> 
> Signed-off-by: Ashwanth Goli <ashwanth@codeaurora.org>

I still hold the fundamental objection to your change which I expressed
from the beginning.

I'll try to be more clear.

This RFC language is talking about tunnels _IN GENERAL_.

Your patch is modifying behavior only for IPSEC tunnels.

That doesn't make any sense.

Either the RFC language is to be followed, and therefore applied
to all tunnels.  Or it doesn't.

I don't see any logic which makes it such that only IPSEC tunnels
should follow this MTU checking rule.

Please do not resubmit this patch again until you resolve this
issue.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next v8 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: David Miller @ 2018-05-14  0:11 UTC (permalink / raw)
  To: rahul.lakkireddy
  Cc: netdev, kexec, linux-fsdevel, linux-kernel, viro, ebiederm,
	stephen, akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <cover.1525253481.git.rahul.lakkireddy@chelsio.com>

I'm deferring this patch series.

If we can't get a reasonable review from an interested party in 10+
days, that is not reasonable.

Resubmit this once someone reviews it properly.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next v9 1/7] sched: Add Common Applications Kept Enhanced (cake) qdisc
From: David Miller @ 2018-05-14  0:09 UTC (permalink / raw)
  To: toke; +Cc: netdev, cake
In-Reply-To: <152579005972.4805.2514975750552805066.stgit@alrua-kau>

From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 08 May 2018 16:34:19 +0200

> +struct cake_flow {
> +	/* this stuff is all needed per-flow at dequeue time */
> +	struct sk_buff	  *head;
> +	struct sk_buff	  *tail;

Please do not invent your own SKB list handling mechanism.

Use a standard sk_buff_head.  If you don't need the locking etc., we
have interfaces like __skb_queue_head_init(), __skb_insert(), etc.
which will accomodate that.

> +static void cake_heapify(struct cake_sched_data *q, u16 i)
> +{
> +	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
> +	u32 m = i;
> +	u32 mb = cake_heap_get_backlog(q, m);

Please order local variables from longest to shortest line.

The entire submissions has this problem all over the place, please
correct it patch-series wide.

Thank you.

^ permalink raw reply

* [PATCH] net/smc: check for missing nlattrs in SMC_PNETID messages
From: Eric Biggers @ 2018-05-14  0:01 UTC (permalink / raw)
  To: Ursula Braun, Thomas Richter, linux-s390
  Cc: netdev, syzkaller-bugs, linux-kernel, Eric Biggers
In-Reply-To: <001a113f9bb83e4d560568457853@google.com>

From: Eric Biggers <ebiggers@google.com>

It's possible to crash the kernel in several different ways by sending
messages to the SMC_PNETID generic netlink family that are missing the
expected attributes:

- Missing SMC_PNETID_NAME => null pointer dereference when comparing
  names.
- Missing SMC_PNETID_ETHNAME => null pointer dereference accessing
  smc_pnetentry::ndev.
- Missing SMC_PNETID_IBNAME => null pointer dereference accessing
  smc_pnetentry::smcibdev.
- Missing SMC_PNETID_IBPORT => out of bounds array access to
  smc_ib_device::pattr[-1].

Fix it by validating that all expected attributes are present and that
SMC_PNETID_IBPORT is nonzero.

Reported-by: syzbot+5cd61039dc9b8bfa6e47@syzkaller.appspotmail.com
Fixes: 6812baabf24d ("smc: establish pnet table management")
Cc: <stable@vger.kernel.org> # v4.11+
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 net/smc/smc_pnet.c | 71 ++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 74568cdbca708..d7b88b2d1b224 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -245,40 +245,45 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
 static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
 			       struct nlattr *tb[])
 {
-	char *string, *ibname = NULL;
-	int rc = 0;
+	char *string, *ibname;
+	int rc;
 
 	memset(pnetelem, 0, sizeof(*pnetelem));
 	INIT_LIST_HEAD(&pnetelem->list);
-	if (tb[SMC_PNETID_NAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
-		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_ETHNAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
-		pnetelem->ndev = dev_get_by_name(net, string);
-		if (!pnetelem->ndev)
-			return -ENOENT;
-	}
-	if (tb[SMC_PNETID_IBNAME]) {
-		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
-		ibname = strim(ibname);
-		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
-		if (!pnetelem->smcibdev) {
-			rc = -ENOENT;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_IBPORT]) {
-		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
-		if (pnetelem->ib_port > SMC_MAX_PORTS) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_NAME])
+		goto error;
+	string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+	if (!smc_pnetid_valid(string, pnetelem->pnet_name))
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_ETHNAME])
+		goto error;
+	rc = -ENOENT;
+	string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+	pnetelem->ndev = dev_get_by_name(net, string);
+	if (!pnetelem->ndev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBNAME])
+		goto error;
+	rc = -ENOENT;
+	ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+	ibname = strim(ibname);
+	pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+	if (!pnetelem->smcibdev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBPORT])
+		goto error;
+	pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+	if (pnetelem->ib_port < 1 || pnetelem->ib_port > SMC_MAX_PORTS)
+		goto error;
+
 	return 0;
 
 error:
@@ -307,6 +312,8 @@ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
 	void *hdr;
 	int rc;
 
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	pnetelem = smc_pnet_find_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 	if (!pnetelem)
@@ -359,6 +366,8 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
 
 static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
 {
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	return smc_pnet_remove_by_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 }
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2] net/mlx4_core: Fix error handling in mlx4_init_port_info.
From: Tarick Bedeir @ 2018-05-13 23:38 UTC (permalink / raw)
  To: tariqt, gthelen, netdev, linux-rdma, linux-kernel, tarick
  Cc: Greg Thelen, netdev, linux-rdma, linux-kernel, Tarick Bedeir

Avoid exiting the function with a lingering sysfs file (if the first
call to device_create_file() fails while the second succeeds), and avoid
calling devlink_port_unregister() twice.

In other words, either mlx4_init_port_info() succeeds and returns zero, or
it fails, returns non-zero, and requires no cleanup.

Fixes: 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB
ports")
Signed-off-by: Tarick Bedeir <tarick@google.com>
---
v1 -> v2: Added "Fixes" tag.
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 4d84cab77105..e8a3a45d0b53 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3007,6 +3007,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 		mlx4_err(dev, "Failed to create file for port %d\n", port);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}

 	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
@@ -3028,9 +3029,10 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 				   &info->port_attr);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}

-	return err;
+	return 0;
 }

 static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH] net: ethernet: ti: Use ERR_CAST instead of ERR_PTR(PTR_ERR())
From: Hernán Gonzalez @ 2018-05-13 23:33 UTC (permalink / raw)
  To: davem
  Cc: grygorii.strashko, richardcochran, netdev, linux-kernel,
	Hernán Gonzalez

Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)).

drivers/net/ethernet/ti/cpts.c:567:9-16: WARNING: ERR_CAST can be used with cpts->refclk
Generated by: scripts/coccinelle/api/err_cast.cocci

Signed-off-by: Hernán Gonzalez <hernan@vanguardiasur.com.ar>
---
 drivers/net/ethernet/ti/cpts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index e7b76f6..7842f09 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -564,7 +564,7 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs,
 	cpts->refclk = devm_clk_get(dev, "cpts");
 	if (IS_ERR(cpts->refclk)) {
 		dev_err(dev, "Failed to get cpts refclk\n");
-		return ERR_PTR(PTR_ERR(cpts->refclk));
+		return ERR_CAST(cpts->refclk);
 	}
 
 	clk_prepare(cpts->refclk);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 13/15] netfilter: nft_compat: fix handling of large matchinfo size
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

currently matchinfo gets stored in the expression, but some xt matches
are very large.

To handle those we either need to switch nft core to kvmalloc and increase
size limit, or allocate the info blob of large matches separately.

This does the latter, this limits the scope of the changes to
nft_compat.

I picked a threshold of 192, this allows most matches to work as before and
handle only few ones via separate alloation (cgroup, u32, sctp, rt).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 64 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index dec0afb0ffe0..1d99a1efdafc 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -36,6 +36,13 @@ struct nft_xt {
 	struct rcu_head		rcu_head;
 };
 
+/* Used for matches where *info is larger than X byte */
+#define NFT_MATCH_LARGE_THRESH	192
+
+struct nft_xt_match_priv {
+	void *info;
+};
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (--xt->refcnt == 0) {
@@ -352,6 +359,15 @@ static void __nft_match_eval(const struct nft_expr *expr,
 	}
 }
 
+static void nft_match_large_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+	__nft_match_eval(expr, regs, pkt, priv->info);
+}
+
 static void nft_match_eval(const struct nft_expr *expr,
 			   struct nft_regs *regs,
 			   const struct nft_pktinfo *pkt)
@@ -458,6 +474,24 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr));
 }
 
+static int
+nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+	struct xt_match *m = expr->ops->data;
+	int ret;
+
+	priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+	if (!priv->info)
+		return -ENOMEM;
+
+	ret = __nft_match_init(ctx, expr, tb, priv->info);
+	if (ret)
+		kfree(priv->info);
+	return ret;
+}
+
 static void
 __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		    void *info)
@@ -482,6 +516,15 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }
 
+static void
+nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+	__nft_match_destroy(ctx, expr, priv->info);
+	kfree(priv->info);
+}
+
 static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
 			    void *info)
 {
@@ -503,6 +546,13 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return __nft_match_dump(skb, expr, nft_expr_priv(expr));
 }
 
+static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+{
+	struct nft_xt_match_priv *priv = nft_expr_priv(e);
+
+	return __nft_match_dump(skb, e, priv->info);
+}
+
 static int nft_match_validate(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nft_data **data)
@@ -670,6 +720,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 {
 	struct nft_xt *nft_match;
 	struct xt_match *match;
+	unsigned int matchsize;
 	char *mt_name;
 	u32 rev, family;
 	int err;
@@ -709,7 +760,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->refcnt = 0;
 	nft_match->ops.type = &nft_match_type;
-	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
@@ -717,6 +767,18 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	nft_match->ops.validate = nft_match_validate;
 	nft_match->ops.data = match;
 
+	matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
+	if (matchsize > NFT_MATCH_LARGE_THRESH) {
+		matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv));
+
+		nft_match->ops.eval = nft_match_large_eval;
+		nft_match->ops.init = nft_match_large_init;
+		nft_match->ops.destroy = nft_match_large_destroy;
+		nft_match->ops.dump = nft_match_large_dump;
+	}
+
+	nft_match->ops.size = matchsize;
+
 	list_add(&nft_match->head, &nft_match_list);
 
 	return &nft_match->ops;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 15/15] netfilter: nf_tables: fix memory leak on error exit return
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Colin Ian King <colin.king@canonical.com>

Currently the -EBUSY error return path is not free'ing resources
allocated earlier, leaving a memory leak. Fix this by exiting via the
error exit label err5 that performs the necessary resource clean
up.

Detected by CoverityScan, CID#1432975 ("Resource leak")

Fixes: 9744a6fcefcb ("netfilter: nf_tables: check if same extensions are set when adding elements")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3806db31cbbf..91e80aa852d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4080,8 +4080,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
 			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
 			    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
-			    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
-				return -EBUSY;
+			    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
+				err = -EBUSY;
+				goto err5;
+			}
 			if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
 			     nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
 			     memcmp(nft_set_ext_data(ext),
-- 
2.11.0

^ permalink raw reply related

* [PATCH 12/15] netfilter: nft_compat: prepare for indirect info storage
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Next patch will make it possible for *info to be stored in
a separate allocation instead of the expr private area.

This removes the 'expr priv area is info blob' assumption
from the match init/destroy/eval functions.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 47 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 870d8c29dae9..dec0afb0ffe0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -324,11 +324,11 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_match_eval(const struct nft_expr *expr,
-			   struct nft_regs *regs,
-			   const struct nft_pktinfo *pkt)
+static void __nft_match_eval(const struct nft_expr *expr,
+			     struct nft_regs *regs,
+			     const struct nft_pktinfo *pkt,
+			     void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 	struct sk_buff *skb = pkt->skb;
 	bool ret;
@@ -352,6 +352,13 @@ static void nft_match_eval(const struct nft_expr *expr,
 	}
 }
 
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
+{
+	__nft_match_eval(expr, regs, pkt, nft_expr_priv(expr));
+}
+
 static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
 	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
 	[NFTA_MATCH_REV]	= { .type = NLA_U32 },
@@ -412,10 +419,10 @@ static void match_compat_from_user(struct xt_match *m, void *in, void *out)
 }
 
 static int
-nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-		const struct nlattr * const tb[])
+__nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		 const struct nlattr * const tb[],
+		 void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 	struct xt_mtchk_param par;
 	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
@@ -444,11 +451,18 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	return 0;
 }
 
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+	       const struct nlattr * const tb[])
+{
+	return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr));
+}
+
 static void
-nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+__nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		    void *info)
 {
 	struct xt_match *match = expr->ops->data;
-	void *info = nft_expr_priv(expr);
 	struct xt_mtdtor_param par;
 
 	par.net = ctx->net;
@@ -462,9 +476,15 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 		module_put(match->me);
 }
 
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
+}
+
+static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			    void *info)
 {
-	void *info = nft_expr_priv(expr);
 	struct xt_match *match = expr->ops->data;
 
 	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
@@ -478,6 +498,11 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	return __nft_match_dump(skb, expr, nft_expr_priv(expr));
+}
+
 static int nft_match_validate(const struct nft_ctx *ctx,
 			      const struct nft_expr *expr,
 			      const struct nft_data **data)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 14/15] netfilter: nf_tables: bogus EBUSY in chain deletions
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

When removing a rule that jumps to chain and such chain in the same
batch, this bogusly hits EBUSY. Add activate and deactivate operations
to expression that can be called from the preparation and the
commit/abort phases.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  5 +++++
 net/netfilter/nf_tables_api.c     | 46 +++++++++++++++++++++++++++++++++++----
 net/netfilter/nft_immediate.c     | 15 ++++++++++---
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cd368d1b8cb8..a1e28dd5d0bf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -170,6 +170,7 @@ struct nft_data_desc {
 int nft_data_init(const struct nft_ctx *ctx,
 		  struct nft_data *data, unsigned int size,
 		  struct nft_data_desc *desc, const struct nlattr *nla);
+void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
 void nft_data_release(const struct nft_data *data, enum nft_data_types type);
 int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
 		  enum nft_data_types type, unsigned int len);
@@ -736,6 +737,10 @@ struct nft_expr_ops {
 	int				(*init)(const struct nft_ctx *ctx,
 						const struct nft_expr *expr,
 						const struct nlattr * const tb[]);
+	void				(*activate)(const struct nft_ctx *ctx,
+						    const struct nft_expr *expr);
+	void				(*deactivate)(const struct nft_ctx *ctx,
+						      const struct nft_expr *expr);
 	void				(*destroy)(const struct nft_ctx *ctx,
 						   const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 785d7fcf1fe1..3806db31cbbf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -214,6 +214,34 @@ static int nft_delchain(struct nft_ctx *ctx)
 	return err;
 }
 
+static void nft_rule_expr_activate(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
+{
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	while (expr != nft_expr_last(rule) && expr->ops) {
+		if (expr->ops->activate)
+			expr->ops->activate(ctx, expr);
+
+		expr = nft_expr_next(expr);
+	}
+}
+
+static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
+				     struct nft_rule *rule)
+{
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	while (expr != nft_expr_last(rule) && expr->ops) {
+		if (expr->ops->deactivate)
+			expr->ops->deactivate(ctx, expr);
+
+		expr = nft_expr_next(expr);
+	}
+}
+
 static int
 nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
 {
@@ -259,6 +287,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
 		nft_trans_destroy(trans);
 		return err;
 	}
+	nft_rule_expr_deactivate(ctx, rule);
 
 	return 0;
 }
@@ -2238,6 +2267,13 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
 	kfree(rule);
 }
 
+static void nf_tables_rule_release(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
+{
+	nft_rule_expr_deactivate(ctx, rule);
+	nf_tables_rule_destroy(ctx, rule);
+}
+
 #define NFT_RULE_MAXEXPRS	128
 
 static struct nft_expr_info *info;
@@ -2402,7 +2438,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	return 0;
 
 err2:
-	nf_tables_rule_destroy(&ctx, rule);
+	nf_tables_rule_release(&ctx, rule);
 err1:
 	for (i = 0; i < n; i++) {
 		if (info[i].ops != NULL)
@@ -4130,7 +4166,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
  *	NFT_GOTO verdicts. This function must be called on active data objects
  *	from the second phase of the commit protocol.
  */
-static void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
+void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
 {
 	if (type == NFT_DATA_VERDICT) {
 		switch (data->verdict.code) {
@@ -6015,10 +6051,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWRULE:
 			trans->ctx.chain->use--;
 			list_del_rcu(&nft_trans_rule(trans)->list);
+			nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans));
 			break;
 		case NFT_MSG_DELRULE:
 			trans->ctx.chain->use++;
 			nft_clear(trans->ctx.net, nft_trans_rule(trans));
+			nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSET:
@@ -6594,7 +6632,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 	list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
 		list_del(&rule->list);
 		ctx->chain->use--;
-		nf_tables_rule_destroy(ctx, rule);
+		nf_tables_rule_release(ctx, rule);
 	}
 	list_del(&ctx->chain->list);
 	ctx->table->use--;
@@ -6632,7 +6670,7 @@ static void __nft_release_tables(struct net *net)
 			list_for_each_entry_safe(rule, nr, &chain->rules, list) {
 				list_del(&rule->list);
 				chain->use--;
-				nf_tables_rule_destroy(&ctx, rule);
+				nf_tables_rule_release(&ctx, rule);
 			}
 		}
 		list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 4717d7796927..aa87ff8beae8 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -69,8 +69,16 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
 	return err;
 }
 
-static void nft_immediate_destroy(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr)
+static void nft_immediate_activate(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static void nft_immediate_deactivate(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
@@ -108,7 +116,8 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
 	.eval		= nft_immediate_eval,
 	.init		= nft_immediate_init,
-	.destroy	= nft_immediate_destroy,
+	.activate	= nft_immediate_activate,
+	.deactivate	= nft_immediate_deactivate,
 	.dump		= nft_immediate_dump,
 	.validate	= nft_immediate_validate,
 };
-- 
2.11.0

^ permalink raw reply related

* [PATCH 11/15] netfilter: nf_tables: don't assume chain stats are set when jumplabel is set
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

nft_chain_stats_replace() and all other spots assume ->stats can be
NULL, but nft_update_chain_stats does not.  It must do this check,
just because the jump label is set doesn't mean all basechains have stats
assigned.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index dfd0bf3810d2..942702a2776f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -119,15 +119,22 @@ DEFINE_STATIC_KEY_FALSE(nft_counters_enabled);
 static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 					    const struct nft_pktinfo *pkt)
 {
+	struct nft_base_chain *base_chain;
 	struct nft_stats *stats;
 
-	local_bh_disable();
-	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(chain)->stats));
-	u64_stats_update_begin(&stats->syncp);
-	stats->pkts++;
-	stats->bytes += pkt->skb->len;
-	u64_stats_update_end(&stats->syncp);
-	local_bh_enable();
+	base_chain = nft_base_chain(chain);
+	if (!base_chain->stats)
+		return;
+
+	stats = this_cpu_ptr(rcu_dereference(base_chain->stats));
+	if (stats) {
+		local_bh_disable();
+		u64_stats_update_begin(&stats->syncp);
+		stats->pkts++;
+		stats->bytes += pkt->skb->len;
+		u64_stats_update_end(&stats->syncp);
+		local_bh_enable();
+	}
 }
 
 struct nft_jumpstack {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 10/15] netfilter: x_tables: add module alias for icmp matches
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

The icmp matches are implemented in ip_tables and ip6_tables,
respectively, so for normal iptables they are always available:
those modules are loaded once iptables calls getsockopt() to fetch
available module revisions.

In iptables-over-nftables case probing occurs via nfnetlink, so
these modules might not be loaded.  Add aliases so modprobe can load
these when icmp/icmp6 is requested.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 44b308d93ec2..e85f35b89c49 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -34,6 +34,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv4 packet filter");
+MODULE_ALIAS("ipt_icmp");
 
 void *ipt_alloc_initial_table(const struct xt_table *info)
 {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 65c9e1a58305..97f79dc943d7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -38,6 +38,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv6 packet filter");
+MODULE_ALIAS("ip6t_icmp6");
 
 void *ip6t_alloc_initial_table(const struct xt_table *info)
 {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 09/15] netfilter: prefer nla_strlcpy for dealing with NLA_STRING attributes
From: Pablo Neira Ayuso @ 2018-05-13 22:36 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180513223656.10077-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

fixes these warnings:
'nfnl_cthelper_create' at net/netfilter/nfnetlink_cthelper.c:237:2,
'nfnl_cthelper_new' at net/netfilter/nfnetlink_cthelper.c:450:9:
./include/linux/string.h:246:9: warning: '__builtin_strncpy' specified bound 16 equals destination size [-Wstringop-truncation]
  return __builtin_strncpy(p, q, size);
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Moreover, strncpy assumes null-terminated source buffers, but thats
not the case here.
Unlike strlcpy, nla_strlcpy *does* pad the destination buffer
while also considering nla attribute size.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_acct.c     | 2 +-
 net/netfilter/nfnetlink_cthelper.c | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index b9505bcd3827..6ddf89183e7b 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
 		nfacct->flags = flags;
 	}
 
-	strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+	nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
 
 	if (tb[NFACCT_BYTES]) {
 		atomic64_set(&nfacct->bytes,
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 4a4b293fb2e5..fa026b269b36 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -149,8 +149,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
 	    !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
 		return -EINVAL;
 
-	strncpy(expect_policy->name,
-		nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+	nla_strlcpy(expect_policy->name,
+		    nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
 	expect_policy->max_expected =
 		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
 	if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
@@ -234,7 +234,8 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
 	if (ret < 0)
 		goto err1;
 
-	strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+	nla_strlcpy(helper->name,
+		    nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
 	size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
 	if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
 		ret = -ENOMEM;
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox