* [PATCH bpf-next 12/12] bpf: sync tools bpf.h uapi header
From: Daniel Borkmann @ 2018-05-03 1:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20180503010536.7917-1-daniel@iogearbox.net>
Only sync the header from include/uapi/linux/bpf.h.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
tools/include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8daef73..83a95ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1801,6 +1801,30 @@ union bpf_attr {
* Return
* a non-negative value equal to or less than size on success, or
* a negative error in case of failure.
+ *
+ * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * Description
+ * This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * it provides an easy way to load *len* bytes from *offset*
+ * from the packet associated to *skb*, into the buffer pointed
+ * by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * a fifth argument *start_header* exists in order to select a
+ * base offset to start from. *start_header* can be one of:
+ *
+ * **BPF_HDR_START_MAC**
+ * Base offset to load data from is *skb*'s mac header.
+ * **BPF_HDR_START_NET**
+ * Base offset to load data from is *skb*'s network header.
+ *
+ * In general, "direct packet access" is the preferred method to
+ * access packet data, however, this helper is in particular useful
+ * in socket filters where *skb*\ **->data** does not always point
+ * to the start of the mac header and where "direct packet access"
+ * is not available.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -1870,7 +1894,8 @@ union bpf_attr {
FN(bind), \
FN(xdp_adjust_tail), \
FN(skb_get_xfrm_state), \
- FN(get_stack),
+ FN(get_stack), \
+ FN(skb_load_bytes_relative),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -1931,6 +1956,12 @@ enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
};
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+ BPF_HDR_START_MAC,
+ BPF_HDR_START_NET,
+};
+
/* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure
*/
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next 11/12] bpf, s390x: remove ld_abs/ld_ind
From: Daniel Borkmann @ 2018-05-03 1:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann, Michael Holzheu
In-Reply-To: <20180503010536.7917-1-daniel@iogearbox.net>
Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from s390x JIT.
Tested on s390x instance on LinuxONE.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
arch/s390/net/Makefile | 2 +-
arch/s390/net/bpf_jit.S | 116 ---------------------------------------
arch/s390/net/bpf_jit.h | 20 +------
arch/s390/net/bpf_jit_comp.c | 127 ++++---------------------------------------
4 files changed, 13 insertions(+), 252 deletions(-)
delete mode 100644 arch/s390/net/bpf_jit.S
diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile
index e0d5f24..d4663b4 100644
--- a/arch/s390/net/Makefile
+++ b/arch/s390/net/Makefile
@@ -2,4 +2,4 @@
#
# Arch-specific network modules
#
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S
deleted file mode 100644
index 25bb464..0000000
--- a/arch/s390/net/bpf_jit.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * BPF Jit compiler for s390, help functions.
- *
- * Copyright IBM Corp. 2012,2015
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include "bpf_jit.h"
-
-/*
- * Calling convention:
- * registers %r7-%r10, %r11,%r13, and %r15 are call saved
- *
- * Input (64 bit):
- * %r3 (%b2) = offset into skb data
- * %r6 (%b5) = return address
- * %r7 (%b6) = skb pointer
- * %r12 = skb data pointer
- *
- * Output:
- * %r14= %b0 = return value (read skb value)
- *
- * Work registers: %r2,%r4,%r5,%r14
- *
- * skb_copy_bits takes 4 parameters:
- * %r2 = skb pointer
- * %r3 = offset into skb data
- * %r4 = pointer to temp buffer
- * %r5 = length to copy
- * Return value in %r2: 0 = ok
- *
- * bpf_internal_load_pointer_neg_helper takes 3 parameters:
- * %r2 = skb pointer
- * %r3 = offset into data
- * %r4 = length to copy
- * Return value in %r2: Pointer to data
- */
-
-#define SKF_MAX_NEG_OFF -0x200000 /* SKF_LL_OFF from filter.h */
-
-/*
- * Load SIZE bytes from SKB
- */
-#define sk_load_common(NAME, SIZE, LOAD) \
-ENTRY(sk_load_##NAME); \
- ltgr %r3,%r3; /* Is offset negative? */ \
- jl sk_load_##NAME##_slow_neg; \
-ENTRY(sk_load_##NAME##_pos); \
- aghi %r3,SIZE; /* Offset + SIZE */ \
- clg %r3,STK_OFF_HLEN(%r15); /* Offset + SIZE > hlen? */ \
- jh sk_load_##NAME##_slow; \
- LOAD %r14,-SIZE(%r3,%r12); /* Get data from skb */ \
- b OFF_OK(%r6); /* Return */ \
- \
-sk_load_##NAME##_slow:; \
- lgr %r2,%r7; /* Arg1 = skb pointer */ \
- aghi %r3,-SIZE; /* Arg2 = offset */ \
- la %r4,STK_OFF_TMP(%r15); /* Arg3 = temp bufffer */ \
- lghi %r5,SIZE; /* Arg4 = size */ \
- brasl %r14,skb_copy_bits; /* Get data from skb */ \
- LOAD %r14,STK_OFF_TMP(%r15); /* Load from temp bufffer */ \
- ltgr %r2,%r2; /* Set cc to (%r2 != 0) */ \
- br %r6; /* Return */
-
-sk_load_common(word, 4, llgf) /* r14 = *(u32 *) (skb->data+offset) */
-sk_load_common(half, 2, llgh) /* r14 = *(u16 *) (skb->data+offset) */
-
-/*
- * Load 1 byte from SKB (optimized version)
- */
- /* r14 = *(u8 *) (skb->data+offset) */
-ENTRY(sk_load_byte)
- ltgr %r3,%r3 # Is offset negative?
- jl sk_load_byte_slow_neg
-ENTRY(sk_load_byte_pos)
- clg %r3,STK_OFF_HLEN(%r15) # Offset >= hlen?
- jnl sk_load_byte_slow
- llgc %r14,0(%r3,%r12) # Get byte from skb
- b OFF_OK(%r6) # Return OK
-
-sk_load_byte_slow:
- lgr %r2,%r7 # Arg1 = skb pointer
- # Arg2 = offset
- la %r4,STK_OFF_TMP(%r15) # Arg3 = pointer to temp buffer
- lghi %r5,1 # Arg4 = size (1 byte)
- brasl %r14,skb_copy_bits # Get data from skb
- llgc %r14,STK_OFF_TMP(%r15) # Load result from temp buffer
- ltgr %r2,%r2 # Set cc to (%r2 != 0)
- br %r6 # Return cc
-
-#define sk_negative_common(NAME, SIZE, LOAD) \
-sk_load_##NAME##_slow_neg:; \
- cgfi %r3,SKF_MAX_NEG_OFF; \
- jl bpf_error; \
- lgr %r2,%r7; /* Arg1 = skb pointer */ \
- /* Arg2 = offset */ \
- lghi %r4,SIZE; /* Arg3 = size */ \
- brasl %r14,bpf_internal_load_pointer_neg_helper; \
- ltgr %r2,%r2; \
- jz bpf_error; \
- LOAD %r14,0(%r2); /* Get data from pointer */ \
- xr %r3,%r3; /* Set cc to zero */ \
- br %r6; /* Return cc */
-
-sk_negative_common(word, 4, llgf)
-sk_negative_common(half, 2, llgh)
-sk_negative_common(byte, 1, llgc)
-
-bpf_error:
-# force a return 0 from jit handler
- ltgr %r15,%r15 # Set condition code
- br %r6
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
index 5e1e513..7822ea9 100644
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -16,9 +16,6 @@
#include <linux/filter.h>
#include <linux/types.h>
-extern u8 sk_load_word_pos[], sk_load_half_pos[], sk_load_byte_pos[];
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-
#endif /* __ASSEMBLY__ */
/*
@@ -36,15 +33,6 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
* | | |
* | BPF stack | |
* | | |
- * +---------------+ |
- * | 8 byte skbp | |
- * R15+176 -> +---------------+ |
- * | 8 byte hlen | |
- * R15+168 -> +---------------+ |
- * | 4 byte align | |
- * +---------------+ |
- * | 4 byte temp | |
- * | for bpf_jit.S | |
* R15+160 -> +---------------+ |
* | new backchain | |
* R15+152 -> +---------------+ |
@@ -57,17 +45,11 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
* The stack size used by the BPF program ("BPF stack" above) is passed
* via "aux->stack_depth".
*/
-#define STK_SPACE_ADD (8 + 8 + 4 + 4 + 160)
+#define STK_SPACE_ADD (160)
#define STK_160_UNUSED (160 - 12 * 8)
#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED)
-#define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */
-#define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */
-#define STK_OFF_SKBP 176 /* Offset of SKB pointer on stack */
#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */
#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */
-/* Offset to skip condition code check */
-#define OFF_OK 4
-
#endif /* __ARCH_S390_NET_BPF_JIT_H */
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 78a19c9..b020bea 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -47,23 +47,21 @@ struct bpf_jit {
#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */
-#define SEEN_SKB 1 /* skb access */
-#define SEEN_MEM 2 /* use mem[] for temporary storage */
-#define SEEN_RET0 4 /* ret0_ip points to a valid return 0 */
-#define SEEN_LITERAL 8 /* code uses literals */
-#define SEEN_FUNC 16 /* calls C functions */
-#define SEEN_TAIL_CALL 32 /* code uses tail calls */
-#define SEEN_REG_AX 64 /* code uses constant blinding */
-#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
+#define SEEN_MEM (1 << 0) /* use mem[] for temporary storage */
+#define SEEN_RET0 (1 << 1) /* ret0_ip points to a valid return 0 */
+#define SEEN_LITERAL (1 << 2) /* code uses literals */
+#define SEEN_FUNC (1 << 3) /* calls C functions */
+#define SEEN_TAIL_CALL (1 << 4) /* code uses tail calls */
+#define SEEN_REG_AX (1 << 5) /* code uses constant blinding */
+#define SEEN_STACK (SEEN_FUNC | SEEN_MEM)
/*
* s390 registers
*/
#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */
#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */
-#define REG_SKB_DATA (MAX_BPF_JIT_REG + 2) /* SKB data register */
-#define REG_L (MAX_BPF_JIT_REG + 3) /* Literal pool register */
-#define REG_15 (MAX_BPF_JIT_REG + 4) /* Register 15 */
+#define REG_L (MAX_BPF_JIT_REG + 2) /* Literal pool register */
+#define REG_15 (MAX_BPF_JIT_REG + 3) /* Register 15 */
#define REG_0 REG_W0 /* Register 0 */
#define REG_1 REG_W1 /* Register 1 */
#define REG_2 BPF_REG_1 /* Register 2 */
@@ -88,10 +86,8 @@ static const int reg2hex[] = {
[BPF_REG_9] = 10,
/* BPF stack pointer */
[BPF_REG_FP] = 13,
- /* Register for blinding (shared with REG_SKB_DATA) */
+ /* Register for blinding */
[BPF_REG_AX] = 12,
- /* SKB data pointer */
- [REG_SKB_DATA] = 12,
/* Work registers for s390x backend */
[REG_W0] = 0,
[REG_W1] = 1,
@@ -385,27 +381,6 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
}
/*
- * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S"
- * we store the SKB header length on the stack and the SKB data
- * pointer in REG_SKB_DATA if BPF_REG_AX is not used.
- */
-static void emit_load_skb_data_hlen(struct bpf_jit *jit)
-{
- /* Header length: llgf %w1,<len>(%b1) */
- EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1,
- offsetof(struct sk_buff, len));
- /* s %w1,<data_len>(%b1) */
- EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1,
- offsetof(struct sk_buff, data_len));
- /* stg %w1,ST_OFF_HLEN(%r0,%r15) */
- EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN);
- if (!(jit->seen & SEEN_REG_AX))
- /* lg %skb_data,data_off(%b1) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
- BPF_REG_1, offsetof(struct sk_buff, data));
-}
-
-/*
* Emit function prologue
*
* Save registers and create stack frame if necessary.
@@ -445,12 +420,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
REG_15, 152);
}
- if (jit->seen & SEEN_SKB) {
- emit_load_skb_data_hlen(jit);
- /* stg %b1,ST_OFF_SKBP(%r0,%r15) */
- EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
- STK_OFF_SKBP);
- }
}
/*
@@ -483,12 +452,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
{
struct bpf_insn *insn = &fp->insnsi[i];
int jmp_off, last, insn_count = 1;
- unsigned int func_addr, mask;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
u32 *addrs = jit->addrs;
s32 imm = insn->imm;
s16 off = insn->off;
+ unsigned int mask;
if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
jit->seen |= SEEN_REG_AX;
@@ -970,13 +939,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
- if ((jit->seen & SEEN_SKB) &&
- bpf_helper_changes_pkt_data((void *)func)) {
- /* lg %b1,ST_OFF_SKBP(%r15) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
- REG_15, STK_OFF_SKBP);
- emit_load_skb_data_hlen(jit);
- }
break;
}
case BPF_JMP | BPF_TAIL_CALL:
@@ -1176,73 +1138,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4);
EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off);
break;
- /*
- * BPF_LD
- */
- case BPF_LD | BPF_ABS | BPF_B: /* b0 = *(u8 *) (skb->data+imm) */
- case BPF_LD | BPF_IND | BPF_B: /* b0 = *(u8 *) (skb->data+imm+src) */
- if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
- func_addr = __pa(sk_load_byte_pos);
- else
- func_addr = __pa(sk_load_byte);
- goto call_fn;
- case BPF_LD | BPF_ABS | BPF_H: /* b0 = *(u16 *) (skb->data+imm) */
- case BPF_LD | BPF_IND | BPF_H: /* b0 = *(u16 *) (skb->data+imm+src) */
- if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
- func_addr = __pa(sk_load_half_pos);
- else
- func_addr = __pa(sk_load_half);
- goto call_fn;
- case BPF_LD | BPF_ABS | BPF_W: /* b0 = *(u32 *) (skb->data+imm) */
- case BPF_LD | BPF_IND | BPF_W: /* b0 = *(u32 *) (skb->data+imm+src) */
- if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
- func_addr = __pa(sk_load_word_pos);
- else
- func_addr = __pa(sk_load_word);
- goto call_fn;
-call_fn:
- jit->seen |= SEEN_SKB | SEEN_RET0 | SEEN_FUNC;
- REG_SET_SEEN(REG_14); /* Return address of possible func call */
-
- /*
- * Implicit input:
- * BPF_REG_6 (R7) : skb pointer
- * REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX)
- *
- * Calculated input:
- * BPF_REG_2 (R3) : offset of byte(s) to fetch in skb
- * BPF_REG_5 (R6) : return address
- *
- * Output:
- * BPF_REG_0 (R14): data read from skb
- *
- * Scratch registers (BPF_REG_1-5)
- */
-
- /* Call function: llilf %w1,func_addr */
- EMIT6_IMM(0xc00f0000, REG_W1, func_addr);
-
- /* Offset: lgfi %b2,imm */
- EMIT6_IMM(0xc0010000, BPF_REG_2, imm);
- if (BPF_MODE(insn->code) == BPF_IND)
- /* agfr %b2,%src (%src is s32 here) */
- EMIT4(0xb9180000, BPF_REG_2, src_reg);
-
- /* Reload REG_SKB_DATA if BPF_REG_AX is used */
- if (jit->seen & SEEN_REG_AX)
- /* lg %skb_data,data_off(%b6) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
- BPF_REG_6, offsetof(struct sk_buff, data));
- /* basr %b5,%w1 (%b5 is call saved) */
- EMIT2(0x0d00, BPF_REG_5, REG_W1);
-
- /*
- * Note: For fast access we jump directly after the
- * jnz instruction from bpf_jit.S
- */
- /* jnz <ret0> */
- EMIT4_PCREL(0xa7740000, jit->ret0_ip - jit->prg);
- break;
default: /* too complex, give up */
pr_err("Unknown opcode %02x\n", insn->code);
return -1;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next 07/12] bpf, sparc64: remove ld_abs/ld_ind
From: Daniel Borkmann @ 2018-05-03 1:05 UTC (permalink / raw)
To: ast; +Cc: netdev, Daniel Borkmann, David S . Miller
In-Reply-To: <20180503010536.7917-1-daniel@iogearbox.net>
Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from sparc64 JIT.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
arch/sparc/net/Makefile | 5 +-
arch/sparc/net/bpf_jit_64.h | 29 -------
arch/sparc/net/bpf_jit_asm_64.S | 162 ---------------------------------------
arch/sparc/net/bpf_jit_comp_64.c | 79 +------------------
4 files changed, 6 insertions(+), 269 deletions(-)
delete mode 100644 arch/sparc/net/bpf_jit_asm_64.S
diff --git a/arch/sparc/net/Makefile b/arch/sparc/net/Makefile
index 76fa8e9..d32aac3 100644
--- a/arch/sparc/net/Makefile
+++ b/arch/sparc/net/Makefile
@@ -1,4 +1,7 @@
#
# Arch-specific network modules
#
-obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_$(BITS).o bpf_jit_comp_$(BITS).o
+obj-$(CONFIG_BPF_JIT) += bpf_jit_comp_$(BITS).o
+ifeq ($(BITS),32)
+obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_32.o
+endif
diff --git a/arch/sparc/net/bpf_jit_64.h b/arch/sparc/net/bpf_jit_64.h
index 428f7fd..fbc836f 100644
--- a/arch/sparc/net/bpf_jit_64.h
+++ b/arch/sparc/net/bpf_jit_64.h
@@ -33,35 +33,6 @@
#define I5 0x1d
#define FP 0x1e
#define I7 0x1f
-
-#define r_SKB L0
-#define r_HEADLEN L4
-#define r_SKB_DATA L5
-#define r_TMP G1
-#define r_TMP2 G3
-
-/* assembly code in arch/sparc/net/bpf_jit_asm_64.S */
-extern u32 bpf_jit_load_word[];
-extern u32 bpf_jit_load_half[];
-extern u32 bpf_jit_load_byte[];
-extern u32 bpf_jit_load_byte_msh[];
-extern u32 bpf_jit_load_word_positive_offset[];
-extern u32 bpf_jit_load_half_positive_offset[];
-extern u32 bpf_jit_load_byte_positive_offset[];
-extern u32 bpf_jit_load_byte_msh_positive_offset[];
-extern u32 bpf_jit_load_word_negative_offset[];
-extern u32 bpf_jit_load_half_negative_offset[];
-extern u32 bpf_jit_load_byte_negative_offset[];
-extern u32 bpf_jit_load_byte_msh_negative_offset[];
-
-#else
-#define r_RESULT %o0
-#define r_SKB %o0
-#define r_OFF %o1
-#define r_HEADLEN %l4
-#define r_SKB_DATA %l5
-#define r_TMP %g1
-#define r_TMP2 %g3
#endif
#endif /* _BPF_JIT_H */
diff --git a/arch/sparc/net/bpf_jit_asm_64.S b/arch/sparc/net/bpf_jit_asm_64.S
deleted file mode 100644
index 7177867..0000000
--- a/arch/sparc/net/bpf_jit_asm_64.S
+++ /dev/null
@@ -1,162 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <asm/ptrace.h>
-
-#include "bpf_jit_64.h"
-
-#define SAVE_SZ 176
-#define SCRATCH_OFF STACK_BIAS + 128
-#define BE_PTR(label) be,pn %xcc, label
-#define SIGN_EXTEND(reg) sra reg, 0, reg
-
-#define SKF_MAX_NEG_OFF (-0x200000) /* SKF_LL_OFF from filter.h */
-
- .text
- .globl bpf_jit_load_word
-bpf_jit_load_word:
- cmp r_OFF, 0
- bl bpf_slow_path_word_neg
- nop
- .globl bpf_jit_load_word_positive_offset
-bpf_jit_load_word_positive_offset:
- sub r_HEADLEN, r_OFF, r_TMP
- cmp r_TMP, 3
- ble bpf_slow_path_word
- add r_SKB_DATA, r_OFF, r_TMP
- andcc r_TMP, 3, %g0
- bne load_word_unaligned
- nop
- retl
- ld [r_TMP], r_RESULT
-load_word_unaligned:
- ldub [r_TMP + 0x0], r_OFF
- ldub [r_TMP + 0x1], r_TMP2
- sll r_OFF, 8, r_OFF
- or r_OFF, r_TMP2, r_OFF
- ldub [r_TMP + 0x2], r_TMP2
- sll r_OFF, 8, r_OFF
- or r_OFF, r_TMP2, r_OFF
- ldub [r_TMP + 0x3], r_TMP2
- sll r_OFF, 8, r_OFF
- retl
- or r_OFF, r_TMP2, r_RESULT
-
- .globl bpf_jit_load_half
-bpf_jit_load_half:
- cmp r_OFF, 0
- bl bpf_slow_path_half_neg
- nop
- .globl bpf_jit_load_half_positive_offset
-bpf_jit_load_half_positive_offset:
- sub r_HEADLEN, r_OFF, r_TMP
- cmp r_TMP, 1
- ble bpf_slow_path_half
- add r_SKB_DATA, r_OFF, r_TMP
- andcc r_TMP, 1, %g0
- bne load_half_unaligned
- nop
- retl
- lduh [r_TMP], r_RESULT
-load_half_unaligned:
- ldub [r_TMP + 0x0], r_OFF
- ldub [r_TMP + 0x1], r_TMP2
- sll r_OFF, 8, r_OFF
- retl
- or r_OFF, r_TMP2, r_RESULT
-
- .globl bpf_jit_load_byte
-bpf_jit_load_byte:
- cmp r_OFF, 0
- bl bpf_slow_path_byte_neg
- nop
- .globl bpf_jit_load_byte_positive_offset
-bpf_jit_load_byte_positive_offset:
- cmp r_OFF, r_HEADLEN
- bge bpf_slow_path_byte
- nop
- retl
- ldub [r_SKB_DATA + r_OFF], r_RESULT
-
-#define bpf_slow_path_common(LEN) \
- save %sp, -SAVE_SZ, %sp; \
- mov %i0, %o0; \
- mov %i1, %o1; \
- add %fp, SCRATCH_OFF, %o2; \
- call skb_copy_bits; \
- mov (LEN), %o3; \
- cmp %o0, 0; \
- restore;
-
-bpf_slow_path_word:
- bpf_slow_path_common(4)
- bl bpf_error
- ld [%sp + SCRATCH_OFF], r_RESULT
- retl
- nop
-bpf_slow_path_half:
- bpf_slow_path_common(2)
- bl bpf_error
- lduh [%sp + SCRATCH_OFF], r_RESULT
- retl
- nop
-bpf_slow_path_byte:
- bpf_slow_path_common(1)
- bl bpf_error
- ldub [%sp + SCRATCH_OFF], r_RESULT
- retl
- nop
-
-#define bpf_negative_common(LEN) \
- save %sp, -SAVE_SZ, %sp; \
- mov %i0, %o0; \
- mov %i1, %o1; \
- SIGN_EXTEND(%o1); \
- call bpf_internal_load_pointer_neg_helper; \
- mov (LEN), %o2; \
- mov %o0, r_TMP; \
- cmp %o0, 0; \
- BE_PTR(bpf_error); \
- restore;
-
-bpf_slow_path_word_neg:
- sethi %hi(SKF_MAX_NEG_OFF), r_TMP
- cmp r_OFF, r_TMP
- bl bpf_error
- nop
- .globl bpf_jit_load_word_negative_offset
-bpf_jit_load_word_negative_offset:
- bpf_negative_common(4)
- andcc r_TMP, 3, %g0
- bne load_word_unaligned
- nop
- retl
- ld [r_TMP], r_RESULT
-
-bpf_slow_path_half_neg:
- sethi %hi(SKF_MAX_NEG_OFF), r_TMP
- cmp r_OFF, r_TMP
- bl bpf_error
- nop
- .globl bpf_jit_load_half_negative_offset
-bpf_jit_load_half_negative_offset:
- bpf_negative_common(2)
- andcc r_TMP, 1, %g0
- bne load_half_unaligned
- nop
- retl
- lduh [r_TMP], r_RESULT
-
-bpf_slow_path_byte_neg:
- sethi %hi(SKF_MAX_NEG_OFF), r_TMP
- cmp r_OFF, r_TMP
- bl bpf_error
- nop
- .globl bpf_jit_load_byte_negative_offset
-bpf_jit_load_byte_negative_offset:
- bpf_negative_common(1)
- retl
- ldub [r_TMP], r_RESULT
-
-bpf_error:
- /* Make the JIT program itself return zero. */
- ret
- restore %g0, %g0, %o0
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 48a2586..9f5918e 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -48,10 +48,6 @@ static void bpf_flush_icache(void *start_, void *end_)
}
}
-#define SEEN_DATAREF 1 /* might call external helpers */
-#define SEEN_XREG 2 /* ebx is used */
-#define SEEN_MEM 4 /* use mem[] for temporary storage */
-
#define S13(X) ((X) & 0x1fff)
#define S5(X) ((X) & 0x1f)
#define IMMED 0x00002000
@@ -198,7 +194,6 @@ struct jit_ctx {
bool tmp_1_used;
bool tmp_2_used;
bool tmp_3_used;
- bool saw_ld_abs_ind;
bool saw_frame_pointer;
bool saw_call;
bool saw_tail_call;
@@ -207,9 +202,7 @@ struct jit_ctx {
#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
-#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 2)
-#define SKB_DATA_REG (MAX_BPF_JIT_REG + 3)
-#define TMP_REG_3 (MAX_BPF_JIT_REG + 4)
+#define TMP_REG_3 (MAX_BPF_JIT_REG + 2)
/* Map BPF registers to SPARC registers */
static const int bpf2sparc[] = {
@@ -238,9 +231,6 @@ static const int bpf2sparc[] = {
[TMP_REG_1] = G1,
[TMP_REG_2] = G2,
[TMP_REG_3] = G3,
-
- [SKB_HLEN_REG] = L4,
- [SKB_DATA_REG] = L5,
};
static void emit(const u32 insn, struct jit_ctx *ctx)
@@ -800,25 +790,6 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src,
return 0;
}
-static void load_skb_regs(struct jit_ctx *ctx, u8 r_skb)
-{
- const u8 r_headlen = bpf2sparc[SKB_HLEN_REG];
- const u8 r_data = bpf2sparc[SKB_DATA_REG];
- const u8 r_tmp = bpf2sparc[TMP_REG_1];
- unsigned int off;
-
- off = offsetof(struct sk_buff, len);
- emit(LD32I | RS1(r_skb) | S13(off) | RD(r_headlen), ctx);
-
- off = offsetof(struct sk_buff, data_len);
- emit(LD32I | RS1(r_skb) | S13(off) | RD(r_tmp), ctx);
-
- emit(SUB | RS1(r_headlen) | RS2(r_tmp) | RD(r_headlen), ctx);
-
- off = offsetof(struct sk_buff, data);
- emit(LDPTRI | RS1(r_skb) | S13(off) | RD(r_data), ctx);
-}
-
/* Just skip the save instruction and the ctx register move. */
#define BPF_TAILCALL_PROLOGUE_SKIP 16
#define BPF_TAILCALL_CNT_SP_OFF (STACK_BIAS + 128)
@@ -857,9 +828,6 @@ static void build_prologue(struct jit_ctx *ctx)
emit_reg_move(I0, O0, ctx);
/* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */
-
- if (ctx->saw_ld_abs_ind)
- load_skb_regs(ctx, bpf2sparc[BPF_REG_1]);
}
static void build_epilogue(struct jit_ctx *ctx)
@@ -1225,16 +1193,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
u8 *func = ((u8 *)__bpf_call_base) + imm;
ctx->saw_call = true;
- if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
- emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
emit_call((u32 *)func, ctx);
emit_nop(ctx);
emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
-
- if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
- load_skb_regs(ctx, L7);
break;
}
@@ -1412,43 +1375,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
emit_nop(ctx);
break;
}
-#define CHOOSE_LOAD_FUNC(K, func) \
- ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
- /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
- case BPF_LD | BPF_ABS | BPF_W:
- func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_word);
- goto common_load;
- case BPF_LD | BPF_ABS | BPF_H:
- func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_half);
- goto common_load;
- case BPF_LD | BPF_ABS | BPF_B:
- func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_byte);
- goto common_load;
- /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
- case BPF_LD | BPF_IND | BPF_W:
- func = bpf_jit_load_word;
- goto common_load;
- case BPF_LD | BPF_IND | BPF_H:
- func = bpf_jit_load_half;
- goto common_load;
-
- case BPF_LD | BPF_IND | BPF_B:
- func = bpf_jit_load_byte;
- common_load:
- ctx->saw_ld_abs_ind = true;
-
- emit_reg_move(bpf2sparc[BPF_REG_6], O0, ctx);
- emit_loadimm(imm, O1, ctx);
-
- if (BPF_MODE(code) == BPF_IND)
- emit_alu(ADD, src, O1, ctx);
-
- emit_call(func, ctx);
- emit_alu_K(SRA, O1, 0, ctx);
-
- emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
- break;
default:
pr_err_once("unknown opcode %02x\n", code);
@@ -1583,12 +1509,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
build_epilogue(&ctx);
if (bpf_jit_enable > 1)
- pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c%c]\n", pass,
+ pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c]\n", pass,
image_size - (ctx.idx * 4),
ctx.tmp_1_used ? '1' : ' ',
ctx.tmp_2_used ? '2' : ' ',
ctx.tmp_3_used ? '3' : ' ',
- ctx.saw_ld_abs_ind ? 'L' : ' ',
ctx.saw_frame_pointer ? 'F' : ' ',
ctx.saw_call ? 'C' : ' ',
ctx.saw_tail_call ? 'T' : ' ');
--
2.9.5
^ permalink raw reply related
* Re: [PATCH] sctp: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-03 1:07 UTC (permalink / raw)
To: Marcelo Ricardo Leitner
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list, Wenwen Wang
In-Reply-To: <20180502232352.GJ5105@localhost.localdomain>
Hi Marcelo,
I guess I worked on an old version of the kernel. I will re-submit the
patch. Sorry :(
Wenwen
On Wed, May 2, 2018 at 6:23 PM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> Hi Wenwen,
>
> On Wed, May 02, 2018 at 05:12:45PM -0500, Wenwen Wang wrote:
>> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
>> and max_len to check whether it is in the appropriate range. If it is not,
>> an error code -EINVAL will be returned. This is enforced by a security
>> check. But, this check is only executed when 'val' is not 0. In fact, if
>
> Which makes sense, no? Especially if considering that 0 should be an
> allowed value as it turns off the user limit.
>
>> 'val' is 0, it will be assigned with a new value (if the return value of
>> the function sctp_id2assoc() is not 0) in the following execution. However,
>> this new value of 'val' is not checked before it is used to assigned to
>
> Which 'new value'? val is not set to something new during the
> function. It always contains the user supplied value.
>
>> asoc->user_frag. That means it is possible that the new value of 'val'
>> could be out of the expected range. This can cause security issues
>> such as buffer overflows, e.g., the new value of 'val' is used as an index
>> to access a buffer.
>>
>> This patch inserts a check for the new value of 'val' to see if it is in
>> the expected range. If it is not, an error code -EINVAL will be returned.
>>
>> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
>> ---
>> net/sctp/socket.c | 21 ++++++++++-----------
>> 1 file changed, 10 insertions(+), 11 deletions(-)
>>
>> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
>> index 80835ac..2beb601 100644
>> --- a/net/sctp/socket.c
>> +++ b/net/sctp/socket.c
>> @@ -3212,6 +3212,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
>> struct sctp_af *af = sp->pf->af;
>> struct sctp_assoc_value params;
>> struct sctp_association *asoc;
>> + int min_len, max_len;
>> int val;
>>
>> if (optlen == sizeof(int)) {
>> @@ -3231,19 +3232,15 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
>> return -EINVAL;
>> }
>>
>> - if (val) {
>> - int min_len, max_len;
>> + min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
>> + min_len -= af->ip_options_len(sk);
>> + min_len -= sizeof(struct sctphdr) +
>> + sizeof(struct sctp_data_chunk);
>
> On which tree did you base your patch on? Your patch lacks a tag so it
> defaults to net-next, and I reworked this section on current net-next
> and these MTU calculations are now handled by sctp_mtu_payload().
>
> But even for net tree, I don't understand which issue you're fixing
> here. Actually, it seems to me that both versions of the code do the same
> thing.
>
>>
>> - min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
>> - min_len -= af->ip_options_len(sk);
>> - min_len -= sizeof(struct sctphdr) +
>> - sizeof(struct sctp_data_chunk);
>> + max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
>>
>> - max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
>> -
>> - if (val < min_len || val > max_len)
>> - return -EINVAL;
>> - }
>> + if (val && (val < min_len || val > max_len))
>> + return -EINVAL;
>>
>> asoc = sctp_id2assoc(sk, params.assoc_id);
>> if (asoc) {
>> @@ -3253,6 +3250,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
>> val -= sizeof(struct sctphdr) +
>> sctp_datachk_len(&asoc->stream);
>> }
>> + if (val < min_len || val > max_len)
>> + return -EINVAL;
>> asoc->user_frag = val;
>> asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
>> } else {
>> --
>> 2.7.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
^ permalink raw reply
* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-05-03 1:11 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180502184015-mutt-send-email-mst@kernel.org>
On Wed, May 02, 2018 at 06:42:57PM +0300, Michael S. Tsirkin wrote:
> On Wed, May 02, 2018 at 11:12:55PM +0800, Tiwei Bie wrote:
> > On Wed, May 02, 2018 at 04:51:01PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, May 02, 2018 at 03:28:19PM +0800, Tiwei Bie wrote:
> > > > On Wed, May 02, 2018 at 10:51:06AM +0800, Jason Wang wrote:
> > > > > On 2018年04月25日 13:15, Tiwei Bie wrote:
> > > > > > This commit introduces the event idx support in packed
> > > > > > ring. This feature is temporarily disabled, because the
> > > > > > implementation in this patch may not work as expected,
> > > > > > and some further discussions on the implementation are
> > > > > > needed, e.g. do we have to check the wrap counter when
> > > > > > checking whether a kick is needed?
> > > > > >
> > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > ---
> > > > > > drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
> > > > > > 1 file changed, 49 insertions(+), 4 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 0181e93897be..b1039c2985b9 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > {
> > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > - u16 flags;
> > > > > > + u16 new, old, off_wrap, flags;
> > > > > > bool needs_kick;
> > > > > > u32 snapshot;
> > > > > > @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > * suppressions. */
> > > > > > virtio_mb(vq->weak_barriers);
> > > > > > + old = vq->next_avail_idx - vq->num_added;
> > > > > > + new = vq->next_avail_idx;
> > > > > > + vq->num_added = 0;
> > > > > > +
> > > > > > snapshot = *(u32 *)vq->vring_packed.device;
> > > > > > + off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
> > > > > > flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
> > > > > > #ifdef DEBUG
> > > > > > @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > vq->last_add_time_valid = false;
> > > > > > #endif
> > > > > > - needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > + if (flags == VRING_EVENT_F_DESC)
> > > > > > + needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > >
> > > > > I wonder whether or not the math is correct. Both new and event are in the
> > > > > unit of descriptor ring size, but old looks not.
> > > >
> > > > What vring_need_event() cares is the distance between
> > > > `new` and `old`, i.e. vq->num_added. So I think there
> > > > is nothing wrong with `old`. But the calculation of the
> > > > distance between `new` and `event_idx` isn't right when
> > > > `new` wraps. How do you think about the below code:
> > > >
> > > > wrap_counter = off_wrap >> 15;
> > > > event_idx = off_wrap & ~(1<<15);
> > > > if (wrap_counter != vq->wrap_counter)
> > > > event_idx -= vq->vring_packed.num;
> > > >
> > > > needs_kick = vring_need_event(event_idx, new, old);
> > >
> > > I suspect this hack won't work for non power of 2 ring.
> >
> > Above code doesn't require the ring size to be a power of 2.
> >
> > For (__u16)(new_idx - old), what we want to get is vq->num_added.
> >
> > old = vq->next_avail_idx - vq->num_added;
> > new = vq->next_avail_idx;
> >
> > When vq->next_avail_idx >= vq->num_added, it's obvious that,
> > (__u16)(new_idx - old) is vq->num_added.
> >
> > And when vq->next_avail_idx < vq->num_added, new will be smaller
> > than old (old will be a big unsigned number), but (__u16)(new_idx
> > - old) is still vq->num_added.
> >
> > For (__u16)(new_idx - event_idx - 1), when new wraps and event_idx
> > doesn't wrap, the most straightforward way to calculate it is:
> > (new + vq->vring_packed.num) - event_idx - 1.
>
> So how about we use the straightforward way then?
You mean we do new += vq->vring_packed.num instead
of event_idx -= vq->vring_packed.num before calling
vring_need_event()?
The problem is that, the second param (new_idx) of
vring_need_event() will be used for:
(__u16)(new_idx - event_idx - 1)
(__u16)(new_idx - old)
So if we change new, we will need to change old too.
And that would be an ugly hack..
Best regards,
Tiwei Bie
>
> > But we can also calculate it in this way:
> >
> > event_idx -= vq->vring_packed.num;
> > (event_idx will be a big unsigned number)
> >
> > Then (__u16)(new_idx - event_idx - 1) will be the value we want.
> >
> > Best regards,
> > Tiwei Bie
>
>
> > >
> > >
> > > > Best regards,
> > > > Tiwei Bie
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > > + else
> > > > > > + needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > END_USE(vq);
> > > > > > return needs_kick;
> > > > > > }
> > > > > > @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > if (vq->last_used_idx >= vq->vring_packed.num)
> > > > > > vq->last_used_idx -= vq->vring_packed.num;
> > > > > > + /* If we expect an interrupt for the next entry, tell host
> > > > > > + * by writing event index and flush out the write before
> > > > > > + * the read in the next get_buf call. */
> > > > > > + if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> > > > > > + virtio_store_mb(vq->weak_barriers,
> > > > > > + &vq->vring_packed.driver->off_wrap,
> > > > > > + cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> > > > > > + (vq->wrap_counter << 15)));
> > > > > > +
> > > > > > #ifdef DEBUG
> > > > > > vq->last_add_time_valid = false;
> > > > > > #endif
> > > > > > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > * more to do. */
> > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > +
> > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > + vq->last_used_idx | (vq->wrap_counter << 15));
> > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > vq->event_flags_shadow);
> > > > > > }
> > > > > > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > > > > > static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > > > > > {
> > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > + u16 bufs, used_idx, wrap_counter;
> > > > > > START_USE(vq);
> > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > * more to do. */
> > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > +
> > > > > > + /* TODO: tune this threshold */
> > > > > > + bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > > > > > +
> > > > > > + used_idx = vq->last_used_idx + bufs;
> > > > > > + wrap_counter = vq->wrap_counter;
> > > > > > +
> > > > > > + if (used_idx >= vq->vring_packed.num) {
> > > > > > + used_idx -= vq->vring_packed.num;
> > > > > > + wrap_counter ^= 1;
> > > > > > + }
> > > > > > +
> > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > + used_idx | (wrap_counter << 15));
> > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > vq->event_flags_shadow);
> > > > > > }
> > > > > > @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > > > > > switch (i) {
> > > > > > case VIRTIO_RING_F_INDIRECT_DESC:
> > > > > > break;
> > > > > > +#if 0
> > > > > > case VIRTIO_RING_F_EVENT_IDX:
> > > > > > break;
> > > > > > +#endif
> > > > > > case VIRTIO_F_VERSION_1:
> > > > > > break;
> > > > > > case VIRTIO_F_IOMMU_PLATFORM:
> > > > >
^ permalink raw reply
* [PATCH] sctp: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-03 1:15 UTC (permalink / raw)
To: Wenwen Wang
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list
In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
and max_len to check whether it is in the appropriate range. If it is not,
an error code -EINVAL will be returned. This is enforced by a security
check. But, this check is only executed when 'val' is not 0. In fact, if
'val' is 0, it will be assigned with a new value (if the return value of
the function sctp_id2assoc() is not 0) in the following execution. However,
this new value of 'val' is not checked before it is assigned to
asoc->user_frag. That means it is possible that the new value of 'val'
could be out of the expected range. This can cause security issues
such as buffer overflows, e.g., the new value of 'val' is used as an index
to access a buffer.
This patch inserts a check for the new value of 'val' to see if it is in
the expected range. If it is not, an error code -EINVAL will be returned.
Signed-off-by: Wenwen Wang <wang6495@umn.edu>
---
net/sctp/socket.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 80835ac..03e1cc3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3212,6 +3212,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
struct sctp_af *af = sp->pf->af;
struct sctp_assoc_value params;
struct sctp_association *asoc;
+ int min_len, max_len;
int val;
if (optlen == sizeof(int)) {
@@ -3231,19 +3232,15 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
return -EINVAL;
}
- if (val) {
- int min_len, max_len;
+ min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
+ min_len -= af->ip_options_len(sk);
+ min_len -= sizeof(struct sctphdr) +
+ sizeof(struct sctp_data_chunk);
- min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
- min_len -= af->ip_options_len(sk);
- min_len -= sizeof(struct sctphdr) +
- sizeof(struct sctp_data_chunk);
+ max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
- max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
-
- if (val < min_len || val > max_len)
- return -EINVAL;
- }
+ if (val && (val < min_len || val > max_len))
+ return -EINVAL;
asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
@@ -3253,6 +3250,9 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
val -= sizeof(struct sctphdr) +
sctp_datachk_len(&asoc->stream);
}
+ /* Check the new val to make sure it is in the range. */
+ if (val < min_len || val > max_len)
+ return -EINVAL;
asoc->user_frag = val;
asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
} else {
--
2.7.4
^ permalink raw reply related
* Re: [PATCH] NET/netlink: optimize output of seq_puts in af_netlink.c
From: YU Bo @ 2018-05-03 1:16 UTC (permalink / raw)
To: David Miller; +Cc: xiyou.wangcong, yuzibode, netdev, kernel-janitors
In-Reply-To: <20180502.101943.1124614050601030061.davem@davemloft.net>
Hi,
On Wed, May 02, 2018 at 10:19:43AM -0400, David Miller wrote:
>From: Bo YU <tsu.yubo@gmail.com>
>Date: Wed, 2 May 2018 05:54:24 -0400
>
>> Optimization of command output: `cat /proc/net/netlink`
>>
>> After the patch, we will get:
>>
>> https://clbin.com/lnu4L
>>
>> Signed-off-by: Bo YU <tsu.yubo@gmail.com>
>> ---
>> net/netlink/af_netlink.c | 6 +++---
>> 1 file changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
>> index 55342c4d5cec..2e2dd88fc79f 100644
>> --- a/net/netlink/af_netlink.c
>> +++ b/net/netlink/af_netlink.c
>> @@ -2606,13 +2606,13 @@ static int netlink_seq_show(struct seq_file
>> *seq, void *v)
>> {
>> if (v == SEQ_START_TOKEN) {
>> seq_puts(seq,
>> - "sk Eth Pid Groups "
>> - "Rmem Wmem Dump Locks Drops Inode\n");
>> + "sk Eth Pid Groups "
>> + "Rmem Wmem Dump Locks Drops Inode\n");
>
>Please do not break the indentation of the code like this.
Sorry, I am ashamed of having done that. Something went wrong in a different
version of vim that I use. I made the change because checkpatch only told me
"WARNING: quoted string split across lines"
Thank you, I will fix it.
>
>I wish to unfortunately say, that generally speaking, your patch
>submissions are not of the best quality, and take up a lot of reviewer
>time and resources as a result.
>
>If you do not improve the quality of your submissions, I am giving
>you a kind warning that the amount of care and review your patches
>will receive will become lower. Your submissions might even get to
>the point where they are effectively ignored.
>
>So please put more care into your work.
>
>Thank you.
^ permalink raw reply
* Re: [PATCH] sctp: fix a potential missing-check bug
From: Marcelo Ricardo Leitner @ 2018-05-03 1:24 UTC (permalink / raw)
To: Wenwen Wang
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list
In-Reply-To: <1525310145-28102-1-git-send-email-wang6495@umn.edu>
On Wed, May 02, 2018 at 08:15:45PM -0500, Wenwen Wang wrote:
> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
> and max_len to check whether it is in the appropriate range. If it is not,
> an error code -EINVAL will be returned. This is enforced by a security
> check. But, this check is only executed when 'val' is not 0. In fact, if
> 'val' is 0, it will be assigned with a new value (if the return value of
> the function sctp_id2assoc() is not 0) in the following execution. However,
> this new value of 'val' is not checked before it is used to assigned to
> asoc->user_frag. That means it is possible that the new value of 'val'
> could be out of the expected range. This can cause security issues
> such as buffer overflows, e.g., the new value of 'val' is used as an index
> to access a buffer.
>
> This patch inserts a check for the new value of 'val' to see if it is in
> the expected range. If it is not, an error code -EINVAL will be returned.
>
> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
> ---
> net/sctp/socket.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
?
This patch is the same as previous one. git send-email <old file>
maybe?
Marcelo
^ permalink raw reply
* Re: [PATCH] sctp: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-03 1:27 UTC (permalink / raw)
To: Marcelo Ricardo Leitner
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list, Wenwen Wang
In-Reply-To: <20180503012402.GK5105@localhost.localdomain>
On Wed, May 2, 2018 at 8:24 PM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> On Wed, May 02, 2018 at 08:15:45PM -0500, Wenwen Wang wrote:
>> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
>> and max_len to check whether it is in the appropriate range. If it is not,
>> an error code -EINVAL will be returned. This is enforced by a security
>> check. But, this check is only executed when 'val' is not 0. In fact, if
>> 'val' is 0, it will be assigned with a new value (if the return value of
>> the function sctp_id2assoc() is not 0) in the following execution. However,
>> this new value of 'val' is not checked before it is used to assigned to
>> asoc->user_frag. That means it is possible that the new value of 'val'
>> could be out of the expected range. This can cause security issues
>> such as buffer overflows, e.g., the new value of 'val' is used as an index
>> to access a buffer.
>>
>> This patch inserts a check for the new value of 'val' to see if it is in
>> the expected range. If it is not, an error code -EINVAL will be returned.
>>
>> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
>> ---
>> net/sctp/socket.c | 22 +++++++++++-----------
>> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> ?
> This patch is the same as previous one. git send-email <old file>
> maybe?
>
> Marcelo
Thanks for your suggestion, Marcelo. I can send the old file. But, I
have added a line of comment in this patch.
Wenwen
^ permalink raw reply
* [PATCH net-next] ip6_gre: correct the function name in ip6gre_tnl_addr_conflict() comment
From: Sun Lianwen @ 2018-05-03 1:34 UTC (permalink / raw)
To: davem; +Cc: netdev
The function name is wrong in the ip6gre_tnl_addr_conflict() comment, which
uses ip6_tnl_addr_conflict instead of ip6gre_tnl_addr_conflict.
Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
---
net/ipv6/ip6_gre.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 69727bc168cb..4e111da8d453 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -807,7 +807,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
}
/**
- * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
* @t: the outgoing tunnel device
* @hdr: IPv6 header from the incoming packet
*
--
2.17.0
^ permalink raw reply related
* Re: [PATCH bpf-next 07/12] bpf, sparc64: remove ld_abs/ld_ind
From: David Miller @ 2018-05-03 1:39 UTC (permalink / raw)
To: daniel; +Cc: ast, netdev
In-Reply-To: <20180503010536.7917-8-daniel@iogearbox.net>
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 May 2018 03:05:31 +0200
> Since LD_ABS/LD_IND instructions are now removed from the core and
> reimplemented through a combination of inlined BPF instructions and
> a slow-path helper, we can get rid of the complexity from sparc64 JIT.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: David S. Miller <davem@davemloft.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply
* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Michael S. Tsirkin @ 2018-05-03 1:44 UTC (permalink / raw)
To: Tiwei Bie
Cc: Jason Wang, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180503011116.qvoyblcpklinrk26@debian>
On Thu, May 03, 2018 at 09:11:16AM +0800, Tiwei Bie wrote:
> On Wed, May 02, 2018 at 06:42:57PM +0300, Michael S. Tsirkin wrote:
> > On Wed, May 02, 2018 at 11:12:55PM +0800, Tiwei Bie wrote:
> > > On Wed, May 02, 2018 at 04:51:01PM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, May 02, 2018 at 03:28:19PM +0800, Tiwei Bie wrote:
> > > > > On Wed, May 02, 2018 at 10:51:06AM +0800, Jason Wang wrote:
> > > > > > On 2018年04月25日 13:15, Tiwei Bie wrote:
> > > > > > > This commit introduces the event idx support in packed
> > > > > > > ring. This feature is temporarily disabled, because the
> > > > > > > implementation in this patch may not work as expected,
> > > > > > > and some further discussions on the implementation are
> > > > > > > needed, e.g. do we have to check the wrap counter when
> > > > > > > checking whether a kick is needed?
> > > > > > >
> > > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > > ---
> > > > > > > drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
> > > > > > > 1 file changed, 49 insertions(+), 4 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 0181e93897be..b1039c2985b9 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > {
> > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > - u16 flags;
> > > > > > > + u16 new, old, off_wrap, flags;
> > > > > > > bool needs_kick;
> > > > > > > u32 snapshot;
> > > > > > > @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > * suppressions. */
> > > > > > > virtio_mb(vq->weak_barriers);
> > > > > > > + old = vq->next_avail_idx - vq->num_added;
> > > > > > > + new = vq->next_avail_idx;
> > > > > > > + vq->num_added = 0;
> > > > > > > +
> > > > > > > snapshot = *(u32 *)vq->vring_packed.device;
> > > > > > > + off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
> > > > > > > flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
> > > > > > > #ifdef DEBUG
> > > > > > > @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > vq->last_add_time_valid = false;
> > > > > > > #endif
> > > > > > > - needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > + if (flags == VRING_EVENT_F_DESC)
> > > > > > > + needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > > >
> > > > > > I wonder whether or not the math is correct. Both new and event are in the
> > > > > > unit of descriptor ring size, but old looks not.
> > > > >
> > > > > What vring_need_event() cares is the distance between
> > > > > `new` and `old`, i.e. vq->num_added. So I think there
> > > > > is nothing wrong with `old`. But the calculation of the
> > > > > distance between `new` and `event_idx` isn't right when
> > > > > `new` wraps. How do you think about the below code:
> > > > >
> > > > > wrap_counter = off_wrap >> 15;
> > > > > event_idx = off_wrap & ~(1<<15);
> > > > > if (wrap_counter != vq->wrap_counter)
> > > > > event_idx -= vq->vring_packed.num;
> > > > >
> > > > > needs_kick = vring_need_event(event_idx, new, old);
> > > >
> > > > I suspect this hack won't work for non power of 2 ring.
> > >
> > > Above code doesn't require the ring size to be a power of 2.
> > >
> > > For (__u16)(new_idx - old), what we want to get is vq->num_added.
> > >
> > > old = vq->next_avail_idx - vq->num_added;
> > > new = vq->next_avail_idx;
> > >
> > > When vq->next_avail_idx >= vq->num_added, it's obvious that,
> > > (__u16)(new_idx - old) is vq->num_added.
> > >
> > > And when vq->next_avail_idx < vq->num_added, new will be smaller
> > > than old (old will be a big unsigned number), but (__u16)(new_idx
> > > - old) is still vq->num_added.
> > >
> > > For (__u16)(new_idx - event_idx - 1), when new wraps and event_idx
> > > doesn't wrap, the most straightforward way to calculate it is:
> > > (new + vq->vring_packed.num) - event_idx - 1.
> >
> > So how about we use the straightforward way then?
>
> You mean we do new += vq->vring_packed.num instead
> of event_idx -= vq->vring_packed.num before calling
> vring_need_event()?
>
> The problem is that, the second param (new_idx) of
> vring_need_event() will be used for:
>
> (__u16)(new_idx - event_idx - 1)
> (__u16)(new_idx - old)
>
> So if we change new, we will need to change old too.
I think that since we have a branch there anyway,
we are better off just special-casing if (wrap_counter != vq->wrap_counter).
Treat it differently and avoid casts.
> And that would be an ugly hack..
>
> Best regards,
> Tiwei Bie
I consider casts and huge numbers with two's complement
games even uglier.
> >
> > > But we can also calculate it in this way:
> > >
> > > event_idx -= vq->vring_packed.num;
> > > (event_idx will be a big unsigned number)
> > >
> > > Then (__u16)(new_idx - event_idx - 1) will be the value we want.
> > >
> > > Best regards,
> > > Tiwei Bie
> >
> >
> > > >
> > > >
> > > > > Best regards,
> > > > > Tiwei Bie
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > > + else
> > > > > > > + needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > END_USE(vq);
> > > > > > > return needs_kick;
> > > > > > > }
> > > > > > > @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > if (vq->last_used_idx >= vq->vring_packed.num)
> > > > > > > vq->last_used_idx -= vq->vring_packed.num;
> > > > > > > + /* If we expect an interrupt for the next entry, tell host
> > > > > > > + * by writing event index and flush out the write before
> > > > > > > + * the read in the next get_buf call. */
> > > > > > > + if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> > > > > > > + virtio_store_mb(vq->weak_barriers,
> > > > > > > + &vq->vring_packed.driver->off_wrap,
> > > > > > > + cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> > > > > > > + (vq->wrap_counter << 15)));
> > > > > > > +
> > > > > > > #ifdef DEBUG
> > > > > > > vq->last_add_time_valid = false;
> > > > > > > #endif
> > > > > > > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > > * more to do. */
> > > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > > +
> > > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > + vq->last_used_idx | (vq->wrap_counter << 15));
> > > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > vq->event_flags_shadow);
> > > > > > > }
> > > > > > > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > > > > > > static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > > > > > > {
> > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > + u16 bufs, used_idx, wrap_counter;
> > > > > > > START_USE(vq);
> > > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > > * more to do. */
> > > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > > +
> > > > > > > + /* TODO: tune this threshold */
> > > > > > > + bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > > > > > > +
> > > > > > > + used_idx = vq->last_used_idx + bufs;
> > > > > > > + wrap_counter = vq->wrap_counter;
> > > > > > > +
> > > > > > > + if (used_idx >= vq->vring_packed.num) {
> > > > > > > + used_idx -= vq->vring_packed.num;
> > > > > > > + wrap_counter ^= 1;
> > > > > > > + }
> > > > > > > +
> > > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > + used_idx | (wrap_counter << 15));
> > > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > vq->event_flags_shadow);
> > > > > > > }
> > > > > > > @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > > > > > > switch (i) {
> > > > > > > case VIRTIO_RING_F_INDIRECT_DESC:
> > > > > > > break;
> > > > > > > +#if 0
> > > > > > > case VIRTIO_RING_F_EVENT_IDX:
> > > > > > > break;
> > > > > > > +#endif
> > > > > > > case VIRTIO_F_VERSION_1:
> > > > > > > break;
> > > > > > > case VIRTIO_F_IOMMU_PLATFORM:
> > > > > >
^ permalink raw reply
* Re: [PATCH] sctp: fix a potential missing-check bug
From: Marcelo Ricardo Leitner @ 2018-05-03 1:48 UTC (permalink / raw)
To: Wenwen Wang
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list
In-Reply-To: <CAAa=b7f+dfDUZR7tHvmSQgTcNDmGjvCn9ZK9eevKGF+bNd2Aqg@mail.gmail.com>
On Wed, May 02, 2018 at 08:27:05PM -0500, Wenwen Wang wrote:
> On Wed, May 2, 2018 at 8:24 PM, Marcelo Ricardo Leitner
> <marcelo.leitner@gmail.com> wrote:
> > On Wed, May 02, 2018 at 08:15:45PM -0500, Wenwen Wang wrote:
> >> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
> >> and max_len to check whether it is in the appropriate range. If it is not,
> >> an error code -EINVAL will be returned. This is enforced by a security
> >> check. But, this check is only executed when 'val' is not 0. In fact, if
> >> 'val' is 0, it will be assigned with a new value (if the return value of
> >> the function sctp_id2assoc() is not 0) in the following execution. However,
> >> this new value of 'val' is not checked before it is assigned to
> >> asoc->user_frag. That means it is possible that the new value of 'val'
> >> could be out of the expected range. This can cause security issues
> >> such as buffer overflows, e.g., the new value of 'val' is used as an index
> >> to access a buffer.
> >>
> >> This patch inserts a check for the new value of 'val' to see if it is in
> >> the expected range. If it is not, an error code -EINVAL will be returned.
> >>
> >> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
> >> ---
> >> net/sctp/socket.c | 22 +++++++++++-----------
> >> 1 file changed, 11 insertions(+), 11 deletions(-)
> >
> > ?
> > This patch is the same as previous one. git send-email <old file>
> > maybe?
> >
> > Marcelo
>
> Thanks for your suggestion, Marcelo. I can send the old file. But, I
> have added a line of comment in this patch.
I meant if you had sent the old patch again by accident, because you
said you worked on an old version of the tree, but then posted a patch
that also doesn't use the new MTU function I mentioned.
Marcelo
^ permalink raw reply
* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-05-03 2:09 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180503044218-mutt-send-email-mst@kernel.org>
On Thu, May 03, 2018 at 04:44:39AM +0300, Michael S. Tsirkin wrote:
> On Thu, May 03, 2018 at 09:11:16AM +0800, Tiwei Bie wrote:
> > On Wed, May 02, 2018 at 06:42:57PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, May 02, 2018 at 11:12:55PM +0800, Tiwei Bie wrote:
> > > > On Wed, May 02, 2018 at 04:51:01PM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, May 02, 2018 at 03:28:19PM +0800, Tiwei Bie wrote:
> > > > > > On Wed, May 02, 2018 at 10:51:06AM +0800, Jason Wang wrote:
> > > > > > > On 2018年04月25日 13:15, Tiwei Bie wrote:
> > > > > > > > This commit introduces the event idx support in packed
> > > > > > > > ring. This feature is temporarily disabled, because the
> > > > > > > > implementation in this patch may not work as expected,
> > > > > > > > and some further discussions on the implementation are
> > > > > > > > needed, e.g. do we have to check the wrap counter when
> > > > > > > > checking whether a kick is needed?
> > > > > > > >
> > > > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > > > ---
> > > > > > > > drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
> > > > > > > > 1 file changed, 49 insertions(+), 4 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 0181e93897be..b1039c2985b9 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > > static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > > {
> > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > - u16 flags;
> > > > > > > > + u16 new, old, off_wrap, flags;
> > > > > > > > bool needs_kick;
> > > > > > > > u32 snapshot;
> > > > > > > > @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > > * suppressions. */
> > > > > > > > virtio_mb(vq->weak_barriers);
> > > > > > > > + old = vq->next_avail_idx - vq->num_added;
> > > > > > > > + new = vq->next_avail_idx;
> > > > > > > > + vq->num_added = 0;
> > > > > > > > +
> > > > > > > > snapshot = *(u32 *)vq->vring_packed.device;
> > > > > > > > + off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
> > > > > > > > flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
> > > > > > > > #ifdef DEBUG
> > > > > > > > @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > > vq->last_add_time_valid = false;
> > > > > > > > #endif
> > > > > > > > - needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > > + if (flags == VRING_EVENT_F_DESC)
> > > > > > > > + needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > > > >
> > > > > > > I wonder whether or not the math is correct. Both new and event are in the
> > > > > > > > unit of descriptor ring size, but old does not seem to be.
> > > > > >
> > > > > > What vring_need_event() cares about is the distance between
> > > > > > `new` and `old`, i.e. vq->num_added. So I think there
> > > > > > is nothing wrong with `old`. But the calculation of the
> > > > > > distance between `new` and `event_idx` isn't right when
> > > > > > `new` wraps. What do you think about the code below:
> > > > > >
> > > > > > wrap_counter = off_wrap >> 15;
> > > > > > event_idx = off_wrap & ~(1<<15);
> > > > > > if (wrap_counter != vq->wrap_counter)
> > > > > > event_idx -= vq->vring_packed.num;
> > > > > >
> > > > > > needs_kick = vring_need_event(event_idx, new, old);
> > > > >
> > > > > I suspect this hack won't work for non power of 2 ring.
> > > >
> > > > Above code doesn't require the ring size to be a power of 2.
> > > >
> > > > For (__u16)(new_idx - old), what we want to get is vq->num_added.
> > > >
> > > > old = vq->next_avail_idx - vq->num_added;
> > > > new = vq->next_avail_idx;
> > > >
> > > > When vq->next_avail_idx >= vq->num_added, it's obvious that,
> > > > (__u16)(new_idx - old) is vq->num_added.
> > > >
> > > > And when vq->next_avail_idx < vq->num_added, new will be smaller
> > > > than old (old will be a big unsigned number), but (__u16)(new_idx
> > > > - old) is still vq->num_added.
> > > >
> > > > For (__u16)(new_idx - event_idx - 1), when new wraps and event_idx
> > > > doesn't wrap, the most straightforward way to calculate it is:
> > > > (new + vq->vring_packed.num) - event_idx - 1.
> > >
> > > So how about we use the straightforward way then?
> >
> > You mean we do new += vq->vring_packed.num instead
> > of event_idx -= vq->vring_packed.num before calling
> > vring_need_event()?
> >
> > The problem is that, the second param (new_idx) of
> > vring_need_event() will be used for:
> >
> > (__u16)(new_idx - event_idx - 1)
> > (__u16)(new_idx - old)
> >
> > So if we change new, we will need to change old too.
>
> I think that since we have a branch there anyway,
> we are better off just special-casing if (wrap_counter != vq->wrap_counter).
> Treat it differently and avoid casts.
>
> > And that would be an ugly hack..
> >
> > Best regards,
> > Tiwei Bie
>
> I consider casts and huge numbers with two's complement
> games even uglier.
The dependency on the two's complement game has been there
since the split ring.
In packed ring, old is calculated via:
old = vq->next_avail_idx - vq->num_added;
In split ring, old is calculated via:
old = vq->avail_idx_shadow - vq->num_added;
In both cases, when vq->num_added is bigger than the index,
old will wrap around and be a big unsigned number.
Best regards,
Tiwei Bie
>
> > >
> > > > But we can also calculate it in this way:
> > > >
> > > > event_idx -= vq->vring_packed.num;
> > > > (event_idx will be a big unsigned number)
> > > >
> > > > Then (__u16)(new_idx - event_idx - 1) will be the value we want.
> > > >
> > > > Best regards,
> > > > Tiwei Bie
> > >
> > >
> > > > >
> > > > >
> > > > > > Best regards,
> > > > > > Tiwei Bie
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > > + else
> > > > > > > > + needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > > END_USE(vq);
> > > > > > > > return needs_kick;
> > > > > > > > }
> > > > > > > > @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > > if (vq->last_used_idx >= vq->vring_packed.num)
> > > > > > > > vq->last_used_idx -= vq->vring_packed.num;
> > > > > > > > + /* If we expect an interrupt for the next entry, tell host
> > > > > > > > + * by writing event index and flush out the write before
> > > > > > > > + * the read in the next get_buf call. */
> > > > > > > > + if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> > > > > > > > + virtio_store_mb(vq->weak_barriers,
> > > > > > > > + &vq->vring_packed.driver->off_wrap,
> > > > > > > > + cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> > > > > > > > + (vq->wrap_counter << 15)));
> > > > > > > > +
> > > > > > > > #ifdef DEBUG
> > > > > > > > vq->last_add_time_valid = false;
> > > > > > > > #endif
> > > > > > > > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > > > * more to do. */
> > > > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > > > +
> > > > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > > + vq->last_used_idx | (vq->wrap_counter << 15));
> > > > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > > vq->event_flags_shadow);
> > > > > > > > }
> > > > > > > > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > > > > > > > static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > > > > > > > {
> > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > + u16 bufs, used_idx, wrap_counter;
> > > > > > > > START_USE(vq);
> > > > > > > > /* We optimistically turn back on interrupts, then check if there was
> > > > > > > > * more to do. */
> > > > > > > > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > > + * either clear the flags bit or point the event index at the next
> > > > > > > > + * entry. Always update the event index to keep code simple. */
> > > > > > > > +
> > > > > > > > + /* TODO: tune this threshold */
> > > > > > > > + bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > > > > > > > +
> > > > > > > > + used_idx = vq->last_used_idx + bufs;
> > > > > > > > + wrap_counter = vq->wrap_counter;
> > > > > > > > +
> > > > > > > > + if (used_idx >= vq->vring_packed.num) {
> > > > > > > > + used_idx -= vq->vring_packed.num;
> > > > > > > > + wrap_counter ^= 1;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > > + used_idx | (wrap_counter << 15));
> > > > > > > > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > > virtio_wmb(vq->weak_barriers);
> > > > > > > > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > > + VRING_EVENT_F_ENABLE;
> > > > > > > > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > > vq->event_flags_shadow);
> > > > > > > > }
> > > > > > > > @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > > > > > > > switch (i) {
> > > > > > > > case VIRTIO_RING_F_INDIRECT_DESC:
> > > > > > > > break;
> > > > > > > > +#if 0
> > > > > > > > case VIRTIO_RING_F_EVENT_IDX:
> > > > > > > > break;
> > > > > > > > +#endif
> > > > > > > > case VIRTIO_F_VERSION_1:
> > > > > > > > break;
> > > > > > > > case VIRTIO_F_IOMMU_PLATFORM:
> > > > > > >
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: pull-request: bpf 2018-05-03
From: David Miller @ 2018-05-03 2:47 UTC (permalink / raw)
To: daniel; +Cc: ast, netdev
In-Reply-To: <20180503003712.749-1-daniel@iogearbox.net>
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 May 2018 02:37:12 +0200
> The following pull-request contains BPF updates for your *net* tree.
>
> The main changes are:
...
> Please consider pulling these changes from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
Pulled, thanks Daniel.
^ permalink raw reply
* Re: [PATCH net] ipv4: fix fnhe usage by non-cached routes
From: David Miller @ 2018-05-03 2:55 UTC (permalink / raw)
To: ja; +Cc: netdev, kafai, kernel-team, dsahern, lucien.xin
In-Reply-To: <20180502064119.4552-1-ja@ssi.bg>
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 2 May 2018 09:41:19 +0300
> Allow some non-cached routes to use non-expired fnhe:
>
> 1. ip_del_fnhe: moved above and now called by find_exception.
> The 4.5+ commit deed49df7390 expires fnhe only when caching
> routes. Change that to:
>
> 1.1. use fnhe for non-cached local output routes, with the help
> from (2)
>
> 1.2. allow __mkroute_input to detect expired fnhe (outdated
> fnhe_gw, for example) when do_cache is false, eg. when itag!=0
> for unicast destinations.
>
> 2. __mkroute_output: keep fi to allow local routes with orig_oif != 0
> to use fnhe info even when the new route will not be cached into fnhe.
> After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib
> result for local traffic") it means all local routes will be affected
> because they are not cached. This change is used to solve a PMTU
> problem with IPVS (and probably Netfilter DNAT) setups that redirect
> local clients from target local IP (local route to Virtual IP)
> to new remote IP target, eg. IPVS TUN real server. Loopback has
> 64K MTU and we need to create fnhe on the local route that will
> keep the reduced PMTU for the Virtual IP. Without this change
> fnhe_pmtu is updated from ICMP but never exposed to non-cached
> local routes. This includes routes with flowi4_oif!=0 for 4.6+ and
> with flowi4_oif=any for 4.14+.
>
> 3. update_or_create_fnhe: make sure fnhe_expires is not 0 for
> new entries
>
> Fixes: 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic")
> Fixes: d6d5e999e5df ("route: do not cache fib route info on local routes with oif")
> Fixes: deed49df7390 ("route: check and remove route cache when we get route")
> Cc: David Ahern <dsahern@gmail.com>
> Cc: Xin Long <lucien.xin@gmail.com>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
Applied and queued up for -stable, thanks Julian.
^ permalink raw reply
* [PATCH net] tcp: restore autocorking
From: Eric Dumazet @ 2018-05-03 3:25 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Michael Wenig, Eric Dumazet
When adding rb-tree for TCP retransmit queue, we inadvertently broke
TCP autocorking.
tcp_should_autocork() should really check if the rtx queue is not empty.
Tested:
Before the fix :
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
540000 262144 500 10.00 2682.85 2.47 1.59 3.618 2.329
TcpExtTCPAutoCorking 33 0.0
// Same test, but forcing TCP_NODELAY
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
540000 262144 500 10.00 1408.75 2.44 2.96 6.802 8.259
TcpExtTCPAutoCorking 1 0.0
After the fix :
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
540000 262144 500 10.00 5472.46 2.45 1.43 1.761 1.027
TcpExtTCPAutoCorking 361293 0.0
// With TCP_NODELAY option
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
540000 262144 500 10.00 5454.96 2.46 1.63 1.775 1.174
TcpExtTCPAutoCorking 315448 0.0
Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Michael Wenig <mwenig@vmware.com>
Tested-by: Michael Wenig <mwenig@vmware.com>
---
net/ipv4/tcp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 44be7f43455e4aefde8db61e2d941a69abcc642a..c9d00ef54deca15d5760bcbe154001a96fa1e2a7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
{
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
- skb != tcp_write_queue_head(sk) &&
+ !tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* [bpf-next v1 0/9] bpf: Add helper to do FIB lookups
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.
The response from a FIB and neighbor lookup is either the egress index
with the bpf_fib_lookup struct filled in with dmac and gateway or
0 meaning the packet should continue up the stack. In time we can
revisit this to return the FIB lookup result errno if it is one of the
special RTN_'s such as RTN_BLACKHOLE (-EINVAL) so that the XDP
programs can do an early drop if desired.
Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.
Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.
Patch 8 adds the bpf helper and 9 adds a sample program.
v1
- updated commit messages and cover letter
- added comment to sample program noting lack of verification on
egress device supporting XDP
RFC v2
- fixed use of forward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency
David Ahern (9):
net/ipv6: Rename fib6_lookup to fib6_node_lookup
net/ipv6: Rename rt6_multipath_select
net/ipv6: Extract table lookup from ip6_pol_route
net/ipv6: Refactor fib6_rule_action
net/ipv6: Add fib6_lookup
net/ipv6: Update fib6 tracepoint to take fib6_info
net/ipv6: Add fib lookup stubs for use in bpf helper
bpf: Provide helper to do lookups in kernel FIB table
samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP
include/net/addrconf.h | 14 ++
include/net/ip6_fib.h | 21 ++-
include/trace/events/fib6.h | 14 +-
include/uapi/linux/bpf.h | 83 +++++++++-
net/core/filter.c | 263 ++++++++++++++++++++++++++++++
net/ipv6/addrconf_core.c | 33 +++-
net/ipv6/af_inet6.c | 6 +-
net/ipv6/fib6_rules.c | 138 +++++++++++++---
net/ipv6/ip6_fib.c | 21 ++-
net/ipv6/route.c | 76 +++++----
samples/bpf/Makefile | 4 +
samples/bpf/xdp_fwd_kern.c | 113 +++++++++++++
samples/bpf/xdp_fwd_user.c | 136 +++++++++++++++
tools/testing/selftests/bpf/bpf_helpers.h | 3 +
14 files changed, 850 insertions(+), 75 deletions(-)
create mode 100644 samples/bpf/xdp_fwd_kern.c
create mode 100644 samples/bpf/xdp_fwd_user.c
--
2.11.0
^ permalink raw reply
* [bpf-next v1 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 6 +++---
net/ipv6/ip6_fib.c | 14 ++++++++------
net/ipv6/route.c | 8 ++++----
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..5a16630179cb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
-struct fib6_node *fib6_lookup(struct fib6_node *root,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr);
struct fib6_node *fib6_locate(struct fib6_node *root,
const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..4cfffa0f676e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
const struct in6_addr *addr; /* search key */
};
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
- struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
{
struct fib6_node *fn;
__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
#ifdef CONFIG_IPV6_SUBTREES
if (subtree) {
struct fib6_node *sfn;
- sfn = fib6_lookup_1(subtree, args + 1);
+ sfn = fib6_node_lookup_1(subtree,
+ args + 1);
if (!sfn)
goto backtrack;
fn = sfn;
@@ -1422,8 +1423,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
/* called with rcu_read_lock() held
*/
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct fib6_node *fn;
struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
}
};
- fn = fib6_lookup_1(root, daddr ? args : args + 1);
+ fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ee0a34fba46..d903db30dfff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
pn = rcu_dereference(fn->parent);
sn = FIB6_SUBTREE(pn);
if (sn && sn != fn)
- fn = fib6_lookup(sn, NULL, saddr);
+ fn = fib6_node_lookup(sn, NULL, saddr);
else
fn = pn;
if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
flags &= ~RT6_LOOKUP_F_IFACE;
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
f6i = rcu_dereference(fn->leaf);
if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2420,7 +2420,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
*/
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 2/9] net/ipv6: Rename rt6_multipath_select
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 5 +++++
net/ipv6/route.c | 17 +++++++++--------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a16630179cb..80d76d8dc683 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+struct fib6_info *fib6_multipath_select(const struct net *net,
+ struct fib6_info *match,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb, int strict);
+
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
const struct in6_addr *daddr,
const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d903db30dfff..58af969f3a2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
}
-static struct fib6_info *rt6_multipath_select(const struct net *net,
- struct fib6_info *match,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb,
- int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+ struct fib6_info *match,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb,
+ int strict)
{
struct fib6_info *sibling, *next_sibling;
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
f6i = rt6_device_match(net, f6i, &fl6->saddr,
fl6->flowi6_oif, flags);
if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
- f6i = rt6_multipath_select(net, f6i, fl6,
- fl6->flowi6_oif, skb, flags);
+ f6i = fib6_multipath_select(net, f6i, fl6,
+ fl6->flowi6_oif, skb,
+ flags);
}
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
redo_rt6_select:
f6i = rt6_select(net, fn, oif, strict);
if (f6i->fib6_nsiblings)
- f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+ f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.
ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 4 ++++
net/ipv6/route.c | 39 +++++++++++++++++++++++++--------------
2 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int strict);
+
struct fib6_info *fib6_multipath_select(const struct net *net,
struct fib6_info *match,
struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 58af969f3a2c..d0ace0c5c3e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6,
- const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int strict)
{
struct fib6_node *fn, *saved_fn;
struct fib6_info *f6i;
- struct rt6_info *rt;
- int strict = 0;
-
- strict |= flags & RT6_LOOKUP_F_IFACE;
- strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
- if (net->ipv6.devconf_all->forwarding == 0)
- strict |= RT6_LOOKUP_F_REACHABLE;
-
- rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
redo_rt6_select:
f6i = rt6_select(net, fn, oif, strict);
- if (f6i->fib6_nsiblings)
- f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
}
}
+ return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ const struct sk_buff *skb, int flags)
+{
+ struct fib6_info *f6i;
+ struct rt6_info *rt;
+ int strict = 0;
+
+ strict |= flags & RT6_LOOKUP_F_IFACE;
+ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+ if (net->ipv6.devconf_all->forwarding == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
+
+ rcu_read_lock();
+
+ f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+ if (f6i->fib6_nsiblings)
+ f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
if (f6i == net->ipv6.fib6_null_entry) {
rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
1 file changed, 31 insertions(+), 21 deletions(-)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &net->ipv6.ip6_null_entry->dst;
}
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+ struct flowi6 *flp6, const struct net_device *dev)
+{
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /* If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+ rt6_flags2srcprefs(flags), &saddr))
+ return -EAGAIN;
+
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+ return -EAGAIN;
+
+ flp6->saddr = saddr;
+ }
+
+ return 0;
+}
+
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
rt = lookup(net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
- struct fib6_rule *r = (struct fib6_rule *)rule;
-
- /*
- * If we need to find a source address for this traffic,
- * we check the result if it meets requirement of the rule.
- */
- if ((rule->flags & FIB_RULE_FIND_SADDR) &&
- r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
- struct in6_addr saddr;
-
- if (ipv6_dev_get_saddr(net,
- ip6_dst_idev(&rt->dst)->dev,
- &flp6->daddr,
- rt6_flags2srcprefs(flags),
- &saddr))
- goto again;
- if (!ipv6_prefix_equal(&saddr, &r->src.addr,
- r->src.plen))
- goto again;
- flp6->saddr = saddr;
- }
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ ip6_dst_idev(&rt->dst)->dev);
+
+ if (err == -EAGAIN)
+ goto again;
+
err = rt->dst.error;
if (err != -EAGAIN)
goto out;
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is anywhere from 140% (MULTIPLE_TABLES config disabled)
to 60% faster than any of the dst based lookup methods (without custom
rules) and 25% faster with custom rules (e.g., l3mdev rule).
Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.
Caller must hold rcu lock as no reference is taken on the returned
fib entry.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 6 ++++
net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
net/ipv6/ip6_fib.c | 7 +++++
3 files changed, 97 insertions(+), 2 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags);
+
/* called with rcu lock held; caller needs to select path */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
return fib_rules_seq_read(net, AF_INET6);
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ struct fib6_info *f6i;
+ int err;
+
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = fib6_table_lookup,
+ .lookup_data = &oif,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i = arg.result ? : net->ipv6.fib6_null_entry;
+ } else {
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+ oif, fl6, flags);
+ if (!f6i || f6i == net->ipv6.fib6_null_entry)
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, flags);
+ }
+
+ return f6i;
+}
+
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
return 0;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct net *net = rule->fr_net;
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+ int err = -EAGAIN, *oif;
+ u32 tb_id;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table)
+ return -EAGAIN;
+
+ oif = (int *)arg->lookup_data;
+ f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+ if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ fib6_info_nh_dev(f6i));
+
+ if (likely(!err))
+ arg->result = f6i;
+ }
+
+ return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
{
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
return err;
}
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ if (arg->lookup_ptr == fib6_table_lookup)
+ return fib6_rule_action_alt(rule, flp, flags, arg);
+
+ return __fib6_rule_action(rule, flp, flags, arg);
+}
+
static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4cfffa0f676e..0b94c0a631cb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &rt->dst;
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
static void __net_init fib6_tables_init(struct net *net)
{
fib6_link_table(net, net->ipv6.fib6_main_tbl);
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/trace/events/fib6.h | 14 +++++++-------
net/ipv6/route.c | 14 ++++++--------
2 files changed, 13 insertions(+), 15 deletions(-)
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
TRACE_EVENT(fib6_table_lookup,
- TP_PROTO(const struct net *net, const struct rt6_info *rt,
+ TP_PROTO(const struct net *net, const struct fib6_info *f6i,
struct fib6_table *table, const struct flowi6 *flp),
- TP_ARGS(net, rt, table, flp),
+ TP_ARGS(net, f6i, table, flp),
TP_STRUCT__entry(
__field( u32, tb_id )
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
in6 = (struct in6_addr *)__entry->dst;
*in6 = flp->daddr;
- if (rt->rt6i_idev) {
- __assign_str(name, rt->rt6i_idev->dev->name);
+ if (f6i->fib6_nh.nh_dev) {
+ __assign_str(name, f6i->fib6_nh.nh_dev);
} else {
__assign_str(name, "");
}
- if (rt == net->ipv6.ip6_null_entry) {
+ if (f6i == net->ipv6.fib6_null_entry) {
struct in6_addr in6_zero = {};
in6 = (struct in6_addr *)__entry->gw;
*in6 = in6_zero;
- } else if (rt) {
+ } else if (f6i) {
in6 = (struct in6_addr *)__entry->gw;
- *in6 = rt->rt6i_gateway;
+ *in6 = f6i->fib6_nh.nh_gw;
}
),
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d0ace0c5c3e9..cf8de6899581 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
goto restart;
}
+ trace_fib6_table_lookup(net, f6i, table, fl6);
+
/* Search through exception table */
rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
rcu_read_unlock();
- trace_fib6_table_lookup(net, rt, table, fl6);
-
return rt;
}
@@ -1827,6 +1827,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
}
}
+ trace_fib6_table_lookup(net, f6i, table, fl6);
+
return f6i;
}
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
dst_hold(&rt->dst);
- trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
}
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
dst_use_noref(&rt->dst, jiffies);
rcu_read_unlock();
- trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
dst_hold(&uncached_rt->dst);
}
- trace_fib6_table_lookup(net, uncached_rt, table, fl6);
return uncached_rt;
-
} else {
/* Get a percpu copy */
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
local_bh_enable();
rcu_read_unlock();
- trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
return pcpu_rt;
}
}
@@ -2486,7 +2484,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
rcu_read_unlock();
- trace_fib6_table_lookup(net, ret, table, fl6);
+ trace_fib6_table_lookup(net, rt, table, fl6);
return ret;
};
--
2.11.0
^ permalink raw reply related
* [bpf-next v1 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
From: David Ahern @ 2018-05-03 3:53 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>
Add a stub to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, a stub
for a full route lookup, fib6_lookup, and a stub for multipath path
selection, fib6_multipath_select.
The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/addrconf.h | 14 ++++++++++++++
net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
net/ipv6/af_inet6.c | 6 +++++-
3 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
const struct in6_addr *addr);
int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
struct dst_entry **dst, struct flowi6 *fl6);
+
+ struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+ struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+ struct flowi6 *fl6, int flags);
+ struct fib6_info *(*fib6_table_lookup)(struct net *net,
+ struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ int flags);
+ struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+ struct fib6_info *f6i,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb,
+ int strict);
+
void (*udpv6_encap_enable)(void);
void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb, int strict)
+{
+ return f6i;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .fib6_get_table = eafnosupport_fib6_get_table,
+ .fib6_table_lookup = eafnosupport_fib6_table_lookup,
+ .fib6_lookup = eafnosupport_fib6_lookup,
+ .fib6_multipath_select = eafnosupport_fib6_multipath_select,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..c0e8255d50bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,7 +887,11 @@ static struct pernet_operations inet6_net_ops = {
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
- .ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_dst_lookup = ip6_dst_lookup,
+ .fib6_get_table = fib6_get_table,
+ .fib6_table_lookup = fib6_table_lookup,
+ .fib6_lookup = fib6_lookup,
+ .fib6_multipath_select = fib6_multipath_select,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
--
2.11.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox