* [PATCH] net: filter: Convert the BPF VM to threaded code
@ 2011-07-29 8:10 Rui Ueyama
2011-07-29 9:30 ` Eric Dumazet
2011-08-01 18:16 ` Hagen Paul Pfeifer
0 siblings, 2 replies; 13+ messages in thread
From: Rui Ueyama @ 2011-07-29 8:10 UTC (permalink / raw)
To: netdev
Convert the BPF VM to threaded code to improve performance.
The BPF VM is basically a big for loop containing a switch statement. That is
slow because for each instruction it checks the for loop condition and does the
conditional branch of the switch statement.
This patch eliminates the conditional branch by replacing it with a jump table
using GCC's labels-as-values feature. The for loop condition check can also be
removed, because the filter code always ends with a RET instruction.
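
As an aside, here is a minimal, self-contained user-space sketch of the
dispatch technique, for readers unfamiliar with labels-as-values. The toy
opcodes and program below are purely illustrative (they are not the BPF
opcodes); the real change is the jump_table/NEXT machinery in sk_run_filter()
shown in the diff.

/* Toy VM showing threaded-code dispatch with GCC's labels-as-values.
 * Illustrative only; builds with gcc/clang, not part of this patch. */
#include <stdio.h>

enum { OP_PUSH, OP_ADD, OP_PRINT, OP_HALT };

struct insn { int op; int arg; };

static int run(const struct insn *pc)
{
	/* One entry per opcode; &&label takes the address of a local label. */
	static void *dispatch[] = { &&op_push, &&op_add, &&op_print, &&op_halt };
	int stack[16], *sp = stack;

	goto *dispatch[pc->op];		/* dispatch the first instruction */

op_push:
	*sp++ = pc->arg;
	goto *dispatch[(++pc)->op];	/* "NEXT": advance and re-dispatch */
op_add:
	sp--;
	sp[-1] += sp[0];
	goto *dispatch[(++pc)->op];
op_print:
	printf("%d\n", sp[-1]);
	goto *dispatch[(++pc)->op];
op_halt:
	return sp[-1];
}

int main(void)
{
	const struct insn prog[] = {
		{ OP_PUSH, 2 }, { OP_PUSH, 40 }, { OP_ADD, 0 },
		{ OP_PRINT, 0 }, { OP_HALT, 0 },
	};
	return run(prog) == 42 ? 0 : 1;
}

Every handler ends by jumping straight to the next handler, so there is no
shared dispatch site and no loop-condition test; the classic argument for
threaded code is that each indirect jump then gets its own branch-prediction
history, although how much that helps in practice is debatable (see the
discussion below).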
Signed-off-by: Rui Ueyama <rui314@gmail.com>
---
include/linux/filter.h | 60 +------
include/linux/filter_inst.h | 57 ++++++
net/core/filter.c | 457 +++++++++++++++++++++----------------------
3 files changed, 289 insertions(+), 285 deletions(-)
create mode 100644 include/linux/filter_inst.h
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 741956f..2f72166 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -172,62 +172,10 @@ static inline void bpf_jit_free(struct sk_filter *fp)
#endif
enum {
- BPF_S_RET_K = 1,
- BPF_S_RET_A,
- BPF_S_ALU_ADD_K,
- BPF_S_ALU_ADD_X,
- BPF_S_ALU_SUB_K,
- BPF_S_ALU_SUB_X,
- BPF_S_ALU_MUL_K,
- BPF_S_ALU_MUL_X,
- BPF_S_ALU_DIV_X,
- BPF_S_ALU_AND_K,
- BPF_S_ALU_AND_X,
- BPF_S_ALU_OR_K,
- BPF_S_ALU_OR_X,
- BPF_S_ALU_LSH_K,
- BPF_S_ALU_LSH_X,
- BPF_S_ALU_RSH_K,
- BPF_S_ALU_RSH_X,
- BPF_S_ALU_NEG,
- BPF_S_LD_W_ABS,
- BPF_S_LD_H_ABS,
- BPF_S_LD_B_ABS,
- BPF_S_LD_W_LEN,
- BPF_S_LD_W_IND,
- BPF_S_LD_H_IND,
- BPF_S_LD_B_IND,
- BPF_S_LD_IMM,
- BPF_S_LDX_W_LEN,
- BPF_S_LDX_B_MSH,
- BPF_S_LDX_IMM,
- BPF_S_MISC_TAX,
- BPF_S_MISC_TXA,
- BPF_S_ALU_DIV_K,
- BPF_S_LD_MEM,
- BPF_S_LDX_MEM,
- BPF_S_ST,
- BPF_S_STX,
- BPF_S_JMP_JA,
- BPF_S_JMP_JEQ_K,
- BPF_S_JMP_JEQ_X,
- BPF_S_JMP_JGE_K,
- BPF_S_JMP_JGE_X,
- BPF_S_JMP_JGT_K,
- BPF_S_JMP_JGT_X,
- BPF_S_JMP_JSET_K,
- BPF_S_JMP_JSET_X,
- /* Ancillary data */
- BPF_S_ANC_PROTOCOL,
- BPF_S_ANC_PKTTYPE,
- BPF_S_ANC_IFINDEX,
- BPF_S_ANC_NLATTR,
- BPF_S_ANC_NLATTR_NEST,
- BPF_S_ANC_MARK,
- BPF_S_ANC_QUEUE,
- BPF_S_ANC_HATYPE,
- BPF_S_ANC_RXHASH,
- BPF_S_ANC_CPU,
+ BPF_S_DUMMY = 0,
+#define BPF_INST(inst) inst,
+#include "filter_inst.h"
+#undef BPF_INST
};
#endif /* __KERNEL__ */
diff --git a/include/linux/filter_inst.h b/include/linux/filter_inst.h
new file mode 100644
index 0000000..235a797
--- /dev/null
+++ b/include/linux/filter_inst.h
@@ -0,0 +1,57 @@
+BPF_INST(BPF_S_RET_K)
+BPF_INST(BPF_S_RET_A)
+BPF_INST(BPF_S_ALU_ADD_K)
+BPF_INST(BPF_S_ALU_ADD_X)
+BPF_INST(BPF_S_ALU_SUB_K)
+BPF_INST(BPF_S_ALU_SUB_X)
+BPF_INST(BPF_S_ALU_MUL_K)
+BPF_INST(BPF_S_ALU_MUL_X)
+BPF_INST(BPF_S_ALU_DIV_X)
+BPF_INST(BPF_S_ALU_AND_K)
+BPF_INST(BPF_S_ALU_AND_X)
+BPF_INST(BPF_S_ALU_OR_K)
+BPF_INST(BPF_S_ALU_OR_X)
+BPF_INST(BPF_S_ALU_LSH_K)
+BPF_INST(BPF_S_ALU_LSH_X)
+BPF_INST(BPF_S_ALU_RSH_K)
+BPF_INST(BPF_S_ALU_RSH_X)
+BPF_INST(BPF_S_ALU_NEG)
+BPF_INST(BPF_S_LD_W_ABS)
+BPF_INST(BPF_S_LD_H_ABS)
+BPF_INST(BPF_S_LD_B_ABS)
+BPF_INST(BPF_S_LD_W_LEN)
+BPF_INST(BPF_S_LD_W_IND)
+BPF_INST(BPF_S_LD_H_IND)
+BPF_INST(BPF_S_LD_B_IND)
+BPF_INST(BPF_S_LD_IMM)
+BPF_INST(BPF_S_LDX_W_LEN)
+BPF_INST(BPF_S_LDX_B_MSH)
+BPF_INST(BPF_S_LDX_IMM)
+BPF_INST(BPF_S_MISC_TAX)
+BPF_INST(BPF_S_MISC_TXA)
+BPF_INST(BPF_S_ALU_DIV_K)
+BPF_INST(BPF_S_LD_MEM)
+BPF_INST(BPF_S_LDX_MEM)
+BPF_INST(BPF_S_ST)
+BPF_INST(BPF_S_STX)
+BPF_INST(BPF_S_JMP_JA)
+BPF_INST(BPF_S_JMP_JEQ_K)
+BPF_INST(BPF_S_JMP_JEQ_X)
+BPF_INST(BPF_S_JMP_JGE_K)
+BPF_INST(BPF_S_JMP_JGE_X)
+BPF_INST(BPF_S_JMP_JGT_K)
+BPF_INST(BPF_S_JMP_JGT_X)
+BPF_INST(BPF_S_JMP_JSET_K)
+BPF_INST(BPF_S_JMP_JSET_X)
+
+/* Ancillary data */
+BPF_INST(BPF_S_ANC_PROTOCOL)
+BPF_INST(BPF_S_ANC_PKTTYPE)
+BPF_INST(BPF_S_ANC_IFINDEX)
+BPF_INST(BPF_S_ANC_NLATTR)
+BPF_INST(BPF_S_ANC_NLATTR_NEST)
+BPF_INST(BPF_S_ANC_MARK)
+BPF_INST(BPF_S_ANC_QUEUE)
+BPF_INST(BPF_S_ANC_HATYPE)
+BPF_INST(BPF_S_ANC_RXHASH)
+BPF_INST(BPF_S_ANC_CPU)
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..e0c9d2c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,245 +119,244 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
u32 tmp;
int k;
- /*
- * Process array of filter instructions.
- */
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+ static void *jump_table[] = {
+ NULL,
+#define BPF_INST(inst) &&inst,
+#include <linux/filter_inst.h>
+#undef BPF_INST
+ };
+
#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
-
- switch (fentry->code) {
- case BPF_S_ALU_ADD_X:
- A += X;
- continue;
- case BPF_S_ALU_ADD_K:
- A += K;
- continue;
- case BPF_S_ALU_SUB_X:
- A -= X;
- continue;
- case BPF_S_ALU_SUB_K:
- A -= K;
- continue;
- case BPF_S_ALU_MUL_X:
- A *= X;
- continue;
- case BPF_S_ALU_MUL_K:
- A *= K;
- continue;
- case BPF_S_ALU_DIV_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_S_ALU_DIV_K:
- A = reciprocal_divide(A, K);
- continue;
- case BPF_S_ALU_AND_X:
- A &= X;
- continue;
- case BPF_S_ALU_AND_K:
- A &= K;
- continue;
- case BPF_S_ALU_OR_X:
- A |= X;
- continue;
- case BPF_S_ALU_OR_K:
- A |= K;
- continue;
- case BPF_S_ALU_LSH_X:
- A <<= X;
- continue;
- case BPF_S_ALU_LSH_K:
- A <<= K;
- continue;
- case BPF_S_ALU_RSH_X:
- A >>= X;
- continue;
- case BPF_S_ALU_RSH_K:
- A >>= K;
- continue;
- case BPF_S_ALU_NEG:
- A = -A;
- continue;
- case BPF_S_JMP_JA:
- fentry += K;
- continue;
- case BPF_S_JMP_JGT_K:
- fentry += (A > K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_K:
- fentry += (A >= K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_K:
- fentry += (A == K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_K:
- fentry += (A & K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGT_X:
- fentry += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_X:
- fentry += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_X:
- fentry += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_X:
- fentry += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_LD_W_ABS:
- k = K;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be32(ptr);
- continue;
- }
+#define NEXT goto *jump_table[(++fentry)->code]
+
+ /* Dispatch the first instruction */
+ goto *jump_table[fentry->code];
+
+ BPF_S_ALU_ADD_X:
+ A += X;
+ NEXT;
+ BPF_S_ALU_ADD_K:
+ A += K;
+ NEXT;
+ BPF_S_ALU_SUB_X:
+ A -= X;
+ NEXT;
+ BPF_S_ALU_SUB_K:
+ A -= K;
+ NEXT;
+ BPF_S_ALU_MUL_X:
+ A *= X;
+ NEXT;
+ BPF_S_ALU_MUL_K:
+ A *= K;
+ NEXT;
+ BPF_S_ALU_DIV_X:
+ if (X == 0)
+ return 0;
+ A /= X;
+ NEXT;
+ BPF_S_ALU_DIV_K:
+ A = reciprocal_divide(A, K);
+ NEXT;
+ BPF_S_ALU_AND_X:
+ A &= X;
+ NEXT;
+ BPF_S_ALU_AND_K:
+ A &= K;
+ NEXT;
+ BPF_S_ALU_OR_X:
+ A |= X;
+ NEXT;
+ BPF_S_ALU_OR_K:
+ A |= K;
+ NEXT;
+ BPF_S_ALU_LSH_X:
+ A <<= X;
+ NEXT;
+ BPF_S_ALU_LSH_K:
+ A <<= K;
+ NEXT;
+ BPF_S_ALU_RSH_X:
+ A >>= X;
+ NEXT;
+ BPF_S_ALU_RSH_K:
+ A >>= K;
+ NEXT;
+ BPF_S_ALU_NEG:
+ A = -A;
+ NEXT;
+ BPF_S_JMP_JA:
+ fentry += K;
+ NEXT;
+ BPF_S_JMP_JGT_K:
+ fentry += (A > K) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JGE_K:
+ fentry += (A >= K) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JEQ_K:
+ fentry += (A == K) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JSET_K:
+ fentry += (A & K) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JGT_X:
+ fentry += (A > X) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JGE_X:
+ fentry += (A >= X) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JEQ_X:
+ fentry += (A == X) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_JMP_JSET_X:
+ fentry += (A & X) ? fentry->jt : fentry->jf;
+ NEXT;
+ BPF_S_LD_W_ABS:
+ k = K;
+ load_w:
+ ptr = load_pointer(skb, k, 4, &tmp);
+ if (ptr != NULL) {
+ A = get_unaligned_be32(ptr);
+ NEXT;
+ }
+ return 0;
+ BPF_S_LD_H_ABS:
+ k = K;
+ load_h:
+ ptr = load_pointer(skb, k, 2, &tmp);
+ if (ptr != NULL) {
+ A = get_unaligned_be16(ptr);
+ NEXT;
+ }
+ return 0;
+ BPF_S_LD_B_ABS:
+ k = K;
+ load_b:
+ ptr = load_pointer(skb, k, 1, &tmp);
+ if (ptr != NULL) {
+ A = *(u8 *)ptr;
+ NEXT;
+ }
+ return 0;
+ BPF_S_LD_W_LEN:
+ A = skb->len;
+ NEXT;
+ BPF_S_LDX_W_LEN:
+ X = skb->len;
+ NEXT;
+ BPF_S_LD_W_IND:
+ k = X + K;
+ goto load_w;
+ BPF_S_LD_H_IND:
+ k = X + K;
+ goto load_h;
+ BPF_S_LD_B_IND:
+ k = X + K;
+ goto load_b;
+ BPF_S_LDX_B_MSH:
+ ptr = load_pointer(skb, K, 1, &tmp);
+ if (ptr != NULL) {
+ X = (*(u8 *)ptr & 0xf) << 2;
+ NEXT;
+ }
+ return 0;
+ BPF_S_LD_IMM:
+ A = K;
+ NEXT;
+ BPF_S_LDX_IMM:
+ X = K;
+ NEXT;
+ BPF_S_LD_MEM:
+ A = mem[K];
+ NEXT;
+ BPF_S_LDX_MEM:
+ X = mem[K];
+ NEXT;
+ BPF_S_MISC_TAX:
+ X = A;
+ NEXT;
+ BPF_S_MISC_TXA:
+ A = X;
+ NEXT;
+ BPF_S_RET_K:
+ return K;
+ BPF_S_RET_A:
+ return A;
+ BPF_S_ST:
+ mem[K] = A;
+ NEXT;
+ BPF_S_STX:
+ mem[K] = X;
+ NEXT;
+ BPF_S_ANC_PROTOCOL:
+ A = ntohs(skb->protocol);
+ NEXT;
+ BPF_S_ANC_PKTTYPE:
+ A = skb->pkt_type;
+ NEXT;
+ BPF_S_ANC_IFINDEX:
+ if (!skb->dev)
+ return 0;
+ A = skb->dev->ifindex;
+ NEXT;
+ BPF_S_ANC_MARK:
+ A = skb->mark;
+ NEXT;
+ BPF_S_ANC_QUEUE:
+ A = skb->queue_mapping;
+ NEXT;
+ BPF_S_ANC_HATYPE:
+ if (!skb->dev)
+ return 0;
+ A = skb->dev->type;
+ NEXT;
+ BPF_S_ANC_RXHASH:
+ A = skb->rxhash;
+ NEXT;
+ BPF_S_ANC_CPU:
+ A = raw_smp_processor_id();
+ NEXT;
+ BPF_S_ANC_NLATTR:
+ {
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
return 0;
- case BPF_S_LD_H_ABS:
- k = K;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be16(ptr);
- continue;
- }
+ if (A > skb->len - sizeof(struct nlattr))
return 0;
- case BPF_S_LD_B_ABS:
- k = K;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
- }
+
+ nla = nla_find((struct nlattr *)&skb->data[A],
+ skb->len - A, X);
+ if (nla)
+ A = (void *)nla - (void *)skb->data;
+ else
+ A = 0;
+ }
+ NEXT;
+ BPF_S_ANC_NLATTR_NEST:
+ {
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
return 0;
- case BPF_S_LD_W_LEN:
- A = skb->len;
- continue;
- case BPF_S_LDX_W_LEN:
- X = skb->len;
- continue;
- case BPF_S_LD_W_IND:
- k = X + K;
- goto load_w;
- case BPF_S_LD_H_IND:
- k = X + K;
- goto load_h;
- case BPF_S_LD_B_IND:
- k = X + K;
- goto load_b;
- case BPF_S_LDX_B_MSH:
- ptr = load_pointer(skb, K, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
- }
+ if (A > skb->len - sizeof(struct nlattr))
return 0;
- case BPF_S_LD_IMM:
- A = K;
- continue;
- case BPF_S_LDX_IMM:
- X = K;
- continue;
- case BPF_S_LD_MEM:
- A = mem[K];
- continue;
- case BPF_S_LDX_MEM:
- X = mem[K];
- continue;
- case BPF_S_MISC_TAX:
- X = A;
- continue;
- case BPF_S_MISC_TXA:
- A = X;
- continue;
- case BPF_S_RET_K:
- return K;
- case BPF_S_RET_A:
- return A;
- case BPF_S_ST:
- mem[K] = A;
- continue;
- case BPF_S_STX:
- mem[K] = X;
- continue;
- case BPF_S_ANC_PROTOCOL:
- A = ntohs(skb->protocol);
- continue;
- case BPF_S_ANC_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case BPF_S_ANC_IFINDEX:
- if (!skb->dev)
- return 0;
- A = skb->dev->ifindex;
- continue;
- case BPF_S_ANC_MARK:
- A = skb->mark;
- continue;
- case BPF_S_ANC_QUEUE:
- A = skb->queue_mapping;
- continue;
- case BPF_S_ANC_HATYPE:
- if (!skb->dev)
- return 0;
- A = skb->dev->type;
- continue;
- case BPF_S_ANC_RXHASH:
- A = skb->rxhash;
- continue;
- case BPF_S_ANC_CPU:
- A = raw_smp_processor_id();
- continue;
- case BPF_S_ANC_NLATTR: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = nla_find((struct nlattr *)&skb->data[A],
- skb->len - A, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
- }
- case BPF_S_ANC_NLATTR_NEST: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = (struct nlattr *)&skb->data[A];
- if (nla->nla_len > A - skb->len)
- return 0;
-
- nla = nla_find_nested(nla, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
- }
- default:
- WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
- fentry->code, fentry->jt,
- fentry->jf, fentry->k);
+
+ nla = (struct nlattr *)&skb->data[A];
+ if (nla->nla_len > A - skb->len)
return 0;
- }
+
+ nla = nla_find_nested(nla, X);
+ if (nla)
+ A = (void *)nla - (void *)skb->data;
+ else
+ A = 0;
}
+ NEXT;
+#undef K
+#undef NEXT
return 0;
}
EXPORT_SYMBOL(sk_run_filter);
--
1.7.3.1
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-29 8:10 [PATCH] net: filter: Convert the BPF VM to threaded code Rui Ueyama
@ 2011-07-29 9:30 ` Eric Dumazet
2011-07-30 5:09 ` Rui Ueyama
2011-08-01 18:16 ` Hagen Paul Pfeifer
1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-07-29 9:30 UTC (permalink / raw)
To: Rui Ueyama; +Cc: netdev
On Friday, 29 July 2011 at 01:10 -0700, Rui Ueyama wrote:
> Convert the BPF VM to threaded code to improve performance.
>
> The BPF VM is basically a big for loop containing a switch statement. That is
> slow because for each instruction it checks the for loop condition and does the
> conditional branch of the switch statement.
>
> This patch eliminates the conditional branch by replacing it with a jump table
> using GCC's labels-as-values feature. The for loop condition check can also be
> removed, because the filter code always ends with a RET instruction.
>
Well...
> +#define NEXT goto *jump_table[(++fentry)->code]
> +
> + /* Dispatch the first instruction */
> + goto *jump_table[fentry->code];
This is the killer, as this cannot be predicted by the cpu.
Do you have benchmark results to provide?
We now have BPF JIT on x86_64 and powerpc, and possibly on MIPS and ARM
in the near future.
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-29 9:30 ` Eric Dumazet
@ 2011-07-30 5:09 ` Rui Ueyama
2011-07-30 6:04 ` Eric Dumazet
2011-07-30 9:55 ` Francois Romieu
0 siblings, 2 replies; 13+ messages in thread
From: Rui Ueyama @ 2011-07-30 5:09 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
The benchmark results look good. A simple benchmark that sends 10M UDP
packets to lo took 76.24 seconds on average on a Core 2 Duo L7500@1.6GHz when
tcpdump is running. With this patch it took 75.41 seconds, which means we save
about 80ns per packet on that processor.
I think converting the VM to threaded code is low-hanging fruit, even if we
have JIT compilers for popular architectures. Most of the lines in my patch
are indentation changes, so the actual change is not big.
Vanilla kernel:
(without tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real 0m57.909s
user 0m1.368s
sys 0m56.484s
ruiu@blue:~$ time ./udpflood 10000000
real 0m57.686s
user 0m1.360s
sys 0m56.288s
ruiu@blue:~$ time ./udpflood 10000000
real 0m58.457s
user 0m1.300s
sys 0m57.116s
(with tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real 1m16.025s
user 0m1.464s
sys 1m14.505s
ruiu@blue:~$ time ./udpflood 10000000
real 1m15.860s
user 0m1.232s
sys 1m14.573s
ruiu@blue:~$ time ./udpflood 10000000
real 1m16.861s
user 0m1.504s
sys 1m15.301s
Kernel with the patch:
(without tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real 0m59.272s
user 0m1.308s
sys 0m57.924s
ruiu@blue:~$ time ./udpflood 10000000
real 0m59.624s
user 0m1.336s
sys 0m58.244s
ruiu@blue:~$ time ./udpflood 10000000
real 0m59.340s
user 0m1.240s
sys 0m58.056s
(with tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real 1m15.392s
user 0m1.372s
sys 1m13.965s
ruiu@blue:~$ time ./udpflood 10000000
real 1m15.352s
user 0m1.452s
sys 1m13.845s
ruiu@blue:~$ time ./udpflood 10000000
real 1m15.508s
user 0m1.464s
sys 1m13.989s
The tcpdump command I used is: tcpdump -p -n -s -i lo net 192.168.2.0/24
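
For reference, udpflood is essentially a tight sendto() loop; a minimal
sketch is below (the destination address, port and payload size here are
placeholders, not necessarily what was used for the numbers above):

/* Hypothetical udpflood reconstruction: send argv[1] small UDP datagrams
 * to loopback. The real tool behind the numbers above is not shown in
 * this thread, so address, port, payload and error handling are guesses. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long i, count = (argc > 1) ? atol(argv[1]) : 1000000;
	char payload[32] = "udpflood";
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);	/* discard port; arbitrary choice */
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	for (i = 0; i < count; i++)
		sendto(fd, payload, sizeof(payload), 0,
		       (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return 0;
}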
On Fri, Jul 29, 2011 at 2:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Friday, 29 July 2011 at 01:10 -0700, Rui Ueyama wrote:
>> Convert the BPF VM to threaded code to improve performance.
>>
>> The BPF VM is basically a big for loop containing a switch statement. That is
>> slow because for each instruction it checks the for loop condition and does the
>> conditional branch of the switch statement.
>>
>> This patch eliminates the conditional branch by replacing it with a jump table
>> using GCC's labels-as-values feature. The for loop condition check can also be
>> removed, because the filter code always ends with a RET instruction.
>>
>
> Well...
>
>
>> +#define NEXT goto *jump_table[(++fentry)->code]
>> +
>> + /* Dispatch the first instruction */
>> + goto *jump_table[fentry->code];
>
> This is the killer, as this cannot be predicted by the cpu.
>
> Do you have benchmark results to provide?
>
> We now have BPF JIT on x86_64 and powerpc, and possibly on MIPS and ARM
> in the near future.
>
>
>
>
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-30 5:09 ` Rui Ueyama
@ 2011-07-30 6:04 ` Eric Dumazet
2011-07-30 6:20 ` Eric Dumazet
2011-07-30 9:55 ` Francois Romieu
1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-07-30 6:04 UTC (permalink / raw)
To: Rui Ueyama; +Cc: netdev
On Friday, 29 July 2011 at 22:09 -0700, Rui Ueyama wrote:
> The benchmark results look good. A simple benchmark that sends 10M UDP
> packets to lo took 76.24 seconds on average on a Core 2 Duo L7500@1.6GHz when
> tcpdump is running. With this patch it took 75.41 seconds, which means we save
> about 80ns per packet on that processor.
>
> I think converting the VM to threaded code is low-hanging fruit, even if we
> have JIT compilers for popular architectures. Most of the lines in my patch
> are indentation changes, so the actual change is not big.
>
...
> The tcpdump command I used is: tcpdump -p -n -s -i lo net 192.168.2.0/24
>
Thanks for providing numbers. Was it on a 32-bit or 64-bit kernel?
Have you done a test with a cold instruction cache?
Your patch adds 540 bytes of code, so it's a potential latency increase.
# size net/core/filter.o net/core/filter.o.old
text data bss dec hex filename
4243 0 0 4243 1093 net/core/filter.o
3703 24 0 3727 e8f net/core/filter.o.old
Each 'NEXT' translates to:
4db: 83 c3 08 add $0x8,%ebx
4de: 0f b7 03 movzwl (%ebx),%eax
4e1: 8b 04 85 00 02 00 00 mov 0x200(,%eax,4),%eax
4e8: ff e0 jmp *%eax
And this is on i386; expect more on CPUs with 32-bit fixed-width
instructions...
We can remove one branch per BPF instruction with the following patch:
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..377f3ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,16 +119,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
u32 tmp;
int k;
+ fentry--;
/*
* Process array of filter instructions.
*/
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+ for (;;) {
#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
+ fentry++;
switch (fentry->code) {
case BPF_S_ALU_ADD_X:
A += X;
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-30 6:04 ` Eric Dumazet
@ 2011-07-30 6:20 ` Eric Dumazet
0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2011-07-30 6:20 UTC (permalink / raw)
To: Rui Ueyama; +Cc: netdev
On Saturday, 30 July 2011 at 08:04 +0200, Eric Dumazet wrote:
> We can remove one branch per BPF instruction with the following patch:
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 36f975f..377f3ca 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -119,16 +119,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
> u32 tmp;
> int k;
>
> + fentry--;
> /*
> * Process array of filter instructions.
> */
> - for (;; fentry++) {
> -#if defined(CONFIG_X86_32)
> + for (;;) {
> #define K (fentry->k)
> -#else
> - const u32 K = fentry->k;
> -#endif
>
> + fentry++;
> switch (fentry->code) {
> case BPF_S_ALU_ADD_X:
> A += X;
>
>
BTW, I tried to add one unreachable() in the default: branch, and gcc
4.5.2 generates interesting code:
It still does the compare and test, but both branches end at the same
location:
348: 83 c3 08 add $0x8,%ebx
34b: 66 83 3b 37 cmpw $0x37,(%ebx)
34f: 76 07 jbe 358 <sk_run_filter+0x28>
351: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi
358: 0f b7 03 movzwl (%ebx),%eax
35b: ff 24 85 34 01 00 00 jmp *0x134(,%eax,4)
On a 64-bit build with gcc 4.1.2, it even generates an infinite loop:
730: 66 83 3b 37 cmpw $0x37,(%rbx)
734: 76 02 jbe 738 <sk_run_filter+0x28>
736: eb fe jmp 736 <sk_run_filter+0x26>
738: 0f b7 03 movzwl (%rbx),%eax
73b: ff 24 c5 00 00 00 00 jmpq *0x0(,%rax,8)
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..89221e7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -122,13 +122,11 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
/*
* Process array of filter instructions.
*/
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+ fentry--;
+ for (;;) {
#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
+ fentry++;
switch (fentry->code) {
case BPF_S_ALU_ADD_X:
A += X;
@@ -351,6 +349,7 @@ load_b:
continue;
}
default:
+ unreachable();
WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
fentry->code, fentry->jt,
fentry->jf, fentry->k);
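
For a stand-alone illustration outside the kernel, the same experiment boils
down to __builtin_unreachable() (which is what the kernel's unreachable()
uses on gcc >= 4.5); whether the range check in front of the jump table
really goes away depends on the compiler version, as the two disassemblies
above show:

/* Stand-alone sketch of the default: unreachable() experiment above.
 * __builtin_unreachable() tells the compiler the default case cannot
 * happen; whether the bounds check before the jump table is then dropped
 * is compiler- and version-dependent. */
unsigned int dispatch(unsigned short code, unsigned int a, unsigned int x)
{
	switch (code) {
	case 0: return a + x;
	case 1: return a - x;
	case 2: return a * x;
	case 3: return a & x;
	case 4: return a | x;
	case 5: return a ^ x;
	case 6: return a << x;
	case 7: return a >> x;
	default: __builtin_unreachable();	/* promise: code is always 0..7 */
	}
}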
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-30 5:09 ` Rui Ueyama
2011-07-30 6:04 ` Eric Dumazet
@ 2011-07-30 9:55 ` Francois Romieu
1 sibling, 0 replies; 13+ messages in thread
From: Francois Romieu @ 2011-07-30 9:55 UTC (permalink / raw)
To: Rui Ueyama; +Cc: Eric Dumazet, netdev
Rui Ueyama <rui314@gmail.com> :
> The benchmark results look good.
So far the overall improvement is below 2% (upper bound, without a calculator).
And the benchmark result without tcpdump looks strange.
> A simple benchmark that sends 10M UDP
> packets to lo took 76.24 seconds on average on a Core 2 Duo L7500@1.6GHz when
> tcpdump is running. With this patch it took 75.41 seconds, which means we save
> about 80ns per packet on that processor.
You forgot to analyze the figures without tcpdump.
> I think converting the VM to threaded code is low-hanging fruit, even
> if we have JIT compilers for popular architectures. Most of the lines
> in my patch are indentation changes, so the actual change is not big.
Without further data or a plan for future/possible improvements, it
could also be called a gratuitous rewrite.
--
Ueimor
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-07-29 8:10 [PATCH] net: filter: Convert the BPF VM to threaded code Rui Ueyama
2011-07-29 9:30 ` Eric Dumazet
@ 2011-08-01 18:16 ` Hagen Paul Pfeifer
2011-08-01 18:37 ` Eric Dumazet
1 sibling, 1 reply; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-01 18:16 UTC (permalink / raw)
To: Rui Ueyama; +Cc: netdev
* Rui Ueyama | 2011-07-29 01:10:26 [-0700]:
>Convert the BPF VM to threaded code to improve performance.
>
>The BPF VM is basically a big for loop containing a switch statement. That is
>slow because for each instruction it checks the for loop condition and does the
>conditional branch of the switch statement.
>
>This patch eliminates the conditional branch by replacing it with a jump table
>using GCC's labels-as-values feature. The for loop condition check can also be
>removed, because the filter code always ends with a RET instruction.
With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is able
to generate a jump table; I double-checked this. Not sure what happened
in the meantime.
Hagen
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-01 18:16 ` Hagen Paul Pfeifer
@ 2011-08-01 18:37 ` Eric Dumazet
2011-08-02 0:57 ` David Miller
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-01 18:37 UTC (permalink / raw)
To: Hagen Paul Pfeifer; +Cc: Rui Ueyama, netdev
On Monday, 1 August 2011 at 20:16 +0200, Hagen Paul Pfeifer wrote:
> * Rui Ueyama | 2011-07-29 01:10:26 [-0700]:
>
> >Convert the BPF VM to threaded code to improve performance.
> >
> >The BPF VM is basically a big for loop containing a switch statement. That is
> >slow because for each instruction it checks the for loop condition and does the
> >conditional branch of the switch statement.
> >
> >This patch eliminates the conditional branch by replacing it with a jump table
> >using GCC's labels-as-values feature. The for loop condition check can also be
> >removed, because the filter code always ends with a RET instruction.
>
> With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is able
> to generate a jump table; I double-checked this. Not sure what happened
> in the meantime.
>
A switch() always generates one conditional branch, catching values not
enumerated in the "case ..." clauses.
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-01 18:37 ` Eric Dumazet
@ 2011-08-02 0:57 ` David Miller
2011-08-09 5:00 ` Eric Dumazet
0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2011-08-02 0:57 UTC (permalink / raw)
To: eric.dumazet; +Cc: hagen, rui314, netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 01 Aug 2011 20:37:46 +0200
> On Monday, 1 August 2011 at 20:16 +0200, Hagen Paul Pfeifer wrote:
>> * Rui Ueyama | 2011-07-29 01:10:26 [-0700]:
>>
>> >Convert the BPF VM to threaded code to improve performance.
>> >
>> >The BPF VM is basically a big for loop containing a switch statement. That is
>> >slow because for each instruction it checks the for loop condition and does the
>> >conditional branch of the switch statement.
>> >
>> >This patch eliminates the conditional branch by replacing it with a jump table
>> >using GCC's labels-as-values feature. The for loop condition check can also be
>> >removed, because the filter code always ends with a RET instruction.
>>
>> With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is able
>> to generate a jump table; I double-checked this. Not sure what happened
>> in the meantime.
>>
>
> A switch() always generates one conditional branch, catching values not
> enumerated in the "case ..." clauses.
Maybe it won't if we use an enum and make sure all enum values are handled
in the switch? :-)
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-02 0:57 ` David Miller
@ 2011-08-09 5:00 ` Eric Dumazet
2011-08-09 8:29 ` Hagen Paul Pfeifer
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-09 5:00 UTC (permalink / raw)
To: David Miller; +Cc: hagen, rui314, netdev
On Monday, 1 August 2011 at 17:57 -0700, David Miller wrote:
> Maybe it won't if we use an enum and make sure all enum values are handled
> in the switch? :-)
I tried this idea since it's already an enum and all enum values are
handled in the switch, but all gcc versions I used still generate the
useless compare and branch (always predicted by modern CPUs, so harmless
anyway?)
348: 83 c3 08 add $0x8,%ebx
34b: 66 83 3b 37 cmpw $0x37,(%ebx)
34f: 77 f7 ja 348 <sk_run_filter+0x18>
351: 0f b7 03 movzwl (%ebx),%eax
354: ff 24 85 34 01 00 00 jmp *0x134(,%eax,4)
It would be nice to try the jump table idea and avoid the array
indirection and instead get:
add $0xc,%ebx
jmp *(%ebx)
(But this would need to use a larger kernel_sock_filter storing not a u16
code, but the target address).
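
A rough user-space sketch of that direction - direct threading, where each
instruction carries the handler address resolved once at load time - could
look like the following; the struct layout and names are illustrative only,
not a proposal for the real sock_filter, and the translation pass has to sit
in the same function as the labels, which is part of what makes this awkward
in the kernel:

/* Direct-threading sketch: each instruction stores the address of its
 * handler, resolved once by a translation pass, so dispatch becomes
 * "load target from the instruction, jump" with no table indirection.
 * Layout and names are illustrative; not the kernel's sock_filter. */
#include <stdio.h>

enum { OP_ADD_K, OP_RET_A };

struct tinsn {
	void *target;		/* resolved handler address */
	unsigned short code;	/* original opcode, used only at load time */
	unsigned int k;
};

static unsigned int run(struct tinsn *pc, int translate)
{
	/* Label addresses are only obtainable inside this function, so the
	 * opcode -> address translation pass lives here too. */
	static void *handlers[] = { &&do_add_k, &&do_ret_a };
	unsigned int A = 0;

	if (translate) {
		struct tinsn *p;

		for (p = pc; ; p++) {
			p->target = handlers[p->code];
			if (p->code == OP_RET_A)
				break;
		}
		return 0;
	}

	goto *pc->target;
do_add_k:
	A += pc->k;
	goto *(++pc)->target;	/* NEXT: no jump_table[] lookup needed */
do_ret_a:
	return A;
}

int main(void)
{
	struct tinsn prog[] = {
		{ NULL, OP_ADD_K, 40 },
		{ NULL, OP_ADD_K, 2 },
		{ NULL, OP_RET_A, 0 },
	};

	run(prog, 1);			/* translate once */
	printf("%u\n", run(prog, 0));	/* execute: prints 42 */
	return 0;
}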
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 741956f..f2a1b37 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -171,7 +171,7 @@ static inline void bpf_jit_free(struct sk_filter *fp)
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
#endif
-enum {
+enum bpf_inst {
BPF_S_RET_K = 1,
BPF_S_RET_A,
BPF_S_ALU_ADD_K,
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..ea1f467 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,17 +119,15 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
u32 tmp;
int k;
+ fentry--;
/*
* Process array of filter instructions.
*/
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+ for (;;) {
#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
- switch (fentry->code) {
+ fentry++;
+ switch ((enum bpf_inst)fentry->code) {
case BPF_S_ALU_ADD_X:
A += X;
continue;
@@ -350,11 +348,6 @@ load_b:
A = 0;
continue;
}
- default:
- WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
- fentry->code, fentry->jt,
- fentry->jf, fentry->k);
- return 0;
}
}
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-09 5:00 ` Eric Dumazet
@ 2011-08-09 8:29 ` Hagen Paul Pfeifer
2011-08-09 8:36 ` Eric Dumazet
0 siblings, 1 reply; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-09 8:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, rui314, netdev
On Tue, 09 Aug 2011 07:00:27 +0200, Eric Dumazet wrote:
> I tried this idea since it's already an enum and all enum values are
> handled in the switch, but all gcc versions I used still generate the
> useless compare and branch (always predicted by modern CPUs, so harmless
> anyway?)
Don't think so, the list is rather long - the CPU may not have any
indication. But it is strange that your gcc generates a conditional sequence of
cmp and conditional jump instructions instead of a jump table! Look at
commit 01f2f3f6ef4d076c; this is what I got after I refactored to a
_dense_ enum list.
> (But this would need to use a larger kernel_sock_filter storing not a u16
> code, but the target address).
Not sure if the benefit is worth trying. Can we avoid any cacheline
misses? I don't think so.
> + switch ((enum bpf_inst)fentry->code) {
That should not differ! Eric, are you sure that this does the trick?
Hagen
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-09 8:29 ` Hagen Paul Pfeifer
@ 2011-08-09 8:36 ` Eric Dumazet
2011-08-09 8:53 ` Hagen Paul Pfeifer
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-09 8:36 UTC (permalink / raw)
To: Hagen Paul Pfeifer; +Cc: David Miller, rui314, netdev
On Tuesday, 9 August 2011 at 10:29 +0200, Hagen Paul Pfeifer wrote:
> On Tue, 09 Aug 2011 07:00:27 +0200, Eric Dumazet wrote:
>
> > I tried this idea since it's already an enum and all enum values are
> > handled in the switch, but all gcc versions I used still generate the
> > useless compare and branch (always predicted by modern CPUs, so harmless
> > anyway?)
>
> Don't think so, the list is rather long - the CPU may not have any
> indication. But it is strange that your gcc generates a conditional sequence of
> cmp and conditional jump instructions instead of a jump table! Look at
> commit 01f2f3f6ef4d076c; this is what I got after I refactored to a
> _dense_ enum list.
I am speaking of one single conditional branch, the one that is done for
every BPF instruction executed in a filter. It's never taken, so it must be
predicted by the cpu. A perf session could make sure it is ;)
>
> > (But this would need to use a larger kernel_sock_filter storing not a u16
> > code, but the target address).
>
> Not sure if the benefit is worth trying. Can we avoid any cacheline
> misses? I don't think so.
We avoid some instructions (to load the address from the instruction
code), that's all, for each BPF instruction.
>
>
> > + switch ((enum bpf_inst)fentry->code) {
>
> That should not differ! Eric, are you sure that this does the trick?
It is the way to tell gcc to apply its optimizations, if they exist for an
enum switch(), as David suggested.
If I add a new BPF_S_ANC_NEW_INST value in the enum list, not handled in
the switch(), gcc complains correctly.
net/core/filter.c: In function ‘sk_run_filter’:
net/core/filter.c:130:3: warning: enumeration value ‘BPF_S_ANC_NEW_INST’
not handled in switch
* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
2011-08-09 8:36 ` Eric Dumazet
@ 2011-08-09 8:53 ` Hagen Paul Pfeifer
0 siblings, 0 replies; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-09 8:53 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, rui314, netdev
On Tue, 09 Aug 2011 10:36:28 +0200, Eric Dumazet wrote:
> I am speaking of one single conditional branch, the one that is done for
> every BPF instruction executed in a filter. It's never taken, so it must be
> predicted by the cpu. A perf session could make sure it is ;)
Ahh, ok - perf is your friend, I know Eric ;-)
>> > + switch ((enum bpf_inst)fentry->code) {
>>
>> That should not differ! Eric, are you sure that this does the trick?
>
> It is the way to tell gcc to apply its optimizations, if they exist for an
> enum switch(), as David suggested.
Argl, I see sock_filter.code is declared as __u16, not an enum.
Hagen