netdev.vger.kernel.org archive mirror
* [PATCH] net: filter: Convert the BPF VM to threaded code
@ 2011-07-29  8:10 Rui Ueyama
  2011-07-29  9:30 ` Eric Dumazet
  2011-08-01 18:16 ` Hagen Paul Pfeifer
  0 siblings, 2 replies; 13+ messages in thread
From: Rui Ueyama @ 2011-07-29  8:10 UTC (permalink / raw)
  To: netdev

Convert the BPF VM to threaded code to improve performance.

The BPF VM is basically a big for loop containing a switch statement.  That
is slow because, for each instruction, it checks the for-loop condition and
takes the conditional branch of the switch statement.

This patch eliminates the conditional branch by replacing it with a jump
table using GCC's labels-as-values feature. The for-loop condition check can
also be removed, because the filter code always ends with a RET instruction.
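
As background (not part of the patch itself), the labels-as-values technique
stores label addresses in an array indexed by opcode and dispatches with a
computed goto. A minimal user-space sketch of the idea, using made-up opcodes
rather than the kernel's BPF encoding:

/* Threaded-code dispatch using GCC's labels-as-values extension.
 * The opcodes and the tiny program are illustrative only. */
#include <stdio.h>

enum { OP_INC, OP_DEC, OP_RET };

static int run(const unsigned char *prog)
{
	static void *jump_table[] = {
		[OP_INC] = &&do_inc,
		[OP_DEC] = &&do_dec,
		[OP_RET] = &&do_ret,
	};
	int acc = 0;

#define NEXT goto *jump_table[*++prog]
	goto *jump_table[*prog];	/* dispatch the first instruction */

do_inc:	acc++; NEXT;
do_dec:	acc--; NEXT;
do_ret:	return acc;
#undef NEXT
}

int main(void)
{
	const unsigned char prog[] = { OP_INC, OP_INC, OP_DEC, OP_RET };

	printf("%d\n", run(prog));	/* prints 1 */
	return 0;
}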

Signed-off-by: Rui Ueyama <rui314@gmail.com>
---
 include/linux/filter.h      |   60 +------
 include/linux/filter_inst.h |   57 ++++++
 net/core/filter.c           |  457 +++++++++++++++++++++----------------------
 3 files changed, 289 insertions(+), 285 deletions(-)
 create mode 100644 include/linux/filter_inst.h

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 741956f..2f72166 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -172,62 +172,10 @@ static inline void bpf_jit_free(struct sk_filter *fp)
 #endif

 enum {
-	BPF_S_RET_K = 1,
-	BPF_S_RET_A,
-	BPF_S_ALU_ADD_K,
-	BPF_S_ALU_ADD_X,
-	BPF_S_ALU_SUB_K,
-	BPF_S_ALU_SUB_X,
-	BPF_S_ALU_MUL_K,
-	BPF_S_ALU_MUL_X,
-	BPF_S_ALU_DIV_X,
-	BPF_S_ALU_AND_K,
-	BPF_S_ALU_AND_X,
-	BPF_S_ALU_OR_K,
-	BPF_S_ALU_OR_X,
-	BPF_S_ALU_LSH_K,
-	BPF_S_ALU_LSH_X,
-	BPF_S_ALU_RSH_K,
-	BPF_S_ALU_RSH_X,
-	BPF_S_ALU_NEG,
-	BPF_S_LD_W_ABS,
-	BPF_S_LD_H_ABS,
-	BPF_S_LD_B_ABS,
-	BPF_S_LD_W_LEN,
-	BPF_S_LD_W_IND,
-	BPF_S_LD_H_IND,
-	BPF_S_LD_B_IND,
-	BPF_S_LD_IMM,
-	BPF_S_LDX_W_LEN,
-	BPF_S_LDX_B_MSH,
-	BPF_S_LDX_IMM,
-	BPF_S_MISC_TAX,
-	BPF_S_MISC_TXA,
-	BPF_S_ALU_DIV_K,
-	BPF_S_LD_MEM,
-	BPF_S_LDX_MEM,
-	BPF_S_ST,
-	BPF_S_STX,
-	BPF_S_JMP_JA,
-	BPF_S_JMP_JEQ_K,
-	BPF_S_JMP_JEQ_X,
-	BPF_S_JMP_JGE_K,
-	BPF_S_JMP_JGE_X,
-	BPF_S_JMP_JGT_K,
-	BPF_S_JMP_JGT_X,
-	BPF_S_JMP_JSET_K,
-	BPF_S_JMP_JSET_X,
-	/* Ancillary data */
-	BPF_S_ANC_PROTOCOL,
-	BPF_S_ANC_PKTTYPE,
-	BPF_S_ANC_IFINDEX,
-	BPF_S_ANC_NLATTR,
-	BPF_S_ANC_NLATTR_NEST,
-	BPF_S_ANC_MARK,
-	BPF_S_ANC_QUEUE,
-	BPF_S_ANC_HATYPE,
-	BPF_S_ANC_RXHASH,
-	BPF_S_ANC_CPU,
+	BPF_S_DUMMY = 0,
+#define BPF_INST(inst) inst,
+#include "filter_inst.h"
+#undef BPF_INST
 };

 #endif /* __KERNEL__ */
diff --git a/include/linux/filter_inst.h b/include/linux/filter_inst.h
new file mode 100644
index 0000000..235a797
--- /dev/null
+++ b/include/linux/filter_inst.h
@@ -0,0 +1,57 @@
+BPF_INST(BPF_S_RET_K)
+BPF_INST(BPF_S_RET_A)
+BPF_INST(BPF_S_ALU_ADD_K)
+BPF_INST(BPF_S_ALU_ADD_X)
+BPF_INST(BPF_S_ALU_SUB_K)
+BPF_INST(BPF_S_ALU_SUB_X)
+BPF_INST(BPF_S_ALU_MUL_K)
+BPF_INST(BPF_S_ALU_MUL_X)
+BPF_INST(BPF_S_ALU_DIV_X)
+BPF_INST(BPF_S_ALU_AND_K)
+BPF_INST(BPF_S_ALU_AND_X)
+BPF_INST(BPF_S_ALU_OR_K)
+BPF_INST(BPF_S_ALU_OR_X)
+BPF_INST(BPF_S_ALU_LSH_K)
+BPF_INST(BPF_S_ALU_LSH_X)
+BPF_INST(BPF_S_ALU_RSH_K)
+BPF_INST(BPF_S_ALU_RSH_X)
+BPF_INST(BPF_S_ALU_NEG)
+BPF_INST(BPF_S_LD_W_ABS)
+BPF_INST(BPF_S_LD_H_ABS)
+BPF_INST(BPF_S_LD_B_ABS)
+BPF_INST(BPF_S_LD_W_LEN)
+BPF_INST(BPF_S_LD_W_IND)
+BPF_INST(BPF_S_LD_H_IND)
+BPF_INST(BPF_S_LD_B_IND)
+BPF_INST(BPF_S_LD_IMM)
+BPF_INST(BPF_S_LDX_W_LEN)
+BPF_INST(BPF_S_LDX_B_MSH)
+BPF_INST(BPF_S_LDX_IMM)
+BPF_INST(BPF_S_MISC_TAX)
+BPF_INST(BPF_S_MISC_TXA)
+BPF_INST(BPF_S_ALU_DIV_K)
+BPF_INST(BPF_S_LD_MEM)
+BPF_INST(BPF_S_LDX_MEM)
+BPF_INST(BPF_S_ST)
+BPF_INST(BPF_S_STX)
+BPF_INST(BPF_S_JMP_JA)
+BPF_INST(BPF_S_JMP_JEQ_K)
+BPF_INST(BPF_S_JMP_JEQ_X)
+BPF_INST(BPF_S_JMP_JGE_K)
+BPF_INST(BPF_S_JMP_JGE_X)
+BPF_INST(BPF_S_JMP_JGT_K)
+BPF_INST(BPF_S_JMP_JGT_X)
+BPF_INST(BPF_S_JMP_JSET_K)
+BPF_INST(BPF_S_JMP_JSET_X)
+
+/* Ancillary data */
+BPF_INST(BPF_S_ANC_PROTOCOL)
+BPF_INST(BPF_S_ANC_PKTTYPE)
+BPF_INST(BPF_S_ANC_IFINDEX)
+BPF_INST(BPF_S_ANC_NLATTR)
+BPF_INST(BPF_S_ANC_NLATTR_NEST)
+BPF_INST(BPF_S_ANC_MARK)
+BPF_INST(BPF_S_ANC_QUEUE)
+BPF_INST(BPF_S_ANC_HATYPE)
+BPF_INST(BPF_S_ANC_RXHASH)
+BPF_INST(BPF_S_ANC_CPU)
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..e0c9d2c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,245 +119,244 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 	u32 tmp;
 	int k;

-	/*
-	 * Process array of filter instructions.
-	 */
-	for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+	static void *jump_table[] = {
+		NULL,
+#define BPF_INST(inst) &&inst,
+#include <linux/filter_inst.h>
+#undef BPF_INST
+	};
+
 #define	K (fentry->k)
-#else
-		const u32 K = fentry->k;
-#endif
-
-		switch (fentry->code) {
-		case BPF_S_ALU_ADD_X:
-			A += X;
-			continue;
-		case BPF_S_ALU_ADD_K:
-			A += K;
-			continue;
-		case BPF_S_ALU_SUB_X:
-			A -= X;
-			continue;
-		case BPF_S_ALU_SUB_K:
-			A -= K;
-			continue;
-		case BPF_S_ALU_MUL_X:
-			A *= X;
-			continue;
-		case BPF_S_ALU_MUL_K:
-			A *= K;
-			continue;
-		case BPF_S_ALU_DIV_X:
-			if (X == 0)
-				return 0;
-			A /= X;
-			continue;
-		case BPF_S_ALU_DIV_K:
-			A = reciprocal_divide(A, K);
-			continue;
-		case BPF_S_ALU_AND_X:
-			A &= X;
-			continue;
-		case BPF_S_ALU_AND_K:
-			A &= K;
-			continue;
-		case BPF_S_ALU_OR_X:
-			A |= X;
-			continue;
-		case BPF_S_ALU_OR_K:
-			A |= K;
-			continue;
-		case BPF_S_ALU_LSH_X:
-			A <<= X;
-			continue;
-		case BPF_S_ALU_LSH_K:
-			A <<= K;
-			continue;
-		case BPF_S_ALU_RSH_X:
-			A >>= X;
-			continue;
-		case BPF_S_ALU_RSH_K:
-			A >>= K;
-			continue;
-		case BPF_S_ALU_NEG:
-			A = -A;
-			continue;
-		case BPF_S_JMP_JA:
-			fentry += K;
-			continue;
-		case BPF_S_JMP_JGT_K:
-			fentry += (A > K) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JGE_K:
-			fentry += (A >= K) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JEQ_K:
-			fentry += (A == K) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JSET_K:
-			fentry += (A & K) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JGT_X:
-			fentry += (A > X) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JGE_X:
-			fentry += (A >= X) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JEQ_X:
-			fentry += (A == X) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_JMP_JSET_X:
-			fentry += (A & X) ? fentry->jt : fentry->jf;
-			continue;
-		case BPF_S_LD_W_ABS:
-			k = K;
-load_w:
-			ptr = load_pointer(skb, k, 4, &tmp);
-			if (ptr != NULL) {
-				A = get_unaligned_be32(ptr);
-				continue;
-			}
+#define NEXT goto *jump_table[(++fentry)->code]
+
+	/* Dispatch the first instruction */
+	goto *jump_table[fentry->code];
+
+ BPF_S_ALU_ADD_X:
+	A += X;
+	NEXT;
+ BPF_S_ALU_ADD_K:
+	A += K;
+	NEXT;
+ BPF_S_ALU_SUB_X:
+	A -= X;
+	NEXT;
+ BPF_S_ALU_SUB_K:
+	A -= K;
+	NEXT;
+ BPF_S_ALU_MUL_X:
+	A *= X;
+	NEXT;
+ BPF_S_ALU_MUL_K:
+	A *= K;
+	NEXT;
+ BPF_S_ALU_DIV_X:
+	if (X == 0)
+		return 0;
+	A /= X;
+	NEXT;
+ BPF_S_ALU_DIV_K:
+	A = reciprocal_divide(A, K);
+	NEXT;
+ BPF_S_ALU_AND_X:
+	A &= X;
+	NEXT;
+ BPF_S_ALU_AND_K:
+	A &= K;
+	NEXT;
+ BPF_S_ALU_OR_X:
+	A |= X;
+	NEXT;
+ BPF_S_ALU_OR_K:
+	A |= K;
+	NEXT;
+ BPF_S_ALU_LSH_X:
+	A <<= X;
+	NEXT;
+ BPF_S_ALU_LSH_K:
+	A <<= K;
+	NEXT;
+ BPF_S_ALU_RSH_X:
+	A >>= X;
+	NEXT;
+ BPF_S_ALU_RSH_K:
+	A >>= K;
+	NEXT;
+ BPF_S_ALU_NEG:
+	A = -A;
+	NEXT;
+ BPF_S_JMP_JA:
+	fentry += K;
+	NEXT;
+ BPF_S_JMP_JGT_K:
+	fentry += (A > K) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JGE_K:
+	fentry += (A >= K) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JEQ_K:
+	fentry += (A == K) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JSET_K:
+	fentry += (A & K) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JGT_X:
+	fentry += (A > X) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JGE_X:
+	fentry += (A >= X) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JEQ_X:
+	fentry += (A == X) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_JMP_JSET_X:
+	fentry += (A & X) ? fentry->jt : fentry->jf;
+	NEXT;
+ BPF_S_LD_W_ABS:
+	k = K;
+ load_w:
+	ptr = load_pointer(skb, k, 4, &tmp);
+	if (ptr != NULL) {
+		A = get_unaligned_be32(ptr);
+		NEXT;
+	}
+	return 0;
+ BPF_S_LD_H_ABS:
+	k = K;
+ load_h:
+	ptr = load_pointer(skb, k, 2, &tmp);
+	if (ptr != NULL) {
+		A = get_unaligned_be16(ptr);
+		NEXT;
+	}
+	return 0;
+ BPF_S_LD_B_ABS:
+	k = K;
+ load_b:
+	ptr = load_pointer(skb, k, 1, &tmp);
+	if (ptr != NULL) {
+		A = *(u8 *)ptr;
+		NEXT;
+	}
+	return 0;
+ BPF_S_LD_W_LEN:
+	A = skb->len;
+	NEXT;
+ BPF_S_LDX_W_LEN:
+	X = skb->len;
+	NEXT;
+ BPF_S_LD_W_IND:
+	k = X + K;
+	goto load_w;
+ BPF_S_LD_H_IND:
+	k = X + K;
+	goto load_h;
+ BPF_S_LD_B_IND:
+	k = X + K;
+	goto load_b;
+ BPF_S_LDX_B_MSH:
+	ptr = load_pointer(skb, K, 1, &tmp);
+	if (ptr != NULL) {
+		X = (*(u8 *)ptr & 0xf) << 2;
+		NEXT;
+	}
+	return 0;
+ BPF_S_LD_IMM:
+	A = K;
+	NEXT;
+ BPF_S_LDX_IMM:
+	X = K;
+	NEXT;
+ BPF_S_LD_MEM:
+	A = mem[K];
+	NEXT;
+ BPF_S_LDX_MEM:
+	X = mem[K];
+	NEXT;
+ BPF_S_MISC_TAX:
+	X = A;
+	NEXT;
+ BPF_S_MISC_TXA:
+	A = X;
+	NEXT;
+ BPF_S_RET_K:
+	return K;
+ BPF_S_RET_A:
+	return A;
+ BPF_S_ST:
+	mem[K] = A;
+	NEXT;
+ BPF_S_STX:
+	mem[K] = X;
+	NEXT;
+ BPF_S_ANC_PROTOCOL:
+	A = ntohs(skb->protocol);
+	NEXT;
+ BPF_S_ANC_PKTTYPE:
+	A = skb->pkt_type;
+	NEXT;
+ BPF_S_ANC_IFINDEX:
+	if (!skb->dev)
+		return 0;
+	A = skb->dev->ifindex;
+	NEXT;
+ BPF_S_ANC_MARK:
+	A = skb->mark;
+	NEXT;
+ BPF_S_ANC_QUEUE:
+	A = skb->queue_mapping;
+	NEXT;
+ BPF_S_ANC_HATYPE:
+	if (!skb->dev)
+		return 0;
+	A = skb->dev->type;
+	NEXT;
+ BPF_S_ANC_RXHASH:
+	A = skb->rxhash;
+	NEXT;
+ BPF_S_ANC_CPU:
+	A = raw_smp_processor_id();
+	NEXT;
+ BPF_S_ANC_NLATTR:
+	{
+		struct nlattr *nla;
+
+		if (skb_is_nonlinear(skb))
 			return 0;
-		case BPF_S_LD_H_ABS:
-			k = K;
-load_h:
-			ptr = load_pointer(skb, k, 2, &tmp);
-			if (ptr != NULL) {
-				A = get_unaligned_be16(ptr);
-				continue;
-			}
+		if (A > skb->len - sizeof(struct nlattr))
 			return 0;
-		case BPF_S_LD_B_ABS:
-			k = K;
-load_b:
-			ptr = load_pointer(skb, k, 1, &tmp);
-			if (ptr != NULL) {
-				A = *(u8 *)ptr;
-				continue;
-			}
+
+		nla = nla_find((struct nlattr *)&skb->data[A],
+			       skb->len - A, X);
+		if (nla)
+			A = (void *)nla - (void *)skb->data;
+		else
+			A = 0;
+	}
+	NEXT;
+ BPF_S_ANC_NLATTR_NEST:
+	{
+		struct nlattr *nla;
+
+		if (skb_is_nonlinear(skb))
 			return 0;
-		case BPF_S_LD_W_LEN:
-			A = skb->len;
-			continue;
-		case BPF_S_LDX_W_LEN:
-			X = skb->len;
-			continue;
-		case BPF_S_LD_W_IND:
-			k = X + K;
-			goto load_w;
-		case BPF_S_LD_H_IND:
-			k = X + K;
-			goto load_h;
-		case BPF_S_LD_B_IND:
-			k = X + K;
-			goto load_b;
-		case BPF_S_LDX_B_MSH:
-			ptr = load_pointer(skb, K, 1, &tmp);
-			if (ptr != NULL) {
-				X = (*(u8 *)ptr & 0xf) << 2;
-				continue;
-			}
+		if (A > skb->len - sizeof(struct nlattr))
 			return 0;
-		case BPF_S_LD_IMM:
-			A = K;
-			continue;
-		case BPF_S_LDX_IMM:
-			X = K;
-			continue;
-		case BPF_S_LD_MEM:
-			A = mem[K];
-			continue;
-		case BPF_S_LDX_MEM:
-			X = mem[K];
-			continue;
-		case BPF_S_MISC_TAX:
-			X = A;
-			continue;
-		case BPF_S_MISC_TXA:
-			A = X;
-			continue;
-		case BPF_S_RET_K:
-			return K;
-		case BPF_S_RET_A:
-			return A;
-		case BPF_S_ST:
-			mem[K] = A;
-			continue;
-		case BPF_S_STX:
-			mem[K] = X;
-			continue;
-		case BPF_S_ANC_PROTOCOL:
-			A = ntohs(skb->protocol);
-			continue;
-		case BPF_S_ANC_PKTTYPE:
-			A = skb->pkt_type;
-			continue;
-		case BPF_S_ANC_IFINDEX:
-			if (!skb->dev)
-				return 0;
-			A = skb->dev->ifindex;
-			continue;
-		case BPF_S_ANC_MARK:
-			A = skb->mark;
-			continue;
-		case BPF_S_ANC_QUEUE:
-			A = skb->queue_mapping;
-			continue;
-		case BPF_S_ANC_HATYPE:
-			if (!skb->dev)
-				return 0;
-			A = skb->dev->type;
-			continue;
-		case BPF_S_ANC_RXHASH:
-			A = skb->rxhash;
-			continue;
-		case BPF_S_ANC_CPU:
-			A = raw_smp_processor_id();
-			continue;
-		case BPF_S_ANC_NLATTR: {
-			struct nlattr *nla;
-
-			if (skb_is_nonlinear(skb))
-				return 0;
-			if (A > skb->len - sizeof(struct nlattr))
-				return 0;
-
-			nla = nla_find((struct nlattr *)&skb->data[A],
-				       skb->len - A, X);
-			if (nla)
-				A = (void *)nla - (void *)skb->data;
-			else
-				A = 0;
-			continue;
-		}
-		case BPF_S_ANC_NLATTR_NEST: {
-			struct nlattr *nla;
-
-			if (skb_is_nonlinear(skb))
-				return 0;
-			if (A > skb->len - sizeof(struct nlattr))
-				return 0;
-
-			nla = (struct nlattr *)&skb->data[A];
-			if (nla->nla_len > A - skb->len)
-				return 0;
-
-			nla = nla_find_nested(nla, X);
-			if (nla)
-				A = (void *)nla - (void *)skb->data;
-			else
-				A = 0;
-			continue;
-		}
-		default:
-			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
-				       fentry->code, fentry->jt,
-				       fentry->jf, fentry->k);
+
+		nla = (struct nlattr *)&skb->data[A];
+		if (nla->nla_len > A - skb->len)
 			return 0;
-		}
+
+		nla = nla_find_nested(nla, X);
+		if (nla)
+			A = (void *)nla - (void *)skb->data;
+		else
+			A = 0;
 	}
+	NEXT;

+#undef K
+#undef NEXT
 	return 0;
 }
 EXPORT_SYMBOL(sk_run_filter);
-- 
1.7.3.1


* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-29  8:10 [PATCH] net: filter: Convert the BPF VM to threaded code Rui Ueyama
@ 2011-07-29  9:30 ` Eric Dumazet
  2011-07-30  5:09   ` Rui Ueyama
  2011-08-01 18:16 ` Hagen Paul Pfeifer
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-07-29  9:30 UTC (permalink / raw)
  To: Rui Ueyama; +Cc: netdev

On Friday 29 July 2011 at 01:10 -0700, Rui Ueyama wrote:
> Convert the BPF VM to threaded code to improve performance.
> 
> The BPF VM is basically a big for loop containing a switch statement.  That is
> slow because for each instruction it checks the for loop condition and does the
> conditional branch of the switch statement.
> 
> This patch eliminates the conditional branch, by replacing it with jump table
> using GCC's labels-as-values feature. The for loop condition check can also be
> removed, because the filter code always end with a RET instruction.
> 

Well...


> +#define NEXT goto *jump_table[(++fentry)->code]
> +
> +	/* Dispatch the first instruction */
> +	goto *jump_table[fentry->code];

This is the killer, as this cannot be predicted by the cpu.

Do you have benchmark results to provide?

We now have BPF JIT on x86_64 and powerpc, and possibly on MIPS and ARM
in the near future.





* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-29  9:30 ` Eric Dumazet
@ 2011-07-30  5:09   ` Rui Ueyama
  2011-07-30  6:04     ` Eric Dumazet
  2011-07-30  9:55     ` Francois Romieu
  0 siblings, 2 replies; 13+ messages in thread
From: Rui Ueyama @ 2011-07-30  5:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

The benchmark results look good. A simple benchmark that sends 10M UDP
packets to lo took 76.24 seconds on average on a Core 2 Duo L7500 @ 1.6GHz
when tcpdump is running. With this patch it took 75.41 seconds, which means
we save roughly 80ns per packet on that processor
((76.24s - 75.41s) / 10,000,000 packets ~= 83ns).

I think converting the VM to threaded code is low-hanging fruit, even if we
do get JIT compilers for popular architectures. Most of the lines in my
patch are indentation changes, so the actual change is not big.

Vanilla kernel:

(without tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real	0m57.909s
user	0m1.368s
sys	0m56.484s

ruiu@blue:~$ time ./udpflood 10000000
real	0m57.686s
user	0m1.360s
sys	0m56.288s

ruiu@blue:~$ time ./udpflood 10000000
real	0m58.457s
user	0m1.300s
sys	0m57.116s

(with tcpdump)
ruiu@blue:~$ time ./udpflood 10000000
real	1m16.025s
user	0m1.464s
sys	1m14.505s
ruiu@blue:~$ time ./udpflood 10000000

real	1m15.860s
user	0m1.232s
sys	1m14.573s
ruiu@blue:~$ time ./udpflood 10000000

real	1m16.861s
user	0m1.504s
sys	1m15.301s


Kernel with the patch:

(without tcpdump)
ruiu@blue:~$ time ./udpflood 10000000

real	0m59.272s
user	0m1.308s
sys	0m57.924s

ruiu@blue:~$ time ./udpflood 10000000

real	0m59.624s
user	0m1.336s
sys	0m58.244s

ruiu@blue:~$ time ./udpflood 10000000

real	0m59.340s
user	0m1.240s
sys	0m58.056s

(with tcpdump)
ruiu@blue:~$ time ./udpflood 10000000

real	1m15.392s
user	0m1.372s
sys	1m13.965s

ruiu@blue:~$ time ./udpflood 10000000

real	1m15.352s
user	0m1.452s
sys	1m13.845s

ruiu@blue:~$ time ./udpflood 10000000

real	1m15.508s
user	0m1.464s
sys	1m13.989s

The tcpdump command I used is: tcpdump -p -n -s -i lo net 192.168.2.0/24
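
The udpflood source is not included in the thread, so the following is only
a guess at what such a tool does: a minimal loop that blasts small UDP
datagrams at a loopback destination. The port, payload, and defaults here
are assumptions, not the actual benchmark code.

/* Hypothetical stand-in for the udpflood benchmark: send N small UDP
 * datagrams to the loopback interface as fast as possible. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long i, count = argc > 1 ? atol(argv[1]) : 1000000;
	char payload[32] = "udpflood";
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);		/* discard port, arbitrary choice */
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	for (i = 0; i < count; i++)
		sendto(fd, payload, sizeof(payload), 0,
		       (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return 0;
}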

On Fri, Jul 29, 2011 at 2:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Friday 29 July 2011 at 01:10 -0700, Rui Ueyama wrote:
>> Convert the BPF VM to threaded code to improve performance.
>>
>> The BPF VM is basically a big for loop containing a switch statement.  That is
>> slow because for each instruction it checks the for loop condition and does the
>> conditional branch of the switch statement.
>>
>> This patch eliminates the conditional branch, by replacing it with jump table
>> using GCC's labels-as-values feature. The for loop condition check can also be
>> removed, because the filter code always end with a RET instruction.
>>
>
> Well...
>
>
>> +#define NEXT goto *jump_table[(++fentry)->code]
>> +
>> +     /* Dispatch the first instruction */
>> +     goto *jump_table[fentry->code];
>
> This is the killer, as this cannot be predicted by the cpu.
>
> Do you have benchmark results to provide ?
>
> We now have BPF JIT on x86_64 and powerpc, and possibly on MIPS and ARM
> on a near future.
>
>
>
>


* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-30  5:09   ` Rui Ueyama
@ 2011-07-30  6:04     ` Eric Dumazet
  2011-07-30  6:20       ` Eric Dumazet
  2011-07-30  9:55     ` Francois Romieu
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-07-30  6:04 UTC (permalink / raw)
  To: Rui Ueyama; +Cc: netdev

On Friday 29 July 2011 at 22:09 -0700, Rui Ueyama wrote:
> The result of benchmark looks good. A simple benchmark that sends 10M UDP
> packets to lo took 76.24 seconds on average on Core 2 Duo L7500@1.6GHz.when
> tcpdump is running. With this patch it took 75.41 seconds, which means we save
> 80ns for each packet on that processor.
> 
> I think converting the VM to threaded code is low hanging fruit, even
> if we'd have
> JIT compilers for popular architectures. Most of the lines in my patch
> are indentation
> change, so the actual change is not big.
> 
...
> Tcpdump I used is this: tcpdump -p -n -s -i lo net 192.168.2.0/24
> 

Thanks for providing numbers. Was it a 32-bit or 64-bit kernel?

Have you done a test with a cold instruction cache?

Your patch adds 540 bytes of code, so it's a potential latency increase.

# size net/core/filter.o net/core/filter.o.old
   text	   data	    bss	    dec	    hex	filename
   4243	      0	      0	   4243	   1093	net/core/filter.o
   3703	     24	      0	   3727	    e8f	net/core/filter.o.old

Each 'NEXT' translates to:

 4db:	83 c3 08             	add    $0x8,%ebx
 4de:	0f b7 03             	movzwl (%ebx),%eax
 4e1:	8b 04 85 00 02 00 00 	mov    0x200(,%eax,4),%eax
 4e8:	ff e0                	jmp    *%eax


And this is on i386; expect more on CPUs with fixed-size 32-bit
instructions...

We can remove one branch per BPF instruction with the following patch:

diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..377f3ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,16 +119,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 	u32 tmp;
 	int k;
 
+	fentry--;
 	/*
 	 * Process array of filter instructions.
 	 */
-	for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+	for (;;) {
 #define	K (fentry->k)
-#else
-		const u32 K = fentry->k;
-#endif
 
+		fentry++;
 		switch (fentry->code) {
 		case BPF_S_ALU_ADD_X:
 			A += X;





* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-30  6:04     ` Eric Dumazet
@ 2011-07-30  6:20       ` Eric Dumazet
  0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2011-07-30  6:20 UTC (permalink / raw)
  To: Rui Ueyama; +Cc: netdev

On Saturday 30 July 2011 at 08:04 +0200, Eric Dumazet wrote:

> We can remove one branch per BPF instruction with following patch :
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 36f975f..377f3ca 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -119,16 +119,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
>  	u32 tmp;
>  	int k;
>  
> +	fentry--;
>  	/*
>  	 * Process array of filter instructions.
>  	 */
> -	for (;; fentry++) {
> -#if defined(CONFIG_X86_32)
> +	for (;;) {
>  #define	K (fentry->k)
> -#else
> -		const u32 K = fentry->k;
> -#endif
>  
> +		fentry++;
>  		switch (fentry->code) {
>  		case BPF_S_ALU_ADD_X:
>  			A += X;
> 
> 

BTW, I tried to add an unreachable() in the default: branch, and gcc
4.5.2 generates interesting code:

It still does the compare and test, but both branches end at the same
location:

 348:	83 c3 08             	add    $0x8,%ebx
 34b:	66 83 3b 37          	cmpw   $0x37,(%ebx)
 34f:	76 07                	jbe    358 <sk_run_filter+0x28>
 351:	8d b4 26 00 00 00 00 	lea    0x0(%esi,%eiz,1),%esi
 358:	0f b7 03             	movzwl (%ebx),%eax
 35b:	ff 24 85 34 01 00 00 	jmp    *0x134(,%eax,4)


On a 64-bit build with gcc 4.1.2, it even generates an infinite loop:

 730:	66 83 3b 37          	cmpw   $0x37,(%rbx)
 734:	76 02                	jbe    738 <sk_run_filter+0x28>
 736:	eb fe                	jmp    736 <sk_run_filter+0x26>
 738:	0f b7 03             	movzwl (%rbx),%eax
 73b:	ff 24 c5 00 00 00 00 	jmpq   *0x0(,%rax,8)


diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..89221e7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -122,13 +122,11 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 	/*
 	 * Process array of filter instructions.
 	 */
-	for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+	fentry--;
+	for (;;) {
 #define	K (fentry->k)
-#else
-		const u32 K = fentry->k;
-#endif
 
+		fentry++;
 		switch (fentry->code) {
 		case BPF_S_ALU_ADD_X:
 			A += X;
@@ -351,6 +349,7 @@ load_b:
 			continue;
 		}
 		default:
+			unreachable();
 			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
 				       fentry->code, fentry->jt,
 				       fentry->jf, fentry->k);




* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-30  5:09   ` Rui Ueyama
  2011-07-30  6:04     ` Eric Dumazet
@ 2011-07-30  9:55     ` Francois Romieu
  1 sibling, 0 replies; 13+ messages in thread
From: Francois Romieu @ 2011-07-30  9:55 UTC (permalink / raw)
  To: Rui Ueyama; +Cc: Eric Dumazet, netdev

Rui Ueyama <rui314@gmail.com> :
> The result of benchmark looks good.

So far the overall improvement is below 2% (a rough upper bound, without a
calculator).

And the benchmark results without tcpdump look strange.

> A simple benchmark that sends 10M UDP
> packets to lo took 76.24 seconds on average on Core 2 Duo L7500@1.6GHz.when
> tcpdump is running. With this patch it took 75.41 seconds, which means we save
> 80ns for each packet on that processor.

You forgot to analyze the figures without tcpdump.

> I think converting the VM to threaded code is low hanging fruit, even
> if we'd have JIT compilers for popular architectures. Most of the lines
> in my patch are indentation change, so the actual change is not big.

Without further data or a plan for future/possible improvements, it could
also be called a gratuitous rewrite.

-- 
Ueimor


* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-07-29  8:10 [PATCH] net: filter: Convert the BPF VM to threaded code Rui Ueyama
  2011-07-29  9:30 ` Eric Dumazet
@ 2011-08-01 18:16 ` Hagen Paul Pfeifer
  2011-08-01 18:37   ` Eric Dumazet
  1 sibling, 1 reply; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-01 18:16 UTC (permalink / raw)
  To: Rui Ueyama; +Cc: netdev

* Rui Ueyama | 2011-07-29 01:10:26 [-0700]:

>Convert the BPF VM to threaded code to improve performance.
>
>The BPF VM is basically a big for loop containing a switch statement.  That is
>slow because for each instruction it checks the for loop condition and does the
>conditional branch of the switch statement.
>
>This patch eliminates the conditional branch, by replacing it with jump table
>using GCC's labels-as-values feature. The for loop condition check can also be
>removed, because the filter code always end with a RET instruction.

With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is able to
generate a jump table; I double-checked this. Not sure what happened
in the meantime.

Hagen




* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-01 18:16 ` Hagen Paul Pfeifer
@ 2011-08-01 18:37   ` Eric Dumazet
  2011-08-02  0:57     ` David Miller
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-01 18:37 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: Rui Ueyama, netdev

On Monday 1 August 2011 at 20:16 +0200, Hagen Paul Pfeifer wrote:
> * Rui Ueyama | 2011-07-29 01:10:26 [-0700]:
> 
> >Convert the BPF VM to threaded code to improve performance.
> >
> >The BPF VM is basically a big for loop containing a switch statement.  That is
> >slow because for each instruction it checks the for loop condition and does the
> >conditional branch of the switch statement.
> >
> >This patch eliminates the conditional branch, by replacing it with jump table
> >using GCC's labels-as-values feature. The for loop condition check can also be
> >removed, because the filter code always end with a RET instruction.
> 
> With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is in the
> ability to generate a jump table, I double checked this. Not sure what happened
> in the meantime.
> 

A switch() always generates one conditional branch, catching values not
enumerated in the "case ..." clauses.
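
A small sketch of the point, with made-up cases rather than the BPF opcodes:
even when every listed value has a handler, the selector type can hold
values with no matching case, so a range check is emitted before any
jump-table index is used.

/* With a dense enough case list gcc can use a jump table, but it still
 * has to range-check 'code' first: a u16 can hold values with no
 * matching case, and those must fall through to the default path.
 * That check is the extra conditional branch discussed above. */
unsigned int dispatch(unsigned short code)
{
	switch (code) {
	case 0: return 10;
	case 1: return 20;
	case 2: return 30;
	}
	return 0;	/* reached for any code > 2 */
}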





* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-01 18:37   ` Eric Dumazet
@ 2011-08-02  0:57     ` David Miller
  2011-08-09  5:00       ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2011-08-02  0:57 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hagen, rui314, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 01 Aug 2011 20:37:46 +0200

> On Monday 1 August 2011 at 20:16 +0200, Hagen Paul Pfeifer wrote:
>> * Rui Ueyama | 2011-07-29 01:10:26 [-0700]:
>> 
>> >Convert the BPF VM to threaded code to improve performance.
>> >
>> >The BPF VM is basically a big for loop containing a switch statement.  That is
>> >slow because for each instruction it checks the for loop condition and does the
>> >conditional branch of the switch statement.
>> >
>> >This patch eliminates the conditional branch, by replacing it with jump table
>> >using GCC's labels-as-values feature. The for loop condition check can also be
>> >removed, because the filter code always end with a RET instruction.
>> 
>> With commit 01f2f3f6ef4d076c I reworked the BPF code so that gcc is in the
>> ability to generate a jump table, I double checked this. Not sure what happened
>> in the meantime.
>> 
> 
> A switch() always generates one conditional branch, catching values not
> enumerated in the "case ..." clauses.

Maybe it won't if we use an enum and make sure all enum values are handled
in the switch? :-)


* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-02  0:57     ` David Miller
@ 2011-08-09  5:00       ` Eric Dumazet
  2011-08-09  8:29         ` Hagen Paul Pfeifer
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-09  5:00 UTC (permalink / raw)
  To: David Miller; +Cc: hagen, rui314, netdev

On Monday 1 August 2011 at 17:57 -0700, David Miller wrote:

> Maybe it won't if we use an enum and make sure all enum values are handled
> in the switch? :-)

I tried this idea since it's already an enum and all enum values are
handled in the switch, but all gcc versions I used still generate the
useless compare and branch (always predicted by a modern CPU, so harmless
anyway?)

348:   83 c3 08                add    $0x8,%ebx
34b:   66 83 3b 37             cmpw   $0x37,(%ebx)
34f:   77 f7                   ja     348 <sk_run_filter+0x18>
351:   0f b7 03                movzwl (%ebx),%eax
354:   ff 24 85 34 01 00 00    jmp    *0x134(,%eax,4)


It would be nice to try the jump table idea and avoid the array
indirection, and instead get:

add	$0xc,%ebx
jmp	*(ebx)

(But this would need a larger kernel_sock_filter that stores the target
address instead of a u16 code.)
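
For illustration only, a direct-threaded layout along those lines might look
like the sketch below; the struct and field names are hypothetical, not the
kernel's sock_filter, and the sizes are back-of-the-envelope.

/* Hypothetical direct-threaded instruction: the u16 opcode is replaced
 * by a label address filled in when the filter is attached, so dispatch
 * needs no table lookup, only a load and an indirect jump:
 *
 *	#define NEXT goto *((++fentry)->target)
 *
 * The cost is growing each instruction from 8 bytes to 12 (32-bit) or
 * 16 (64-bit) bytes, i.e. a larger cache footprint per filter. */
struct bpf_threaded_insn {
	void	*target;	/* jump target for this opcode */
	__u8	jt, jf;
	__u32	k;
};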


diff --git a/include/linux/filter.h b/include/linux/filter.h
index 741956f..f2a1b37 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -171,7 +171,7 @@ static inline void bpf_jit_free(struct sk_filter *fp)
 #define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
 #endif
 
-enum {
+enum bpf_inst {
 	BPF_S_RET_K = 1,
 	BPF_S_RET_A,
 	BPF_S_ALU_ADD_K,
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..ea1f467 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -119,17 +119,15 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
 	u32 tmp;
 	int k;
 
+	fentry--;
 	/*
 	 * Process array of filter instructions.
 	 */
-	for (;; fentry++) {
-#if defined(CONFIG_X86_32)
+	for (;;) {
 #define	K (fentry->k)
-#else
-		const u32 K = fentry->k;
-#endif
 
-		switch (fentry->code) {
+		fentry++;
+		switch ((enum bpf_inst)fentry->code) {
 		case BPF_S_ALU_ADD_X:
 			A += X;
 			continue;
@@ -350,11 +348,6 @@ load_b:
 				A = 0;
 			continue;
 		}
-		default:
-			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
-				       fentry->code, fentry->jt,
-				       fentry->jf, fentry->k);
-			return 0;
 		}
 	}
 




* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-09  5:00       ` Eric Dumazet
@ 2011-08-09  8:29         ` Hagen Paul Pfeifer
  2011-08-09  8:36           ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-09  8:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, rui314, netdev


On Tue, 09 Aug 2011 07:00:27 +0200, Eric Dumazet wrote:

> I tried this idea since it's already an enum and all enum values are
> handled in the switch, but all gcc versions I used still generate the
> useless compare and branch (always predicted by a modern CPU, so harmless
> anyway?)

I don't think so, the list is rather long - the CPU may not have any
indication. But it is strange that your gcc generates a conditional
sequence of cmp and conditional jump instructions instead of a jump table!
Look at commit 01f2f3f6ef4d076c, this is what I got after I refactored to a
_dense_ enum list.

> (But this would need a larger kernel_sock_filter that stores the target
> address instead of a u16 code.)

Not sure if the benefit makes it worth trying. Can we avoid any cacheline
misses? I don't think so.

> +		switch ((enum bpf_inst)fentry->code) {

That should not differ! Eric, are you sure this does the trick?

Hagen


* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-09  8:29         ` Hagen Paul Pfeifer
@ 2011-08-09  8:36           ` Eric Dumazet
  2011-08-09  8:53             ` Hagen Paul Pfeifer
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2011-08-09  8:36 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: David Miller, rui314, netdev

On Tuesday 9 August 2011 at 10:29 +0200, Hagen Paul Pfeifer wrote:
> On Tue, 09 Aug 2011 07:00:27 +0200, Eric Dumazet wrote:
> 
> > I tried this idea since its already an enum and all enum values are
> > handled in the switch, but all gcc versions I used still generate the
> > useless compare and branch (always predicted by modern CPU, so harmless
> > anyway ?)
> 
> Don't think so, the list is rather long - the CPU may not have any
> indication. But strange that your gcc generate a conditional sequence of
> cmp and conditional jump instruction instead of a jump table! Look at
> commit 01f2f3f6ef4d076c, this is what I got after I refactored to an
> _dense_ enum list.

I am speaking of a single conditional branch, the one that is executed for
every BPF instruction in a filter. It's never taken, so it must be
predicted by the cpu. A perf session could make sure it is ;)

>  
> > (But this would need to use a larger kernel_sock_filter with not an u16
> > code, but the target address).
> 
> Not sure if the benefit is worth to try it. Can we avoid any cacheline
> misses? I don't think so.

We avoid some instructions (to load the address from the instruction
code), that's all, for each BPF instruction.

> 
> 
> > +		switch ((enum bpf_inst)fentry->code) {
> 
> That should not differ! Eric, sure that this do the trick?

It is the way to tell gcc to apply its optimizations, if they exist for a
switch () on an enum, as David suggested.

If I add a new BPF_S_ANC_NEW_INST value in the enum list, not handled in
the switch(), gcc complains correctly.

net/core/filter.c: In function ‘sk_run_filter’:
net/core/filter.c:130:3: warning: enumeration value ‘BPF_S_ANC_NEW_INST’
not handled in switch





* Re: [PATCH] net: filter: Convert the BPF VM to threaded code
  2011-08-09  8:36           ` Eric Dumazet
@ 2011-08-09  8:53             ` Hagen Paul Pfeifer
  0 siblings, 0 replies; 13+ messages in thread
From: Hagen Paul Pfeifer @ 2011-08-09  8:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, rui314, netdev


On Tue, 09 Aug 2011 10:36:28 +0200, Eric Dumazet wrote:

> I am speaking of a single conditional branch, the one that is executed for
> every BPF instruction in a filter. It's never taken, so it must be
> predicted by the cpu. A perf session could make sure it is ;)

Ahh, ok - perf is your friend, I know Eric ;-)

>> > +		switch ((enum bpf_inst)fentry->code) {
>>
>> That should not differ! Eric, are you sure this does the trick?
>
> It is the way to tell gcc to apply its optimizations, if they exist for a
> switch () on an enum, as David suggested.

Argl, I see sock_filter.code is declared as __u16, not an enum.

Hagen

