[PATCH v2 net-next 0/2] bpf: performance improvements

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2 net-next 0/2] bpf: performance improvements
@ 2015-09-16  6:05 Alexei Starovoitov
  2015-09-16  6:05 ` [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions Alexei Starovoitov
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2015-09-16  6:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: John Fastabend, Jamal Hadi Salim, Daniel Borkmann, netdev

v1->v2: dropped redundant iff_up check in patch 2

At plumbers we discussed different options on how to get rid of skb_clone
from bpf_clone_redirect(), the patch 2 implements the best option.
Patch 1 adds 'integrated exts' to cls_bpf to improve performance by
combining simple actions into bpf classifier.

Alexei Starovoitov (1):
  bpf: add bpf_redirect() helper

Daniel Borkmann (1):
  cls_bpf: introduce integrated actions

 include/net/sch_generic.h    |    3 ++-
 include/uapi/linux/bpf.h     |    9 +++++++
 include/uapi/linux/pkt_cls.h |    4 +++
 net/core/dev.c               |    8 ++++++
 net/core/filter.c            |   58 +++++++++++++++++++++++++++++++++++++++
 net/sched/act_bpf.c          |    1 +
 net/sched/cls_bpf.c          |   61 ++++++++++++++++++++++++++++++++++--------
 samples/bpf/bpf_helpers.h    |    4 +++
 samples/bpf/tcbpf1_kern.c    |   24 ++++++++++++++++-
 9 files changed, 159 insertions(+), 13 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-16  6:05 [PATCH v2 net-next 0/2] bpf: performance improvements Alexei Starovoitov
@ 2015-09-16  6:05 ` Alexei Starovoitov
  2015-09-17 12:37   ` Jamal Hadi Salim
  2015-09-16  6:05 ` [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper Alexei Starovoitov
  2015-09-18  4:10 ` [PATCH v2 net-next 0/2] bpf: performance improvements David Miller
  2 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2015-09-16  6:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: John Fastabend, Jamal Hadi Salim, Daniel Borkmann, netdev

From: Daniel Borkmann <daniel@iogearbox.net>

Often cls_bpf classifier is used with single action drop attached.
Optimize this use case and let cls_bpf return both classid and action.
For backwards compatibility reasons enable this feature under
TCA_BPF_FLAG_ACT_DIRECT flag.

Then more interesting programs like the following are easier to write:
int cls_bpf_prog(struct __sk_buff *skb)
{
  /* classify arp, ip, ipv6 into different traffic classes
   * and drop all other packets
   */
  switch (skb->protocol) {
  case htons(ETH_P_ARP):
    skb->tc_classid = 1;
    break;
  case htons(ETH_P_IP):
    skb->tc_classid = 2;
    break;
  case htons(ETH_P_IPV6):
    skb->tc_classid = 3;
    break;
  default:
    return TC_ACT_SHOT;
  }

  return TC_ACT_OK;
}

Joint work with Daniel Borkmann.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
v1->v2: no changes

 include/net/sch_generic.h    |    2 +-
 include/uapi/linux/bpf.h     |    1 +
 include/uapi/linux/pkt_cls.h |    3 +++
 net/core/filter.c            |   14 ++++++++++
 net/sched/cls_bpf.c          |   60 ++++++++++++++++++++++++++++++++++--------
 5 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 444faa89a55f..da61febb9091 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -251,7 +251,7 @@ struct tcf_proto {
 struct qdisc_skb_cb {
 	unsigned int		pkt_len;
 	u16			slave_dev_queue_mapping;
-	u16			_pad;
+	u16			tc_classid;
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 92a48e2d5461..2fbd1c71fa3b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -293,6 +293,7 @@ struct __sk_buff {
 	__u32 tc_index;
 	__u32 cb[5];
 	__u32 hash;
+	__u32 tc_classid;
 };
 
 struct bpf_tunnel_key {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 4f0d1bc3647d..0a262a83f9d4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -373,6 +373,8 @@ enum {
 
 /* BPF classifier */
 
+#define TCA_BPF_FLAG_ACT_DIRECT		(1 << 0)
+
 enum {
 	TCA_BPF_UNSPEC,
 	TCA_BPF_ACT,
@@ -382,6 +384,7 @@ enum {
 	TCA_BPF_OPS,
 	TCA_BPF_FD,
 	TCA_BPF_NAME,
+	TCA_BPF_FLAGS,
 	__TCA_BPF_MAX,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 13079f03902e..971d6ba89758 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1632,6 +1632,9 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type)
 {
+	if (off == offsetof(struct __sk_buff, tc_classid))
+		return false;
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, cb[0]) ...
@@ -1648,6 +1651,9 @@ static bool sk_filter_is_valid_access(int off, int size,
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type)
 {
+	if (off == offsetof(struct __sk_buff, tc_classid))
+		return type == BPF_WRITE ? true : false;
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, mark):
@@ -1760,6 +1766,14 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
 		break;
 
+	case offsetof(struct __sk_buff, tc_classid):
+		ctx_off -= offsetof(struct __sk_buff, tc_classid);
+		ctx_off += offsetof(struct sk_buff, cb);
+		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+		WARN_ON(type != BPF_WRITE);
+		*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+		break;
+
 	case offsetof(struct __sk_buff, tc_index):
 #ifdef CONFIG_NET_SCHED
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index e5168f8b9640..77b0ef148256 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -38,6 +38,7 @@ struct cls_bpf_prog {
 	struct bpf_prog *filter;
 	struct list_head link;
 	struct tcf_result res;
+	bool exts_integrated;
 	struct tcf_exts exts;
 	u32 handle;
 	union {
@@ -52,6 +53,7 @@ struct cls_bpf_prog {
 
 static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
 	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
+	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
 	[TCA_BPF_FD]		= { .type = NLA_U32 },
 	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
 	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
@@ -59,6 +61,22 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
 				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
 };
 
+static int cls_bpf_exec_opcode(int code)
+{
+	switch (code) {
+	case TC_ACT_OK:
+	case TC_ACT_RECLASSIFY:
+	case TC_ACT_SHOT:
+	case TC_ACT_PIPE:
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_UNSPEC:
+		return code;
+	default:
+		return TC_ACT_UNSPEC;
+	}
+}
+
 static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			    struct tcf_result *res)
 {
@@ -79,6 +97,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	list_for_each_entry_rcu(prog, &head->plist, link) {
 		int filter_res;
 
+		qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
+
 		if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
@@ -88,6 +108,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
+		if (prog->exts_integrated) {
+			res->class = prog->res.class;
+			res->classid = qdisc_skb_cb(skb)->tc_classid;
+
+			ret = cls_bpf_exec_opcode(filter_res);
+			if (ret == TC_ACT_UNSPEC)
+				continue;
+			break;
+		}
+
 		if (filter_res == 0)
 			continue;
 
@@ -195,8 +225,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
 	return ret;
 }
 
-static int cls_bpf_prog_from_ops(struct nlattr **tb,
-				 struct cls_bpf_prog *prog, u32 classid)
+static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
 {
 	struct sock_filter *bpf_ops;
 	struct sock_fprog_kern fprog_tmp;
@@ -230,15 +259,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb,
 	prog->bpf_ops = bpf_ops;
 	prog->bpf_num_ops = bpf_num_ops;
 	prog->bpf_name = NULL;
-
 	prog->filter = fp;
-	prog->res.classid = classid;
 
 	return 0;
 }
 
-static int cls_bpf_prog_from_efd(struct nlattr **tb,
-				 struct cls_bpf_prog *prog, u32 classid)
+static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
+				 const struct tcf_proto *tp)
 {
 	struct bpf_prog *fp;
 	char *name = NULL;
@@ -268,9 +295,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb,
 	prog->bpf_ops = NULL;
 	prog->bpf_fd = bpf_fd;
 	prog->bpf_name = name;
-
 	prog->filter = fp;
-	prog->res.classid = classid;
 
 	return 0;
 }
@@ -280,8 +305,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 				   unsigned long base, struct nlattr **tb,
 				   struct nlattr *est, bool ovr)
 {
+	bool is_bpf, is_ebpf, have_exts = false;
 	struct tcf_exts exts;
-	bool is_bpf, is_ebpf;
 	u32 classid;
 	int ret;
 
@@ -298,9 +323,22 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 		return ret;
 
 	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+	if (tb[TCA_BPF_FLAGS]) {
+		u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
+
+		if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
+			tcf_exts_destroy(&exts);
+			return -EINVAL;
+		}
+
+		have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
+	}
+
+	prog->res.classid = classid;
+	prog->exts_integrated = have_exts;
 
-	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) :
-		       cls_bpf_prog_from_efd(tb, prog, classid);
+	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
+		       cls_bpf_prog_from_efd(tb, prog, tp);
 	if (ret < 0) {
 		tcf_exts_destroy(&exts);
 		return ret;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-16  6:05 ` [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions Alexei Starovoitov
@ 2015-09-17 12:37   ` Jamal Hadi Salim
  2015-09-17 13:13     ` Daniel Borkmann
  0 siblings, 1 reply; 11+ messages in thread
From: Jamal Hadi Salim @ 2015-09-17 12:37 UTC (permalink / raw)
  To: Alexei Starovoitov, David S. Miller
  Cc: John Fastabend, Daniel Borkmann, netdev

On 09/16/15 02:05, Alexei Starovoitov wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
>
> Often cls_bpf classifier is used with single action drop attached.
> Optimize this use case and let cls_bpf return both classid and action.
> For backwards compatibility reasons enable this feature under
> TCA_BPF_FLAG_ACT_DIRECT flag.
>

This is going off in a different direction really.
You are replicating the infrastructure inside bpf.

cheers,
jamal

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-17 12:37   ` Jamal Hadi Salim
@ 2015-09-17 13:13     ` Daniel Borkmann
  2015-09-17 15:19       ` Alexei Starovoitov
  2015-09-18 12:04       ` Jamal Hadi Salim
  0 siblings, 2 replies; 11+ messages in thread
From: Daniel Borkmann @ 2015-09-17 13:13 UTC (permalink / raw)
  To: Jamal Hadi Salim, Alexei Starovoitov, David S. Miller
  Cc: John Fastabend, netdev

Hi Jamal,

On 09/17/2015 02:37 PM, Jamal Hadi Salim wrote:
> On 09/16/15 02:05, Alexei Starovoitov wrote:
>> From: Daniel Borkmann <daniel@iogearbox.net>
>>
>> Often cls_bpf classifier is used with single action drop attached.
>> Optimize this use case and let cls_bpf return both classid and action.
>> For backwards compatibility reasons enable this feature under
>> TCA_BPF_FLAG_ACT_DIRECT flag.
>>
>
> This is going off in a different direction really.
> You are replicating the infrastructure inside bpf.

Hmm, I don't really agree. With cls_bpf you have non-linear
classifications as opposed to walking a chain of classifiers:
worst case, I have to walk through N classifiers just to find
out that the last one matches that I need to drop - this doesn't
scale at all. Given that we can make this decision right here,
we can use this fact and have simple return codes provided as
well. It only supplements non-linear classification that was
from the very beginning of cls_bpf a core part of it.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-17 13:13     ` Daniel Borkmann
@ 2015-09-17 15:19       ` Alexei Starovoitov
  2015-09-18 12:13         ` Jamal Hadi Salim
  2015-09-18 12:04       ` Jamal Hadi Salim
  1 sibling, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2015-09-17 15:19 UTC (permalink / raw)
  To: Daniel Borkmann, Jamal Hadi Salim, David S. Miller; +Cc: John Fastabend, netdev

On 9/17/15 6:13 AM, Daniel Borkmann wrote:
> Hi Jamal,
>
> On 09/17/2015 02:37 PM, Jamal Hadi Salim wrote:
>> On 09/16/15 02:05, Alexei Starovoitov wrote:
>>> From: Daniel Borkmann <daniel@iogearbox.net>
>>>
>>> Often cls_bpf classifier is used with single action drop attached.
>>> Optimize this use case and let cls_bpf return both classid and action.
>>> For backwards compatibility reasons enable this feature under
>>> TCA_BPF_FLAG_ACT_DIRECT flag.
>>>
>>
>> This is going off in a different direction really.
>> You are replicating the infrastructure inside bpf.
>
> Hmm, I don't really agree. With cls_bpf you have non-linear
> classifications as opposed to walking a chain of classifiers:
> worst case, I have to walk through N classifiers just to find
> out that the last one matches that I need to drop - this doesn't
> scale at all. Given that we can make this decision right here,
> we can use this fact and have simple return codes provided as
> well. It only supplements non-linear classification that was
> from the very beginning of cls_bpf a core part of it.

I don't see the replication either. May be the commit log was
misread as bpf program now executes the actions and bypasses
tcf_exts_exec() ? Well, that may be interesting idea for
the future, but that's not what the patch is doing.
With this patch cls_bpf can return single integer like
TC_ACT_SHOT/TC_ACT_OK that gact/act_bpf can already do as
an _optimization_ to avoid extra hops. To do full-fledged
action chaining the tcf_exts_exec() is used.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-17 15:19       ` Alexei Starovoitov
@ 2015-09-18 12:13         ` Jamal Hadi Salim
  2015-09-18 12:51           ` Daniel Borkmann
  0 siblings, 1 reply; 11+ messages in thread
From: Jamal Hadi Salim @ 2015-09-18 12:13 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, David S. Miller
  Cc: John Fastabend, netdev

On 09/17/15 11:19, Alexei Starovoitov wrote:

> misread as bpf program now executes the actions and bypasses
> tcf_exts_exec() ? Well, that may be interesting idea for
> the future,

And above is precisely why i raised the concern. You are already
bypassing tcf_exts_exec with that patch. It is a big jump.
It is kind of hard to continue the discussion because i notice
Dave just took in the patches.

Please dont go the above path of fully fledged bypass.
The architecture is about small tools that come together to
provide complex processing. ebpf may be the best classifier
today - but by no means the only one or guaranteed that nothing
better will exist for speficic use cases.
If there is something in the core that needs improvement
for the benefit of all, then lets do that. Example i find
the classid metadata interesting but flinching at ACT_REDIRECT.

cheers,
jamal

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-18 12:13         ` Jamal Hadi Salim
@ 2015-09-18 12:51           ` Daniel Borkmann
  0 siblings, 0 replies; 11+ messages in thread
From: Daniel Borkmann @ 2015-09-18 12:51 UTC (permalink / raw)
  To: Jamal Hadi Salim, Alexei Starovoitov, David S. Miller
  Cc: John Fastabend, netdev

Hi Jamal,

On 09/18/2015 02:13 PM, Jamal Hadi Salim wrote:
...
> The architecture is about small tools that come together to
> provide complex processing. ebpf may be the best classifier
> today - but by no means the only one or guaranteed that nothing
> better will exist for speficic use cases.

Yes, I agree, I think nobody claimed that, and it also doesn't
limit anyone from their choices of configuration possibilities
one can do nowadays.

> If there is something in the core that needs improvement
> for the benefit of all, then lets do that.

Yeah, I think i.e. since the last couple of months that is already
happening from various people.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions
  2015-09-17 13:13     ` Daniel Borkmann
  2015-09-17 15:19       ` Alexei Starovoitov
@ 2015-09-18 12:04       ` Jamal Hadi Salim
  1 sibling, 0 replies; 11+ messages in thread
From: Jamal Hadi Salim @ 2015-09-18 12:04 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, David S. Miller
  Cc: John Fastabend, netdev

Hi Daniel,

On 09/17/15 09:13, Daniel Borkmann wrote:

> Hmm, I don't really agree. With cls_bpf you have non-linear
> classifications as opposed to walking a chain of classifiers:

A chain of classifiers is a better description today (non-linear would
be an appropriate description before cls_bpf ;->).

> worst case, I have to walk through N classifiers just to find
> out that the last one matches that I need to drop - this doesn't
> scale at all.

The scaling reason with that posted example is not
a strong one. You can get good performance with any classifier
for that policy description.
F.E with Alexei's second best classifier:->:

tc filter add dev eth0 parent ffff: protocol arp prio 1 u32\
match all ..
tc filter add dev eth0 parent ffff: protocol ip prio 1 u32\
...

But I do get the gist of your arguement otherwise and some
short circuits are ok as you had earlier.

>Given that we can make this decision right here,
> we can use this fact and have simple return codes provided as
> well.

I think it makes sense for the simple case.
But you have every other opcode in there, not just basic
accept/drop. I am worried this is leading towards an
enclave of bpf do-everything.

cheers,
jamal

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper
  2015-09-16  6:05 [PATCH v2 net-next 0/2] bpf: performance improvements Alexei Starovoitov
  2015-09-16  6:05 ` [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions Alexei Starovoitov
@ 2015-09-16  6:05 ` Alexei Starovoitov
  2015-09-16  6:45   ` John Fastabend
  2015-09-18  4:10 ` [PATCH v2 net-next 0/2] bpf: performance improvements David Miller
  2 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2015-09-16  6:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: John Fastabend, Jamal Hadi Salim, Daniel Borkmann, netdev

Existing bpf_clone_redirect() helper clones skb before redirecting
it to RX or TX of destination netdev.
Introduce bpf_redirect() helper that does that without cloning.

Benchmarked with two hosts using 10G ixgbe NICs.
One host is doing line rate pktgen.
Another host is configured as:
$ tc qdisc add dev $dev ingress
$ tc filter add dev $dev root pref 10 u32 match u32 0 0 flowid 1:2 \
   action bpf run object-file tcbpf1_kern.o section clone_redirect_xmit drop
so it receives the packet on $dev and immediately xmits it on $dev + 1
The section 'clone_redirect_xmit' in tcbpf1_kern.o file has the program
that does bpf_clone_redirect() and performance is 2.0 Mpps

$ tc filter add dev $dev root pref 10 u32 match u32 0 0 flowid 1:2 \
   action bpf run object-file tcbpf1_kern.o section redirect_xmit drop
which is using bpf_redirect() - 2.4 Mpps

and using cls_bpf with integrated actions as:
$ tc filter add dev $dev root pref 10 \
  bpf run object-file tcbpf1_kern.o section redirect_xmit integ_act classid 1
performance is 2.5 Mpps

To summarize:
u32+act_bpf using clone_redirect - 2.0 Mpps
u32+act_bpf using redirect - 2.4 Mpps
cls_bpf using redirect - 2.5 Mpps

For comparison linux bridge in this setup is doing 2.1 Mpps
and ixgbe rx + drop in ip_rcv - 7.8 Mpps

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
This approach is using per_cpu scratch area to store ifindex and flags.
The other alternatives discussed at plumbers are slower and more intrusive.
v1->v2: dropped redundant iff_up check

 include/net/sch_generic.h    |    1 +
 include/uapi/linux/bpf.h     |    8 ++++++++
 include/uapi/linux/pkt_cls.h |    1 +
 net/core/dev.c               |    8 ++++++++
 net/core/filter.c            |   44 ++++++++++++++++++++++++++++++++++++++++++
 net/sched/act_bpf.c          |    1 +
 net/sched/cls_bpf.c          |    1 +
 samples/bpf/bpf_helpers.h    |    4 ++++
 samples/bpf/tcbpf1_kern.c    |   24 ++++++++++++++++++++++-
 9 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index da61febb9091..4c79ce8c1f92 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -402,6 +402,7 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 			       const struct qdisc_size_table *stab);
 bool tcf_destroy(struct tcf_proto *tp, bool force);
 void tcf_destroy_chain(struct tcf_proto __rcu **fl);
+int skb_do_redirect(struct sk_buff *);
 
 /* Reset all TX qdiscs greater then index of a device.  */
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2fbd1c71fa3b..4ec0b5488294 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -272,6 +272,14 @@ enum bpf_func_id {
 	BPF_FUNC_skb_get_tunnel_key,
 	BPF_FUNC_skb_set_tunnel_key,
 	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
+	/**
+	 * bpf_redirect(ifindex, flags) - redirect to another netdev
+	 * @ifindex: ifindex of the net device
+	 * @flags: bit 0 - if set, redirect to ingress instead of egress
+	 *         other bits - reserved
+	 * Return: TC_ACT_REDIRECT
+	 */
+	BPF_FUNC_redirect,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 0a262a83f9d4..439873775d49 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -87,6 +87,7 @@ enum {
 #define TC_ACT_STOLEN		4
 #define TC_ACT_QUEUED		5
 #define TC_ACT_REPEAT		6
+#define TC_ACT_REDIRECT		7
 #define TC_ACT_JUMP		0x10000000
 
 /* Action type identifiers*/
diff --git a/net/core/dev.c b/net/core/dev.c
index 877c84834d81..d6a492e57874 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3668,6 +3668,14 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 	case TC_ACT_QUEUED:
 		kfree_skb(skb);
 		return NULL;
+	case TC_ACT_REDIRECT:
+		/* skb_mac_header check was done by cls/act_bpf, so
+		 * we can safely push the L2 header back before
+		 * redirecting to another netdev
+		 */
+		__skb_push(skb, skb->mac_len);
+		skb_do_redirect(skb);
+		return NULL;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 971d6ba89758..da3f3d94d6e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1427,6 +1427,48 @@ const struct bpf_func_proto bpf_clone_redirect_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
+struct redirect_info {
+	u32 ifindex;
+	u32 flags;
+};
+
+static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+	ri->ifindex = ifindex;
+	ri->flags = flags;
+	return TC_ACT_REDIRECT;
+}
+
+int skb_do_redirect(struct sk_buff *skb)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct net_device *dev;
+
+	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
+	ri->ifindex = 0;
+	if (unlikely(!dev)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	if (BPF_IS_REDIRECT_INGRESS(ri->flags))
+		return dev_forward_skb(dev, skb);
+
+	skb->dev = dev;
+	return dev_queue_xmit(skb);
+}
+
+const struct bpf_func_proto bpf_redirect_proto = {
+	.func           = bpf_redirect,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_ANYTHING,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	return task_get_classid((struct sk_buff *) (unsigned long) r1);
@@ -1607,6 +1649,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
 		return bpf_get_skb_set_tunnel_key_proto();
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 559bfa011bda..0bc6f912f870 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -72,6 +72,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	case TC_ACT_PIPE:
 	case TC_ACT_RECLASSIFY:
 	case TC_ACT_OK:
+	case TC_ACT_REDIRECT:
 		action = filter_res;
 		break;
 	case TC_ACT_SHOT:
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 77b0ef148256..0590816ab7b0 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -70,6 +70,7 @@ static int cls_bpf_exec_opcode(int code)
 	case TC_ACT_PIPE:
 	case TC_ACT_STOLEN:
 	case TC_ACT_QUEUED:
+	case TC_ACT_REDIRECT:
 	case TC_ACT_UNSPEC:
 		return code;
 	default:
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 3a44d3a272af..21aa1b44c30c 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -33,6 +33,10 @@ static int (*bpf_get_current_comm)(void *buf, int buf_size) =
 	(void *) BPF_FUNC_get_current_comm;
 static int (*bpf_perf_event_read)(void *map, int index) =
 	(void *) BPF_FUNC_perf_event_read;
+static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) =
+	(void *) BPF_FUNC_clone_redirect;
+static int (*bpf_redirect)(int ifindex, int flags) =
+	(void *) BPF_FUNC_redirect;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c
index 9bfb2eb34563..fa051b3d53ee 100644
--- a/samples/bpf/tcbpf1_kern.c
+++ b/samples/bpf/tcbpf1_kern.c
@@ -5,7 +5,7 @@
 #include <uapi/linux/in.h>
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/filter.h>
-
+#include <uapi/linux/pkt_cls.h>
 #include "bpf_helpers.h"
 
 /* compiler workaround */
@@ -64,4 +64,26 @@ int bpf_prog1(struct __sk_buff *skb)
 
 	return 0;
 }
+SEC("redirect_xmit")
+int _redirect_xmit(struct __sk_buff *skb)
+{
+	return bpf_redirect(skb->ifindex + 1, 0);
+}
+SEC("redirect_recv")
+int _redirect_recv(struct __sk_buff *skb)
+{
+	return bpf_redirect(skb->ifindex + 1, 1);
+}
+SEC("clone_redirect_xmit")
+int _clone_redirect_xmit(struct __sk_buff *skb)
+{
+	bpf_clone_redirect(skb, skb->ifindex + 1, 0);
+	return TC_ACT_SHOT;
+}
+SEC("clone_redirect_recv")
+int _clone_redirect_recv(struct __sk_buff *skb)
+{
+	bpf_clone_redirect(skb, skb->ifindex + 1, 1);
+	return TC_ACT_SHOT;
+}
 char _license[] SEC("license") = "GPL";
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper
  2015-09-16  6:05 ` [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper Alexei Starovoitov
@ 2015-09-16  6:45   ` John Fastabend
  0 siblings, 0 replies; 11+ messages in thread
From: John Fastabend @ 2015-09-16  6:45 UTC (permalink / raw)
  To: Alexei Starovoitov, David S. Miller
  Cc: Jamal Hadi Salim, Daniel Borkmann, netdev

On 15-09-15 11:05 PM, Alexei Starovoitov wrote:
> Existing bpf_clone_redirect() helper clones skb before redirecting
> it to RX or TX of destination netdev.
> Introduce bpf_redirect() helper that does that without cloning.
> 
> Benchmarked with two hosts using 10G ixgbe NICs.
> One host is doing line rate pktgen.
> Another host is configured as:
> $ tc qdisc add dev $dev ingress
> $ tc filter add dev $dev root pref 10 u32 match u32 0 0 flowid 1:2 \
>    action bpf run object-file tcbpf1_kern.o section clone_redirect_xmit drop
> so it receives the packet on $dev and immediately xmits it on $dev + 1
> The section 'clone_redirect_xmit' in tcbpf1_kern.o file has the program
> that does bpf_clone_redirect() and performance is 2.0 Mpps
> 
> $ tc filter add dev $dev root pref 10 u32 match u32 0 0 flowid 1:2 \
>    action bpf run object-file tcbpf1_kern.o section redirect_xmit drop
> which is using bpf_redirect() - 2.4 Mpps
> 
> and using cls_bpf with integrated actions as:
> $ tc filter add dev $dev root pref 10 \
>   bpf run object-file tcbpf1_kern.o section redirect_xmit integ_act classid 1
> performance is 2.5 Mpps
> 
> To summarize:
> u32+act_bpf using clone_redirect - 2.0 Mpps
> u32+act_bpf using redirect - 2.4 Mpps
> cls_bpf using redirect - 2.5 Mpps
> 
> For comparison linux bridge in this setup is doing 2.1 Mpps
> and ixgbe rx + drop in ip_rcv - 7.8 Mpps
> 
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
> This approach is using per_cpu scratch area to store ifindex and flags.
> The other alternatives discussed at plumbers are slower and more intrusive.
> v1->v2: dropped redundant iff_up check
> 
>  include/net/sch_generic.h    |    1 +
>  include/uapi/linux/bpf.h     |    8 ++++++++
>  include/uapi/linux/pkt_cls.h |    1 +
>  net/core/dev.c               |    8 ++++++++
>  net/core/filter.c            |   44 ++++++++++++++++++++++++++++++++++++++++++
>  net/sched/act_bpf.c          |    1 +
>  net/sched/cls_bpf.c          |    1 +
>  samples/bpf/bpf_helpers.h    |    4 ++++
>  samples/bpf/tcbpf1_kern.c    |   24 ++++++++++++++++++++++-
>  9 files changed, 91 insertions(+), 1 deletion(-)
> 

Acked-by: John Fastabend <john.r.fastabend@intel.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next 0/2] bpf: performance improvements
  2015-09-16  6:05 [PATCH v2 net-next 0/2] bpf: performance improvements Alexei Starovoitov
  2015-09-16  6:05 ` [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions Alexei Starovoitov
  2015-09-16  6:05 ` [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper Alexei Starovoitov
@ 2015-09-18  4:10 ` David Miller
  2 siblings, 0 replies; 11+ messages in thread
From: David Miller @ 2015-09-18  4:10 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, jhs, daniel, netdev

From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 15 Sep 2015 23:05:41 -0700

> v1->v2: dropped redundant iff_up check in patch 2
> 
> At plumbers we discussed different options on how to get rid of skb_clone
> from bpf_clone_redirect(), the patch 2 implements the best option.
> Patch 1 adds 'integrated exts' to cls_bpf to improve performance by
> combining simple actions into bpf classifier.

Series applied, thanks.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2015-09-18 12:51 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-09-16  6:05 [PATCH v2 net-next 0/2] bpf: performance improvements Alexei Starovoitov
2015-09-16  6:05 ` [PATCH v2 net-next 1/2] cls_bpf: introduce integrated actions Alexei Starovoitov
2015-09-17 12:37   ` Jamal Hadi Salim
2015-09-17 13:13     ` Daniel Borkmann
2015-09-17 15:19       ` Alexei Starovoitov
2015-09-18 12:13         ` Jamal Hadi Salim
2015-09-18 12:51           ` Daniel Borkmann
2015-09-18 12:04       ` Jamal Hadi Salim
2015-09-16  6:05 ` [PATCH v2 net-next 2/2] bpf: add bpf_redirect() helper Alexei Starovoitov
2015-09-16  6:45   ` John Fastabend
2015-09-18  4:10 ` [PATCH v2 net-next 0/2] bpf: performance improvements David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).