* [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes
@ 2015-08-14 15:50 Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 1/4] packet: add BPF fanout mode Willem de Bruijn
` (3 more replies)
0 siblings, 4 replies; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 15:50 UTC (permalink / raw)
To: netdev; +Cc: davem, edumazet, dborkman, ast, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Allow programmable fanout modes. Support both classical BPF programs
passed directly and eBPF programs passed by file descriptor.
One use case is packet steering by deep packet inspection, for
instance steering by application layer header fields.
Separate the configuration of the fanout mode and the configuration
of the program, to allow dynamic updates to the latter at runtime.
Willem de Bruijn (4):
packet: add BPF fanout mode
packet: add eBPF fanout mode
selftests/net: test bpf fanout mode
selftests/net: test eBPF fanout mode
include/uapi/linux/if_packet.h | 3 +
net/packet/af_packet.c | 122 ++++++++++++++++++++++++++++-
net/packet/internal.h | 5 +-
tools/testing/selftests/net/psock_fanout.c | 69 +++++++++++++++-
tools/testing/selftests/net/psock_lib.h | 29 +++++--
5 files changed, 214 insertions(+), 14 deletions(-)
--
2.5.0.276.gf5e568e
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH net-next 1/4] packet: add BPF fanout mode
2015-08-14 15:50 [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes Willem de Bruijn
@ 2015-08-14 15:50 ` Willem de Bruijn
2015-08-17 14:29 ` Eric Dumazet
2015-08-14 15:50 ` [PATCH net-next 2/4] packet: add eBPF " Willem de Bruijn
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 15:50 UTC (permalink / raw)
To: netdev; +Cc: davem, edumazet, dborkman, ast, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Add a fanout mode that accepts a BPF program to select a socket.
This avoids having to keep adding special case fanout modes. One
example use case is application layer load balancing. The QUIC
protocol, for instance, encodes a connection ID in UDP payload.
Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
associated with the socket group. Fanout mode PACKET_FANOUT_BPF is the
only user so far.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/uapi/linux/if_packet.h | 2 +
net/packet/af_packet.c | 97 +++++++++++++++++++++++++++++++++++++++++-
net/packet/internal.h | 5 ++-
3 files changed, 102 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index d3d715f8c..d41280a 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -55,6 +55,7 @@ struct sockaddr_ll {
#define PACKET_TX_HAS_OFF 19
#define PACKET_QDISC_BYPASS 20
#define PACKET_ROLLOVER_STATS 21
+#define PACKET_FANOUT_DATA 22
#define PACKET_FANOUT_HASH 0
#define PACKET_FANOUT_LB 1
@@ -62,6 +63,7 @@ struct sockaddr_ll {
#define PACKET_FANOUT_ROLLOVER 3
#define PACKET_FANOUT_RND 4
#define PACKET_FANOUT_QM 5
+#define PACKET_FANOUT_BPF 6
#define PACKET_FANOUT_FLAG_ROLLOVER 0x1000
#define PACKET_FANOUT_FLAG_DEFRAG 0x8000
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b5afe53..80d68c9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -92,6 +92,7 @@
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
+#include <linux/bpf.h>
#include "internal.h"
@@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f,
return skb_get_queue_mapping(skb) % num;
}
+static unsigned int fanout_demux_bpf(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ struct bpf_prog *prog;
+ unsigned int ret = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(f->bpf_prog);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, skb) % num;
+ rcu_read_unlock();
+
+ return ret;
+}
+
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
return f->flags & (flag >> 8);
@@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
case PACKET_FANOUT_ROLLOVER:
idx = fanout_demux_rollover(f, skb, 0, false, num);
break;
+ case PACKET_FANOUT_BPF:
+ idx = fanout_demux_bpf(f, skb, num);
+ break;
}
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
@@ -1502,6 +1522,72 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
return false;
}
+static void fanout_init_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_LB:
+ atomic_set(&f->rr_cur, 0);
+ break;
+ case PACKET_FANOUT_BPF:
+ RCU_INIT_POINTER(f->bpf_prog, NULL);
+ break;
+ }
+}
+
+static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
+{
+ struct bpf_prog *old;
+
+ spin_lock(&f->lock);
+ old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
+ rcu_assign_pointer(f->bpf_prog, new);
+ spin_unlock(&f->lock);
+
+ if (old) {
+ synchronize_net();
+ bpf_prog_destroy(old);
+ }
+}
+
+static int fanout_set_data_bpf(struct packet_fanout *f, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ struct sock_fprog fprog;
+ int ret;
+
+ if (len != sizeof(fprog))
+ return -EINVAL;
+ if (copy_from_user(&fprog, data, len))
+ return -EFAULT;
+
+ ret = bpf_prog_create_from_user(&new, &fprog, NULL);
+ if (ret)
+ return ret;
+
+ __fanout_set_data_bpf(f, new);
+ return 0;
+}
+
+static int fanout_set_data(struct packet_fanout *f, char __user *data,
+ unsigned int len)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_BPF:
+ return fanout_set_data_bpf(f, data, len);
+ default:
+ return -EINVAL;
+ };
+}
+
+static void fanout_release_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_BPF:
+ __fanout_set_data_bpf(f, NULL);
+ };
+}
+
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
struct packet_sock *po = pkt_sk(sk);
@@ -1519,6 +1605,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
case PACKET_FANOUT_CPU:
case PACKET_FANOUT_RND:
case PACKET_FANOUT_QM:
+ case PACKET_FANOUT_BPF:
break;
default:
return -EINVAL;
@@ -1561,10 +1648,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->id = id;
match->type = type;
match->flags = flags;
- atomic_set(&match->rr_cur, 0);
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
atomic_set(&match->sk_ref, 0);
+ fanout_init_data(match);
match->prot_hook.type = po->prot_hook.type;
match->prot_hook.dev = po->prot_hook.dev;
match->prot_hook.func = packet_rcv_fanout;
@@ -1610,6 +1697,7 @@ static void fanout_release(struct sock *sk)
if (atomic_dec_and_test(&f->sk_ref)) {
list_del(&f->list);
dev_remove_pack(&f->prot_hook);
+ fanout_release_data(f);
kfree(f);
}
mutex_unlock(&fanout_mutex);
@@ -3529,6 +3617,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return fanout_add(sk, val & 0xffff, val >> 16);
}
+ case PACKET_FANOUT_DATA:
+ {
+ if (!po->fanout)
+ return -EINVAL;
+
+ return fanout_set_data(po->fanout, optval, optlen);
+ }
case PACKET_TX_HAS_OFF:
{
unsigned int val;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index e20b3e8..9ee4631 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -79,7 +79,10 @@ struct packet_fanout {
u16 id;
u8 type;
u8 flags;
- atomic_t rr_cur;
+ union {
+ atomic_t rr_cur;
+ struct bpf_prog __rcu *bpf_prog;
+ };
struct list_head list;
struct sock *arr[PACKET_FANOUT_MAX];
spinlock_t lock;
--
2.5.0.276.gf5e568e
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 15:50 [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 1/4] packet: add BPF fanout mode Willem de Bruijn
@ 2015-08-14 15:50 ` Willem de Bruijn
2015-08-14 17:03 ` Alexei Starovoitov
2015-08-14 15:50 ` [PATCH net-next 3/4] selftests/net: test bpf " Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 4/4] selftests/net: test eBPF " Willem de Bruijn
3 siblings, 1 reply; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 15:50 UTC (permalink / raw)
To: netdev; +Cc: davem, edumazet, dborkman, ast, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Add a fanout mode that accepts an eBPF program to select a socket.
Update the internal eBPF program by passing to socket option
SOL_PACKET/PACKET_FANOUT_DATA a file descriptor returned by bpf().
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/uapi/linux/if_packet.h | 1 +
net/packet/af_packet.c | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index d41280a..daa0ddd 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -64,6 +64,7 @@ struct sockaddr_ll {
#define PACKET_FANOUT_RND 4
#define PACKET_FANOUT_QM 5
#define PACKET_FANOUT_BPF 6
+#define PACKET_FANOUT_EBPF 7
#define PACKET_FANOUT_FLAG_ROLLOVER 0x1000
#define PACKET_FANOUT_FLAG_DEFRAG 0x8000
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 80d68c9..6352b7d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1472,6 +1472,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
idx = fanout_demux_rollover(f, skb, 0, false, num);
break;
case PACKET_FANOUT_BPF:
+ case PACKET_FANOUT_EBPF:
idx = fanout_demux_bpf(f, skb, num);
break;
}
@@ -1529,6 +1530,7 @@ static void fanout_init_data(struct packet_fanout *f)
atomic_set(&f->rr_cur, 0);
break;
case PACKET_FANOUT_BPF:
+ case PACKET_FANOUT_EBPF:
RCU_INIT_POINTER(f->bpf_prog, NULL);
break;
}
@@ -1569,12 +1571,33 @@ static int fanout_set_data_bpf(struct packet_fanout *f, char __user *data,
return 0;
}
+static int fanout_set_data_ebpf(struct packet_fanout *f, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ u32 fd;
+
+ if (len != sizeof(fd))
+ return -EINVAL;
+ if (copy_from_user(&fd, data, len))
+ return -EFAULT;
+
+ new = bpf_prog_get(fd);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ __fanout_set_data_bpf(f, new);
+ return 0;
+}
+
static int fanout_set_data(struct packet_fanout *f, char __user *data,
unsigned int len)
{
switch (f->type) {
case PACKET_FANOUT_BPF:
return fanout_set_data_bpf(f, data, len);
+ case PACKET_FANOUT_EBPF:
+ return fanout_set_data_ebpf(f, data, len);
default:
return -EINVAL;
};
@@ -1584,6 +1607,7 @@ static void fanout_release_data(struct packet_fanout *f)
{
switch (f->type) {
case PACKET_FANOUT_BPF:
+ case PACKET_FANOUT_EBPF:
__fanout_set_data_bpf(f, NULL);
};
}
@@ -1606,6 +1630,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
case PACKET_FANOUT_RND:
case PACKET_FANOUT_QM:
case PACKET_FANOUT_BPF:
+ case PACKET_FANOUT_EBPF:
break;
default:
return -EINVAL;
--
2.5.0.276.gf5e568e
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 3/4] selftests/net: test bpf fanout mode
2015-08-14 15:50 [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 1/4] packet: add BPF fanout mode Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 2/4] packet: add eBPF " Willem de Bruijn
@ 2015-08-14 15:50 ` Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 4/4] selftests/net: test eBPF " Willem de Bruijn
3 siblings, 0 replies; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 15:50 UTC (permalink / raw)
To: netdev; +Cc: davem, edumazet, dborkman, ast, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Test PACKET_FANOUT_BPF by inserting a BPF program that selects a
socket by payload. Requires modifying the test program to send
packets with multiple payloads.
Also fix a bug in testing the return value of mmap(): check for MAP_FAILED instead of NULL.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/psock_fanout.c | 16 ++++++++++++----
tools/testing/selftests/net/psock_lib.h | 29 +++++++++++++++++++++--------
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 08c2a36..7270132 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -19,6 +19,7 @@
* - PACKET_FANOUT_LB
* - PACKET_FANOUT_CPU
* - PACKET_FANOUT_ROLLOVER
+ * - PACKET_FANOUT_BPF
*
* Todo:
* - functionality: PACKET_FANOUT_FLAG_DEFRAG
@@ -115,8 +116,8 @@ static char *sock_fanout_open_ring(int fd)
ring = mmap(0, req.tp_block_size * req.tp_block_nr,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (!ring) {
- fprintf(stderr, "packetsock ring mmap\n");
+ if (ring == MAP_FAILED) {
+ perror("packetsock ring mmap");
exit(1);
}
@@ -209,6 +210,7 @@ static int test_datapath(uint16_t typeflags, int port_off,
{
const int expect0[] = { 0, 0 };
char *rings[2];
+ uint8_t type = typeflags & 0xFF;
int fds[2], fds_udp[2][2], ret;
fprintf(stderr, "test: datapath 0x%hx\n", typeflags);
@@ -219,6 +221,9 @@ static int test_datapath(uint16_t typeflags, int port_off,
fprintf(stderr, "ERROR: failed open\n");
exit(1);
}
+ if (type == PACKET_FANOUT_BPF)
+ sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA);
+
rings[0] = sock_fanout_open_ring(fds[0]);
rings[1] = sock_fanout_open_ring(fds[1]);
pair_udp_open(fds_udp[0], PORT_BASE);
@@ -227,11 +232,11 @@ static int test_datapath(uint16_t typeflags, int port_off,
/* Send data, but not enough to overflow a queue */
pair_udp_send(fds_udp[0], 15);
- pair_udp_send(fds_udp[1], 5);
+ pair_udp_send_char(fds_udp[1], 5, DATA_CHAR_1);
ret = sock_fanout_read(fds, rings, expect1);
/* Send more data, overflow the queue */
- pair_udp_send(fds_udp[0], 15);
+ pair_udp_send_char(fds_udp[0], 15, DATA_CHAR_1);
/* TODO: ensure consistent order between expect1 and expect2 */
ret |= sock_fanout_read(fds, rings, expect2);
@@ -275,6 +280,7 @@ int main(int argc, char **argv)
const int expect_rb[2][2] = { { 15, 5 }, { 20, 15 } };
const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } };
const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } };
+ const int expect_bpf[2][2] = { { 15, 5 }, { 15, 20 } };
int port_off = 2, tries = 5, ret;
test_control_single();
@@ -295,6 +301,8 @@ int main(int argc, char **argv)
port_off, expect_lb[0], expect_lb[1]);
ret |= test_datapath(PACKET_FANOUT_ROLLOVER,
port_off, expect_rb[0], expect_rb[1]);
+ ret |= test_datapath(PACKET_FANOUT_BPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
set_cpuaffinity(0);
ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h
index 37da54a..24bc7ec 100644
--- a/tools/testing/selftests/net/psock_lib.h
+++ b/tools/testing/selftests/net/psock_lib.h
@@ -30,6 +30,7 @@
#define DATA_LEN 100
#define DATA_CHAR 'a'
+#define DATA_CHAR_1 'b'
#define PORT_BASE 8000
@@ -37,29 +38,36 @@
# define __maybe_unused __attribute__ ((__unused__))
#endif
-static __maybe_unused void pair_udp_setfilter(int fd)
+static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum)
{
struct sock_filter bpf_filter[] = {
{ 0x80, 0, 0, 0x00000000 }, /* LD pktlen */
- { 0x35, 0, 5, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/
+ { 0x35, 0, 4, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/
{ 0x30, 0, 0, 0x00000050 }, /* LD ip[80] */
- { 0x15, 0, 3, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/
- { 0x30, 0, 0, 0x00000051 }, /* LD ip[81] */
- { 0x15, 0, 1, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/
+ { 0x15, 1, 0, DATA_CHAR }, /* JEQ DATA_CHAR [t goto match]*/
+ { 0x15, 0, 1, DATA_CHAR_1}, /* JEQ DATA_CHAR_1 [t goto match]*/
{ 0x06, 0, 0, 0x00000060 }, /* RET match */
{ 0x06, 0, 0, 0x00000000 }, /* RET no match */
};
struct sock_fprog bpf_prog;
+ if (lvl == SOL_PACKET && optnum == PACKET_FANOUT_DATA)
+ bpf_filter[5].code = 0x16; /* RET A */
+
bpf_prog.filter = bpf_filter;
bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
- if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog,
+ if (setsockopt(fd, lvl, optnum, &bpf_prog,
sizeof(bpf_prog))) {
perror("setsockopt SO_ATTACH_FILTER");
exit(1);
}
}
+static __maybe_unused void pair_udp_setfilter(int fd)
+{
+ sock_setfilter(fd, SOL_SOCKET, SO_ATTACH_FILTER);
+}
+
static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
{
struct sockaddr_in saddr, daddr;
@@ -96,11 +104,11 @@ static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
}
}
-static __maybe_unused void pair_udp_send(int fds[], int num)
+static __maybe_unused void pair_udp_send_char(int fds[], int num, char payload)
{
char buf[DATA_LEN], rbuf[DATA_LEN];
- memset(buf, DATA_CHAR, sizeof(buf));
+ memset(buf, payload, sizeof(buf));
while (num--) {
/* Should really handle EINTR and EAGAIN */
if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) {
@@ -118,6 +126,11 @@ static __maybe_unused void pair_udp_send(int fds[], int num)
}
}
+static __maybe_unused void pair_udp_send(int fds[], int num)
+{
+ return pair_udp_send_char(fds, num, DATA_CHAR);
+}
+
static __maybe_unused void pair_udp_close(int fds[])
{
close(fds[0]);
--
2.5.0.276.gf5e568e
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 4/4] selftests/net: test eBPF fanout mode
2015-08-14 15:50 [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes Willem de Bruijn
` (2 preceding siblings ...)
2015-08-14 15:50 ` [PATCH net-next 3/4] selftests/net: test bpf " Willem de Bruijn
@ 2015-08-14 15:50 ` Willem de Bruijn
3 siblings, 0 replies; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 15:50 UTC (permalink / raw)
To: netdev; +Cc: davem, edumazet, dborkman, ast, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Test PACKET_FANOUT_EBPF by inserting a program into the kernel
with bpf(), then attaching it to the fanout group. Observe the same
payload-based distribution as in the PACKET_FANOUT_BPF test.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/psock_fanout.c | 53 ++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 7270132..ed609bf 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -20,6 +20,7 @@
* - PACKET_FANOUT_CPU
* - PACKET_FANOUT_ROLLOVER
* - PACKET_FANOUT_BPF
+ * - PACKET_FANOUT_EBPF
*
* Todo:
* - functionality: PACKET_FANOUT_FLAG_DEFRAG
@@ -45,7 +46,9 @@
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
+#include <linux/unistd.h> /* for __NR_bpf */
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
@@ -92,6 +95,51 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets)
return fd;
}
+static void sock_fanout_set_ebpf(int fd)
+{
+ const int len_off = __builtin_offsetof(struct __sk_buff, len);
+ struct bpf_insn prog[] = {
+ { BPF_ALU64 | BPF_MOV | BPF_X, 6, 1, 0, 0 },
+ { BPF_LDX | BPF_W | BPF_MEM, 0, 6, len_off, 0 },
+ { BPF_JMP | BPF_JGE | BPF_K, 0, 0, 1, DATA_LEN },
+ { BPF_JMP | BPF_JA | BPF_K, 0, 0, 4, 0 },
+ { BPF_LD | BPF_B | BPF_ABS, 0, 0, 0, 0x50 },
+ { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 2, DATA_CHAR },
+ { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1, DATA_CHAR_1 },
+ { BPF_ALU | BPF_MOV | BPF_K, 0, 0, 0, 0 },
+ { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
+ };
+ char log_buf[512];
+ union bpf_attr attr;
+ int pfd;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ attr.insns = (unsigned long) prog;
+ attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
+ attr.license = (unsigned long) "GPL";
+ attr.log_buf = (unsigned long) log_buf,
+ attr.log_size = sizeof(log_buf),
+ attr.log_level = 1,
+
+ pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ if (pfd < 0) {
+ perror("bpf");
+ fprintf(stderr, "bpf verifier:\n%s\n", log_buf);
+ exit(1);
+ }
+
+ if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &pfd, sizeof(pfd))) {
+ perror("fanout data ebpf");
+ exit(1);
+ }
+
+ if (close(pfd)) {
+ perror("close ebpf");
+ exit(1);
+ }
+}
+
static char *sock_fanout_open_ring(int fd)
{
struct tpacket_req req = {
@@ -223,6 +271,8 @@ static int test_datapath(uint16_t typeflags, int port_off,
}
if (type == PACKET_FANOUT_BPF)
sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA);
+ else if (type == PACKET_FANOUT_EBPF)
+ sock_fanout_set_ebpf(fds[0]);
rings[0] = sock_fanout_open_ring(fds[0]);
rings[1] = sock_fanout_open_ring(fds[1]);
@@ -301,8 +351,11 @@ int main(int argc, char **argv)
port_off, expect_lb[0], expect_lb[1]);
ret |= test_datapath(PACKET_FANOUT_ROLLOVER,
port_off, expect_rb[0], expect_rb[1]);
+
ret |= test_datapath(PACKET_FANOUT_BPF,
port_off, expect_bpf[0], expect_bpf[1]);
+ ret |= test_datapath(PACKET_FANOUT_EBPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
set_cpuaffinity(0);
ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
--
2.5.0.276.gf5e568e
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 15:50 ` [PATCH net-next 2/4] packet: add eBPF " Willem de Bruijn
@ 2015-08-14 17:03 ` Alexei Starovoitov
2015-08-14 18:47 ` Willem de Bruijn
2015-08-14 19:01 ` Daniel Borkmann
0 siblings, 2 replies; 12+ messages in thread
From: Alexei Starovoitov @ 2015-08-14 17:03 UTC (permalink / raw)
To: Willem de Bruijn, netdev; +Cc: davem, edumazet, dborkman
On 8/14/15 8:50 AM, Willem de Bruijn wrote:
> +static int fanout_set_data_ebpf(struct packet_fanout *f, char __user *data,
> + unsigned int len)
> +{
> + struct bpf_prog *new;
> + u32 fd;
> +
> + if (len != sizeof(fd))
> + return -EINVAL;
> + if (copy_from_user(&fd, data, len))
> + return -EFAULT;
> +
> + new = bpf_prog_get(fd);
> + if (IS_ERR(new))
> + return PTR_ERR(new);
> +
> + __fanout_set_data_bpf(f, new);
> + return 0;
> +}
all looks great except in the above the check:
if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(new);
return -EINVAL;
}
is missing. Otherwise user will be able to attach programs
of wrong types to fanout.
Also instead of:
#define PACKET_FANOUT_BPF 6
#define PACKET_FANOUT_EBPF 7
I would call them FANOUT_CBPF and FANOUT_EBPF to be unambiguous.
This is how bpf manpage distinguishes them.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 17:03 ` Alexei Starovoitov
@ 2015-08-14 18:47 ` Willem de Bruijn
2015-08-14 19:01 ` Daniel Borkmann
1 sibling, 0 replies; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 18:47 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Network Development, David Miller, Eric Dumazet, Daniel Borkmann
On Fri, Aug 14, 2015 at 1:03 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On 8/14/15 8:50 AM, Willem de Bruijn wrote:
>>
>> +static int fanout_set_data_ebpf(struct packet_fanout *f, char __user
>> *data,
>> + unsigned int len)
>> +{
>> + struct bpf_prog *new;
>> + u32 fd;
>> +
>> + if (len != sizeof(fd))
>> + return -EINVAL;
>> + if (copy_from_user(&fd, data, len))
>> + return -EFAULT;
>> +
>> + new = bpf_prog_get(fd);
>> + if (IS_ERR(new))
>> + return PTR_ERR(new);
>> +
>> + __fanout_set_data_bpf(f, new);
>> + return 0;
>> +}
>
>
> all looks great except in the above the check:
> if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
> bpf_prog_put(new);
> return -EINVAL;
> }
> is missing. Otherwise user will be able to attach programs
> of wrong types to fanout.
Ai, good point!
> Also instead of:
> #define PACKET_FANOUT_BPF 6
> #define PACKET_FANOUT_EBPF 7
>
> I would call them FANOUT_CBPF and FANOUT_EBPF to be unambiguous.
> This is how bpf manpage distinguishes them.
>
Sounds good. I'll make both changes in v2. Thanks for reviewing, Alexei.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 17:03 ` Alexei Starovoitov
2015-08-14 18:47 ` Willem de Bruijn
@ 2015-08-14 19:01 ` Daniel Borkmann
2015-08-14 19:27 ` Willem de Bruijn
1 sibling, 1 reply; 12+ messages in thread
From: Daniel Borkmann @ 2015-08-14 19:01 UTC (permalink / raw)
To: Willem de Bruijn; +Cc: Alexei Starovoitov, netdev, davem, edumazet
[ @Willem: RH email doesn't exist anymore, I took it out, otherwise
every reply gets a bounce. ;) ]
On 08/14/2015 07:03 PM, Alexei Starovoitov wrote:
> On 8/14/15 8:50 AM, Willem de Bruijn wrote:
...
> all looks great except in the above the check:
> if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
> bpf_prog_put(new);
> return -EINVAL;
> }
> is missing. Otherwise user will be able to attach programs
> of wrong types to fanout.
>
> Also instead of:
> #define PACKET_FANOUT_BPF 6
> #define PACKET_FANOUT_EBPF 7
>
> I would call them FANOUT_CBPF and FANOUT_EBPF to be unambiguous.
> This is how bpf manpage distinguishes them.
We have SO_ATTACH_FILTER and SO_ATTACH_BPF, could also be
analogous for fanout, if we want to be consistent with the API?
But C/E prefix seems okay too, how you want ...
Btw, in case someone sets sock_flag(sk, SOCK_FILTER_LOCKED),
perhaps we should also apply it on fanout?
Thanks,
Daniel
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 19:01 ` Daniel Borkmann
@ 2015-08-14 19:27 ` Willem de Bruijn
2015-08-14 19:46 ` Daniel Borkmann
0 siblings, 1 reply; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-14 19:27 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Alexei Starovoitov, Network Development, David Miller,
Eric Dumazet
> [ @Willem: RH email doesn't exist anymore, I took it out, otherwise
> every reply gets a bounce. ;) ]
Sorry for using the wrong address, Daniel.
>> Also instead of:
>> #define PACKET_FANOUT_BPF 6
>> #define PACKET_FANOUT_EBPF 7
>>
>> I would call them FANOUT_CBPF and FANOUT_EBPF to be unambiguous.
>> This is how bpf manpage distinguishes them.
>
> We have SO_ATTACH_FILTER and SO_ATTACH_BPF, could also be
> analogous for fanout, if we want to be consistent with the API?
>
> But C/E prefix seems okay too, how you want ...
I don't feel very strongly, either. But CBPF/EBPF is a bit more
descriptive, so let's do that.
> Btw, in case someone sets sock_flag(sk, SOCK_FILTER_LOCKED),
> perhaps we should also apply it on fanout?
Good point. With classic bpf, packet access control is fully
enforced in per-socket filters, but playing with load balancing
filters could allow an adversary to infer some information
about the dropped packets*. With eBPF and maps, access
is even more direct. Let's support locking of fanout filters in
place.
I intend to test the existing socket flag. No need to add a
separate flag for the fanout group, as far as I can see.
(*) I noticed that a similar unintended effect also causes the
PACKET_FANOUT_LB selftest to be flaky: filters on the
sockets ensure that the test only reads expected packets.
But, all traffic makes it through packet_rcv_fanout. Packets
that are later dropped by sk_filter have already incremented
rr_cur. Worst case, with 2 sockets and each accepted packet
interleaved with a dropped packet, all packets are queued on
only one socket. Test flakiness is fixed, e.g., by running in a
private network namespace. The implementation behavior
may be unexpected in other, production, environments.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 19:27 ` Willem de Bruijn
@ 2015-08-14 19:46 ` Daniel Borkmann
2015-08-15 2:28 ` Willem de Bruijn
0 siblings, 1 reply; 12+ messages in thread
From: Daniel Borkmann @ 2015-08-14 19:46 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Alexei Starovoitov, Network Development, David Miller,
Eric Dumazet
On 08/14/2015 09:27 PM, Willem de Bruijn wrote:
...
>> Btw, in case someone sets sock_flag(sk, SOCK_FILTER_LOCKED),
>> perhaps we should also apply it on fanout?
>
> Good point. With classic bpf, packet access control is fully
> enforced in per-socket filters, but playing with load balancing
> filters could allow an adversary to infer some information
> about the dropped packets*. With eBPF and maps, access
> is even more direct. Let's support locking of fanout filters in
> place.
Right, a process could share a map between the fanout lb filter
and actual sk filter, i.e. to look up how much actually passed
through on the later sk level filter, and use that information
in addition for its lb decisions.
> I intend to test the existing socket flag. No need to add a
> separate flag for the fanout group, as far as I can see.
Agreed, should be okay.
Thanks Willem!
> (*) I noticed that a similar unintended effect also causes the
> PACKET_FANOUT_LB selftest to be flaky: filters on the
> sockets ensure that the test only reads expected packets.
> But, all traffic makes it through packet_rcv_fanout. Packets
> that are later dropped by sk_filter have already incremented
> rr_cur. Worst case, with 2 sockets and each accepted packet
> interleaved with a dropped packet, all packets are queued on
> only one socket. Test flakiness is fixed, e.g., by running in a
> private network namespace. The implementation behavior
> may be unexpected in other, production, environments.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/4] packet: add eBPF fanout mode
2015-08-14 19:46 ` Daniel Borkmann
@ 2015-08-15 2:28 ` Willem de Bruijn
0 siblings, 0 replies; 12+ messages in thread
From: Willem de Bruijn @ 2015-08-15 2:28 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Alexei Starovoitov, Network Development, David Miller,
Eric Dumazet
On Fri, Aug 14, 2015 at 3:46 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 08/14/2015 09:27 PM, Willem de Bruijn wrote:
> ...
>>>
>>> Btw, in case someone sets sock_flag(sk, SOCK_FILTER_LOCKED),
>>> perhaps we should also apply it on fanout?
>>
>>
>> Good point. With classic bpf, packet access control is fully
>> enforced in per-socket filters, but playing with load balancing
>> filters could allow an adversary to infer some information
>> about the dropped packets*. With eBPF and maps, access
>> is even more direct. Let's support locking of fanout filters in
>> place.
>
>
> Right, a process could share a map between the fanout lb filter
> and actual sk filter, i.e. to look up how much actually passed
> through on the later sk level filter, and use that information
> in addition for its lb decisions.
>
>> I intend to test the existing socket flag. No need to add a
>> separate flag for the fanout group, as far as I can see.
>
>
> Agreed, should be okay.
Great. Thanks for the suggestion, Daniel! I'll send a v2 with the
three suggested changes in a minute.
>
> Thanks Willem!
>
>> (*) I noticed that a similar unintended effect also causes the
>> PACKET_FANOUT_LB selftest to be flaky: filters on the
>> sockets ensure that the test only reads expected packets.
>> But, all traffic makes it through packet_rcv_fanout. Packets
>> that are later dropped by sk_filter have already incremented
>> rr_cur. Worst case, with 2 sockets and each accepted packet
>> interleaved with a dropped packet, all packets are queued on
>> only one socket. Test flakiness is fixed, e.g., by running in a
>> private network namespace. The implementation behavior
>> may be unexpected in other, production, environments.
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 1/4] packet: add BPF fanout mode
2015-08-14 15:50 ` [PATCH net-next 1/4] packet: add BPF fanout mode Willem de Bruijn
@ 2015-08-17 14:29 ` Eric Dumazet
0 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2015-08-17 14:29 UTC (permalink / raw)
To: Willem de Bruijn; +Cc: netdev, davem, edumazet, dborkman, ast
On Fri, 2015-08-14 at 11:50 -0400, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Add a fanout mode that accepts a BPF program to select a socket.
>
> This avoids having to keep adding special case fanout modes. One
> example use case is application layer load balancing. The QUIC
> protocol, for instance, encodes a connection ID in UDP payload.
>
> Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
> associated with the socket group. Fanout mode PACKET_FANOUT_BPF is the
> only user so far.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2015-08-17 15:26 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-14 15:50 [PATCH net-next 0/4] packet: add BPF and eBPF fanout modes Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 1/4] packet: add BPF fanout mode Willem de Bruijn
2015-08-17 14:29 ` Eric Dumazet
2015-08-14 15:50 ` [PATCH net-next 2/4] packet: add eBPF " Willem de Bruijn
2015-08-14 17:03 ` Alexei Starovoitov
2015-08-14 18:47 ` Willem de Bruijn
2015-08-14 19:01 ` Daniel Borkmann
2015-08-14 19:27 ` Willem de Bruijn
2015-08-14 19:46 ` Daniel Borkmann
2015-08-15 2:28 ` Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 3/4] selftests/net: test bpf " Willem de Bruijn
2015-08-14 15:50 ` [PATCH net-next 4/4] selftests/net: test eBPF " Willem de Bruijn
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).