* [RFC PATCH bpf-next 05/14] xdp_flow: Prepare flow tables in bpf
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
Add maps for flow tables in bpf. TC flower has hash tables for each flow
mask ordered by priority. To do the same thing, prepare a
hashmap-in-arraymap. As bpf does not provide an ordered list, we emulate it
with an array. Each array entry has a next index field to implement a list;
valid indexes fit in one byte, although the field is declared as int. Also
prepare a one-element array to point to the head index of the list.
Because of the limitation of bpf maps, the outer array is implemented
using two array maps. "flow_masks" is the array to emulate the list and
its entries have the priority and mask of each flow table. For each
priority/mask, the same index entry of another map "flow_tables", which
is the hashmap-in-arraymap, points to the actual flow table.
The flow insertion logic in UMH and lookup logic in BPF will be
implemented in the following commits.
NOTE: This array-based list emulation could be replaced by adding an
ordered-list map type. In that case we would also need a map iteration API
for bpf progs.
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
net/xdp_flow/umh_bpf.h | 18 +++++++++++
net/xdp_flow/xdp_flow_kern_bpf.c | 22 +++++++++++++
net/xdp_flow/xdp_flow_umh.c | 70 ++++++++++++++++++++++++++++++++++++++--
3 files changed, 108 insertions(+), 2 deletions(-)
create mode 100644 net/xdp_flow/umh_bpf.h
diff --git a/net/xdp_flow/umh_bpf.h b/net/xdp_flow/umh_bpf.h
new file mode 100644
index 0000000..b4fe0c6
--- /dev/null
+++ b/net/xdp_flow/umh_bpf.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_XDP_FLOW_UMH_BPF_H
+#define _NET_XDP_FLOW_UMH_BPF_H
+
+#include "msgfmt.h"
+
+#define MAX_FLOWS 1024
+#define MAX_FLOW_MASKS 255
+#define FLOW_MASKS_TAIL 255
+
+struct xdp_flow_mask_entry {
+ struct xdp_flow_key mask;
+ __u16 priority;
+ short count;
+ int next;
+};
+
+#endif
diff --git a/net/xdp_flow/xdp_flow_kern_bpf.c b/net/xdp_flow/xdp_flow_kern_bpf.c
index 74cdb1d..c101156 100644
--- a/net/xdp_flow/xdp_flow_kern_bpf.c
+++ b/net/xdp_flow/xdp_flow_kern_bpf.c
@@ -2,6 +2,28 @@
#define KBUILD_MODNAME "foo"
#include <uapi/linux/bpf.h>
#include <bpf_helpers.h>
+#include "umh_bpf.h"
+
+struct bpf_map_def SEC("maps") flow_masks_head = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") flow_masks = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct xdp_flow_mask_entry),
+ .max_entries = MAX_FLOW_MASKS,
+};
+
+struct bpf_map_def SEC("maps") flow_tables = {
+ .type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_FLOW_MASKS,
+};
SEC("xdp_flow")
int xdp_flow_prog(struct xdp_md *ctx)
diff --git a/net/xdp_flow/xdp_flow_umh.c b/net/xdp_flow/xdp_flow_umh.c
index 734db00..e35666a 100644
--- a/net/xdp_flow/xdp_flow_umh.c
+++ b/net/xdp_flow/xdp_flow_umh.c
@@ -13,7 +13,7 @@
#include <sys/resource.h>
#include <linux/hashtable.h>
#include <linux/err.h>
-#include "msgfmt.h"
+#include "umh_bpf.h"
extern char xdp_flow_bpf_start;
extern char xdp_flow_bpf_end;
@@ -95,11 +95,13 @@ static int setup(void)
static int load_bpf(int ifindex, struct bpf_object **objp)
{
+ int prog_fd, flow_tables_fd, flow_meta_fd, flow_masks_head_fd, err;
+ struct bpf_map *flow_tables, *flow_masks_head;
+ int zero = 0, flow_masks_tail = FLOW_MASKS_TAIL;
struct bpf_object_open_attr attr = {};
char path[256], errbuf[ERRBUF_SIZE];
struct bpf_program *prog;
struct bpf_object *obj;
- int prog_fd, err;
ssize_t len;
len = snprintf(path, 256, "/proc/self/fd/%d", progfile_fd);
@@ -127,6 +129,48 @@ static int load_bpf(int ifindex, struct bpf_object **objp)
bpf_object__for_each_program(prog, obj)
bpf_program__set_type(prog, attr.prog_type);
+ flow_meta_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+ sizeof(struct xdp_flow_key),
+ sizeof(struct xdp_flow_actions),
+ MAX_FLOWS, 0);
+ if (flow_meta_fd < 0) {
+ err = -errno;
+ pr_err("map creation for flow_tables meta failed: %s\n",
+ strerror(errno));
+ goto err;
+ }
+
+ flow_tables_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ "flow_tables", sizeof(__u32),
+ flow_meta_fd, MAX_FLOW_MASKS, 0);
+ if (flow_tables_fd < 0) {
+ err = -errno;
+ pr_err("map creation for flow_tables failed: %s\n",
+ strerror(errno));
+ close(flow_meta_fd);
+ goto err;
+ }
+
+ close(flow_meta_fd);
+
+ flow_tables = bpf_object__find_map_by_name(obj, "flow_tables");
+ if (!flow_tables) {
+ pr_err("Cannot find flow_tables\n");
+ err = -ENOENT;
+ close(flow_tables_fd);
+ goto err;
+ }
+
+ err = bpf_map__reuse_fd(flow_tables, flow_tables_fd);
+ if (err) {
+ err = libbpf_err(err, errbuf);
+ pr_err("Failed to reuse flow_tables fd: %s\n", errbuf);
+ close(flow_tables_fd);
+ goto err;
+ }
+
+ close(flow_tables_fd);
+
err = bpf_object__load(obj);
if (err) {
err = libbpf_err(err, errbuf);
@@ -134,6 +178,28 @@ static int load_bpf(int ifindex, struct bpf_object **objp)
goto err;
}
+ flow_masks_head = bpf_object__find_map_by_name(obj, "flow_masks_head");
+ if (!flow_masks_head) {
+ pr_err("Cannot find flow_masks_head map\n");
+ err = -ENOENT;
+ goto err;
+ }
+
+ flow_masks_head_fd = bpf_map__fd(flow_masks_head);
+ if (flow_masks_head_fd < 0) {
+ err = libbpf_err(flow_masks_head_fd, errbuf);
+ pr_err("Invalid flow_masks_head fd: %s\n", errbuf);
+ goto err;
+ }
+
+ if (bpf_map_update_elem(flow_masks_head_fd, &zero, &flow_masks_tail,
+ 0)) {
+ err = -errno;
+ pr_err("Failed to initialize flow_masks_head: %s\n",
+ strerror(errno));
+ goto err;
+ }
+
prog = bpf_object__find_program_by_title(obj, "xdp_flow");
if (!prog) {
pr_err("Cannot find xdp_flow program\n");
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH bpf-next 04/14] xdp_flow: Attach bpf prog to XDP in kernel after UMH loaded program
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
As the UMH runs under RTNL, it cannot attach XDP from userspace. Thus the
kernel side, i.e. the xdp_flow module, installs the XDP program.
NOTE: As an RFC, XDP-related logic is emulating dev_change_xdp_fd().
I'm thinking I should factor out the logic from dev_change_xdp_fd() and
export it instead.
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
include/linux/netdevice.h | 4 +++
net/core/dev.c | 11 ++++---
net/xdp_flow/xdp_flow_kern_mod.c | 63 ++++++++++++++++++++++++++++++++++++----
3 files changed, 69 insertions(+), 9 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8829295..c99e022 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3678,6 +3678,10 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, int *ret);
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
+int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
+ struct netlink_ext_ack *extack, u32 flags,
+ struct bpf_prog *prog);
+int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags);
u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
diff --git a/net/core/dev.c b/net/core/dev.c
index fc676b2..a45d2e4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5145,7 +5145,7 @@ static void __netif_receive_skb_list(struct list_head *head)
memalloc_noreclaim_restore(noreclaim_flag);
}
-static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
+int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
struct bpf_prog *new = xdp->prog;
@@ -5177,6 +5177,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
return ret;
}
+EXPORT_SYMBOL_GPL(generic_xdp_install);
static int netif_receive_skb_internal(struct sk_buff *skb)
{
@@ -8001,10 +8002,11 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
return xdp.prog_id;
}
+EXPORT_SYMBOL_GPL(__dev_xdp_query);
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
- struct netlink_ext_ack *extack, u32 flags,
- struct bpf_prog *prog)
+int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
+ struct netlink_ext_ack *extack, u32 flags,
+ struct bpf_prog *prog)
{
struct netdev_bpf xdp;
@@ -8019,6 +8021,7 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
return bpf_op(dev, &xdp);
}
+EXPORT_SYMBOL_GPL(dev_xdp_install);
static void dev_xdp_uninstall(struct net_device *dev)
{
diff --git a/net/xdp_flow/xdp_flow_kern_mod.c b/net/xdp_flow/xdp_flow_kern_mod.c
index 823ab65..9cf527d 100644
--- a/net/xdp_flow/xdp_flow_kern_mod.c
+++ b/net/xdp_flow/xdp_flow_kern_mod.c
@@ -116,10 +116,26 @@ static int xdp_flow_setup_block_cb(enum tc_setup_type type, void *type_data,
static int xdp_flow_setup_bind(struct net_device *dev,
struct netlink_ext_ack *extack)
{
+ enum bpf_prog_type attach_type = BPF_PROG_TYPE_XDP;
struct mbox_request *req;
+ bpf_op_t bpf_op, bpf_chk;
+ struct bpf_prog *prog;
u32 id = 0;
int err;
+ bpf_op = bpf_chk = dev->netdev_ops->ndo_bpf;
+ if (!bpf_op)
+ bpf_op = generic_xdp_install;
+ else
+ bpf_chk = generic_xdp_install;
+
+ /* TODO: These checks should be unified with net core */
+ if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG))
+ return -EEXIST;
+
+ if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
+ return -EBUSY;
+
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
return -ENOMEM;
@@ -129,21 +145,56 @@ static int xdp_flow_setup_bind(struct net_device *dev,
/* Load bpf in UMH and get prog id */
err = transact_umh(req, &id);
+ if (err)
+ goto out;
+
+ prog = bpf_prog_get_by_id(id);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto err_umh;
+ }
+
+ if (!bpf_prog_get_ok(prog, &attach_type, false)) {
+ err = -EINVAL;
+ goto err_prog;
+ }
- /* TODO: id will be used to attach bpf prog to XDP
- * As we have rtnl_lock, UMH cannot attach prog to XDP
- */
+ /* As we have rtnl_lock, install XDP in kernel */
+ err = dev_xdp_install(dev, bpf_op, extack, 0, prog);
+ if (err)
+ goto err_prog;
+ /* TODO: Should get prog once more and save it for later check */
+out:
kfree(req);
return err;
+err_prog:
+ bpf_prog_put(prog);
+err_umh:
+ req->cmd = XDP_FLOW_CMD_UNLOAD;
+ transact_umh(req, NULL);
+
+ goto out;
}
static int xdp_flow_setup_unbind(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct mbox_request *req;
- int err;
+ int err, ret = 0;
+ bpf_op_t bpf_op;
+
+ bpf_op = dev->netdev_ops->ndo_bpf;
+ if (!bpf_op)
+ bpf_op = generic_xdp_install;
+
+ /* TODO: Should check if prog is not changed */
+ err = dev_xdp_install(dev, bpf_op, extack, 0, NULL);
+ if (err) {
+ pr_warn("Failed to uninstall XDP prog: %d\n", err);
+ ret = err;
+ }
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
@@ -153,10 +204,12 @@ static int xdp_flow_setup_unbind(struct net_device *dev,
req->ifindex = dev->ifindex;
err = transact_umh(req, NULL);
+ if (err)
+ ret = err;
kfree(req);
- return err;
+ return ret;
}
static int xdp_flow_setup(struct net_device *dev, bool do_bind,
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH bpf-next 03/14] bpf: Add API to get program from id
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
Factor out the logic in bpf_prog_get_fd_by_id() and add
bpf_prog_get_by_id(). Also export bpf_prog_get_ok().
They are used by the next commit to get bpf prog from its id.
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
include/linux/bpf.h | 6 ++++++
kernel/bpf/syscall.c | 26 ++++++++++++++++++--------
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9a5061..d8ad865 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -633,6 +633,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
bool attach_drv);
+struct bpf_prog *bpf_prog_get_by_id(u32 id);
struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
void bpf_prog_sub(struct bpf_prog *prog, int i);
struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -755,6 +756,11 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
return ERR_PTR(-EOPNOTSUPP);
}
+static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
int i)
{
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f1..cb5ecc4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1495,6 +1495,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
return true;
}
+EXPORT_SYMBOL_GPL(bpf_prog_get_ok);
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
@@ -2122,6 +2123,22 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
return err;
}
+struct bpf_prog *bpf_prog_get_by_id(u32 id)
+{
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_by_id);
+
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
@@ -2136,14 +2153,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- spin_lock_bh(&prog_idr_lock);
- prog = idr_find(&prog_idr, id);
- if (prog)
- prog = bpf_prog_inc_not_zero(prog);
- else
- prog = ERR_PTR(-ENOENT);
- spin_unlock_bh(&prog_idr_lock);
-
+ prog = bpf_prog_get_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH bpf-next 02/14] xdp_flow: Add skeleton bpf program for XDP
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
The program is meant to be loaded when a device is bound to an ingress
TC block and should be attached to XDP on the device.
Typically it should be loaded when TC ingress or clsact qdisc is added.
The program is prebuilt and embedded in the UMH, instead of being generated
dynamically. This is because TC filters are changed frequently when they are
used by OVS, and the latency of a TC filter change affects the latency of
the datapath.
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
net/xdp_flow/Makefile | 87 +++++++++++-
net/xdp_flow/xdp_flow_kern_bpf.c | 12 ++
net/xdp_flow/xdp_flow_kern_bpf_blob.S | 7 +
net/xdp_flow/xdp_flow_umh.c | 241 +++++++++++++++++++++++++++++++++-
4 files changed, 343 insertions(+), 4 deletions(-)
create mode 100644 net/xdp_flow/xdp_flow_kern_bpf.c
create mode 100644 net/xdp_flow/xdp_flow_kern_bpf_blob.S
diff --git a/net/xdp_flow/Makefile b/net/xdp_flow/Makefile
index f6138c2..b3a0416 100644
--- a/net/xdp_flow/Makefile
+++ b/net/xdp_flow/Makefile
@@ -2,25 +2,106 @@
obj-$(CONFIG_XDP_FLOW) += xdp_flow_core.o
+XDP_FLOW_PATH ?= $(abspath $(srctree)/$(src))
+TOOLS_PATH := $(XDP_FLOW_PATH)/../../tools
+
+# Libbpf dependencies
+LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
+
+LLC ?= llc
+CLANG ?= clang
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
+
+ifdef CROSS_COMPILE
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
+ $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
+ readelf -S ./llvm_btf_verify.o | grep BTF; \
+ /bin/rm -f ./llvm_btf_verify.o)
+
+ifneq ($(BTF_LLVM_PROBE),)
+ EXTRA_CFLAGS += -g
+else
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+ EXTRA_CFLAGS += -g
+ LLC_FLAGS += -mattr=dwarfris
+ DWARF2BTF = y
+endif
+endif
+
+$(LIBBPF): FORCE
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+ $(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(XDP_FLOW_PATH)/../../ O=
+
+# Verify LLVM compiler tools are available and bpf target is supported by llc
+.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
+
+verify_cmds: $(CLANG) $(LLC)
+ @for TOOL in $^ ; do \
+ if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
+ echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
+ exit 1; \
+ else true; fi; \
+ done
+
+verify_target_bpf: verify_cmds
+ @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
+ echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
+ echo " NOTICE: LLVM version >= 3.7.1 required" ;\
+ exit 2; \
+ else true; fi
+
+$(src)/xdp_flow_kern_bpf.c: verify_target_bpf
+
+$(obj)/xdp_flow_kern_bpf.o: $(src)/xdp_flow_kern_bpf.c FORCE
+ @echo " CLANG-bpf " $@
+ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
+ -I$(srctree)/tools/testing/selftests/bpf/ \
+ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \
+ -Wno-gnu-variable-sized-type-not-at-end \
+ -Wno-address-of-packed-member -Wno-tautological-compare \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
+ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+ $(BTF_PAHOLE) -J $@
+endif
+
ifeq ($(CONFIG_XDP_FLOW_UMH), y)
# builtin xdp_flow_umh should be compiled with -static
# since rootfs isn't mounted at the time of __init
# function is called and do_execv won't find elf interpreter
STATIC := -static
+STATICLDLIBS := -lz
endif
+quiet_cmd_as_user = AS $@
+ cmd_as_user = $(AS) -c -o $@ $<
+
quiet_cmd_cc_user = CC $@
cmd_cc_user = $(CC) -Wall -Wmissing-prototypes -O2 -std=gnu89 \
- -I$(srctree)/tools/include/ \
+ -I$(srctree)/tools/lib/ -I$(srctree)/tools/include/ \
-c -o $@ $<
quiet_cmd_ld_user = LD $@
- cmd_ld_user = $(CC) $(STATIC) -o $@ $^
+ cmd_ld_user = $(CC) $(STATIC) -o $@ $^ $(LIBBPF) -lelf $(STATICLDLIBS)
+
+$(obj)/xdp_flow_kern_bpf_blob.o: $(src)/xdp_flow_kern_bpf_blob.S \
+ $(obj)/xdp_flow_kern_bpf.o
+ $(call if_changed,as_user)
$(obj)/xdp_flow_umh.o: $(src)/xdp_flow_umh.c FORCE
$(call if_changed,cc_user)
-$(obj)/xdp_flow_umh: $(obj)/xdp_flow_umh.o
+$(obj)/xdp_flow_umh: $(obj)/xdp_flow_umh.o $(LIBBPF) \
+ $(obj)/xdp_flow_kern_bpf_blob.o
$(call if_changed,ld_user)
clean-files := xdp_flow_umh
diff --git a/net/xdp_flow/xdp_flow_kern_bpf.c b/net/xdp_flow/xdp_flow_kern_bpf.c
new file mode 100644
index 0000000..74cdb1d
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_kern_bpf.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <bpf_helpers.h>
+
+SEC("xdp_flow")
+int xdp_flow_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/net/xdp_flow/xdp_flow_kern_bpf_blob.S b/net/xdp_flow/xdp_flow_kern_bpf_blob.S
new file mode 100644
index 0000000..d180c1b
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_kern_bpf_blob.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .section .rodata, "a"
+ .global xdp_flow_bpf_start
+xdp_flow_bpf_start:
+ .incbin "net/xdp_flow/xdp_flow_kern_bpf.o"
+ .global xdp_flow_bpf_end
+xdp_flow_bpf_end:
diff --git a/net/xdp_flow/xdp_flow_umh.c b/net/xdp_flow/xdp_flow_umh.c
index 6729bdf..734db00 100644
--- a/net/xdp_flow/xdp_flow_umh.c
+++ b/net/xdp_flow/xdp_flow_umh.c
@@ -6,9 +6,19 @@
#include <fcntl.h>
#include <unistd.h>
#include <syslog.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include <sys/mman.h>
#include <sys/types.h>
+#include <sys/resource.h>
+#include <linux/hashtable.h>
+#include <linux/err.h>
#include "msgfmt.h"
+extern char xdp_flow_bpf_start;
+extern char xdp_flow_bpf_end;
+int progfile_fd;
+
/* FIXME: syslog is used for easy debugging. As writing /dev/log can be stuck
* due to reader side, should use another log mechanism like kmsg.
*/
@@ -17,15 +27,241 @@
#define pr_warn(fmt, ...) syslog(LOG_DAEMON | LOG_WARNING, fmt, ##__VA_ARGS__)
#define pr_err(fmt, ...) syslog(LOG_DAEMON | LOG_ERR, fmt, ##__VA_ARGS__)
+#define ERRBUF_SIZE 64
+
+/* This key represents a net device */
+struct netdev_info_key {
+ int ifindex;
+};
+
+struct netdev_info {
+ struct netdev_info_key key;
+ struct hlist_node node;
+ struct bpf_object *obj;
+};
+
+DEFINE_HASHTABLE(netdev_info_table, 16);
+
+static int libbpf_err(int err, char *errbuf)
+{
+ libbpf_strerror(err, errbuf, ERRBUF_SIZE);
+
+ if (-err < __LIBBPF_ERRNO__START)
+ return err;
+
+ return -EINVAL;
+}
+
+static int setup(void)
+{
+ size_t size = &xdp_flow_bpf_end - &xdp_flow_bpf_start;
+ struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };
+ ssize_t len;
+ int err;
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ err = -errno;
+ pr_err("setrlimit MEMLOCK failed: %s\n", strerror(errno));
+ return err;
+ }
+
+ progfile_fd = memfd_create("xdp_flow_kern_bpf.o", 0);
+ if (progfile_fd < 0) {
+ err = -errno;
+ pr_err("memfd_create failed: %s\n", strerror(errno));
+ return err;
+ }
+
+ len = write(progfile_fd, &xdp_flow_bpf_start, size);
+ if (len < 0) {
+ err = -errno;
+ pr_err("Failed to write bpf prog: %s\n", strerror(errno));
+ goto err;
+ }
+
+ if (len < size) {
+ pr_err("bpf prog written too short: expected %ld, actual %ld\n",
+ size, len);
+ err = -EIO;
+ goto err;
+ }
+
+ return 0;
+err:
+ close(progfile_fd);
+
+ return err;
+}
+
+static int load_bpf(int ifindex, struct bpf_object **objp)
+{
+ struct bpf_object_open_attr attr = {};
+ char path[256], errbuf[ERRBUF_SIZE];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int prog_fd, err;
+ ssize_t len;
+
+ len = snprintf(path, 256, "/proc/self/fd/%d", progfile_fd);
+ if (len < 0) {
+ err = -errno;
+ pr_err("Failed to setup prog fd path string: %s\n",
+ strerror(errno));
+ return err;
+ }
+
+ attr.file = path;
+ attr.prog_type = BPF_PROG_TYPE_XDP;
+ obj = bpf_object__open_xattr(&attr);
+ if (IS_ERR_OR_NULL(obj)) {
+ if (IS_ERR(obj)) {
+ err = libbpf_err((int)PTR_ERR(obj), errbuf);
+ } else {
+ err = -ENOENT;
+ strerror_r(-err, errbuf, sizeof(errbuf));
+ }
+ pr_err("Cannot open bpf prog: %s\n", errbuf);
+ return err;
+ }
+
+ bpf_object__for_each_program(prog, obj)
+ bpf_program__set_type(prog, attr.prog_type);
+
+ err = bpf_object__load(obj);
+ if (err) {
+ err = libbpf_err(err, errbuf);
+ pr_err("Failed to load bpf prog: %s\n", errbuf);
+ goto err;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, "xdp_flow");
+ if (!prog) {
+ pr_err("Cannot find xdp_flow program\n");
+ err = -ENOENT;
+ goto err;
+ }
+
+ prog_fd = bpf_program__fd(prog);
+ if (prog_fd < 0) {
+ err = libbpf_err(prog_fd, errbuf);
+ pr_err("Invalid program fd: %s\n", errbuf);
+ goto err;
+ }
+
+ *objp = obj;
+
+ return prog_fd;
+err:
+ bpf_object__close(obj);
+ return err;
+}
+
+static int get_netdev_info_keyval(const struct netdev_info_key *key)
+{
+ return key->ifindex;
+}
+
+static struct netdev_info *find_netdev_info(const struct netdev_info_key *key)
+{
+ int keyval = get_netdev_info_keyval(key);
+ struct netdev_info *netdev_info;
+
+ hash_for_each_possible(netdev_info_table, netdev_info, node, keyval) {
+ if (netdev_info->key.ifindex == key->ifindex)
+ return netdev_info;
+ }
+
+ return NULL;
+}
+
+static int get_netdev_info_key(const struct mbox_request *req,
+ struct netdev_info_key *key)
+{
+ key->ifindex = req->ifindex;
+
+ return 0;
+}
+
+static struct netdev_info *get_netdev_info(const struct mbox_request *req)
+{
+ struct netdev_info *netdev_info;
+ struct netdev_info_key key;
+ int err;
+
+ err = get_netdev_info_key(req, &key);
+ if (err)
+ return ERR_PTR(err);
+
+ netdev_info = find_netdev_info(&key);
+ if (!netdev_info) {
+ pr_err("BUG: netdev_info for if %d not found.\n",
+ key.ifindex);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return netdev_info;
+}
+
static int handle_load(const struct mbox_request *req, __u32 *prog_id)
{
- *prog_id = 0;
+ struct netdev_info *netdev_info;
+ struct bpf_prog_info info = {};
+ struct netdev_info_key key;
+ __u32 len = sizeof(info);
+ int err, prog_fd;
+
+ err = get_netdev_info_key(req, &key);
+ if (err)
+ return err;
+
+ netdev_info = find_netdev_info(&key);
+ if (netdev_info)
+ return 0;
+
+ netdev_info = malloc(sizeof(*netdev_info));
+ if (!netdev_info) {
+ pr_err("malloc for netdev_info failed.\n");
+ return -ENOMEM;
+ }
+ netdev_info->key.ifindex = key.ifindex;
+
+ prog_fd = load_bpf(req->ifindex, &netdev_info->obj);
+ if (prog_fd < 0) {
+ err = prog_fd;
+ goto err_netdev_info;
+ }
+
+ err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
+ if (err)
+ goto err_obj;
+
+ *prog_id = info.id;
+ hash_add(netdev_info_table, &netdev_info->node,
+ get_netdev_info_keyval(&netdev_info->key));
+ pr_debug("XDP program for if %d was loaded\n", req->ifindex);
return 0;
+err_obj:
+ bpf_object__close(netdev_info->obj);
+err_netdev_info:
+ free(netdev_info);
+
+ return err;
}
static int handle_unload(const struct mbox_request *req)
{
+ struct netdev_info *netdev_info;
+
+ netdev_info = get_netdev_info(req);
+ if (IS_ERR(netdev_info))
+ return PTR_ERR(netdev_info);
+
+ hash_del(&netdev_info->node);
+ bpf_object__close(netdev_info->obj);
+ free(netdev_info);
+ pr_debug("XDP program for if %d was closed\n", req->ifindex);
+
return 0;
}
@@ -103,7 +339,10 @@ static void loop(void)
int main(void)
{
pr_info("Started xdp_flow\n");
+ if (setup())
+ return -1;
loop();
+ close(progfile_fd);
return 0;
}
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH bpf-next 01/14] xdp_flow: Add skeleton of XDP based TC offload driver
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
Add TC offload driver, xdp_flow_core.c, and skeleton of UMH handling
mechanism. The driver is not called from anywhere yet.
xdp_flow_setup_block() in xdp_flow_core.c is meant to be called when
ingress qdisc is added. It loads xdp_flow kernel module and the kmod
provides some callbacks for setup phase and flow insertion phase.
xdp_flow_setup() in the kmod will be called from xdp_flow_setup_block()
when ingress qdisc is added, and xdp_flow_setup_block_cb() will be
called when a tc flower filter is added.
The former will request the UMH to load the eBPF program and the latter
will request the UMH to populate maps for flow tables. In this patch
no actual processing is implemented; the following commits implement
it.
The overall UMH handling mechanism is modeled on bpfilter.
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
include/net/flow_offload_xdp.h | 33 ++++++
net/Kconfig | 1 +
net/Makefile | 1 +
net/xdp_flow/.gitignore | 1 +
net/xdp_flow/Kconfig | 16 +++
net/xdp_flow/Makefile | 31 +++++
net/xdp_flow/msgfmt.h | 102 ++++++++++++++++
net/xdp_flow/xdp_flow_core.c | 126 ++++++++++++++++++++
net/xdp_flow/xdp_flow_kern_mod.c | 250 +++++++++++++++++++++++++++++++++++++++
net/xdp_flow/xdp_flow_umh.c | 109 +++++++++++++++++
net/xdp_flow/xdp_flow_umh_blob.S | 7 ++
11 files changed, 677 insertions(+)
create mode 100644 include/net/flow_offload_xdp.h
create mode 100644 net/xdp_flow/.gitignore
create mode 100644 net/xdp_flow/Kconfig
create mode 100644 net/xdp_flow/Makefile
create mode 100644 net/xdp_flow/msgfmt.h
create mode 100644 net/xdp_flow/xdp_flow_core.c
create mode 100644 net/xdp_flow/xdp_flow_kern_mod.c
create mode 100644 net/xdp_flow/xdp_flow_umh.c
create mode 100644 net/xdp_flow/xdp_flow_umh_blob.S
diff --git a/include/net/flow_offload_xdp.h b/include/net/flow_offload_xdp.h
new file mode 100644
index 0000000..d04a73d
--- /dev/null
+++ b/include/net/flow_offload_xdp.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FLOW_OFFLOAD_XDP_H
+#define _LINUX_FLOW_OFFLOAD_XDP_H
+
+#include <linux/netdevice.h>
+#include <linux/umh.h>
+#include <net/flow_offload.h>
+
+struct xdp_flow_umh_ops {
+ struct umh_info info;
+ /* serialize access to this object and UMH */
+ struct mutex lock;
+ flow_setup_cb_t *setup_cb;
+ int (*setup)(struct net_device *dev, bool do_bind,
+ struct netlink_ext_ack *extack);
+ int (*start)(void);
+ bool stop;
+ struct module *module;
+};
+
+extern struct xdp_flow_umh_ops xdp_flow_ops;
+
+#ifdef CONFIG_XDP_FLOW
+int xdp_flow_setup_block(struct net_device *dev, struct flow_block_offload *f);
+#else
+static inline int xdp_flow_setup_block(struct net_device *dev,
+ struct flow_block_offload *f)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index 57f51a2..08d36444 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -206,6 +206,7 @@ source "net/bridge/netfilter/Kconfig"
endif
source "net/bpfilter/Kconfig"
+source "net/xdp_flow/Kconfig"
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 449fc0b..b78d1ef 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -87,3 +87,4 @@ endif
obj-$(CONFIG_QRTR) += qrtr/
obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
+obj-$(CONFIG_XDP_FLOW) += xdp_flow/
diff --git a/net/xdp_flow/.gitignore b/net/xdp_flow/.gitignore
new file mode 100644
index 0000000..8cad817
--- /dev/null
+++ b/net/xdp_flow/.gitignore
@@ -0,0 +1 @@
+xdp_flow_umh
diff --git a/net/xdp_flow/Kconfig b/net/xdp_flow/Kconfig
new file mode 100644
index 0000000..82e7bf3
--- /dev/null
+++ b/net/xdp_flow/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig XDP_FLOW
+ bool "XDP based flow offload engine (XDP_FLOW)"
+ depends on NET && BPF_SYSCALL && NET_CLS_FLOWER && MEMFD_CREATE
+ help
+ This builds experimental xdp_flow framework that is aiming to
+ provide flow software offload functionality via XDP
+
+if XDP_FLOW
+config XDP_FLOW_UMH
+ tristate "xdp_flow kernel module with user mode helper"
+ depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC))
+ default m
+ help
+ This builds xdp_flow kernel module with embedded user mode helper
+endif
diff --git a/net/xdp_flow/Makefile b/net/xdp_flow/Makefile
new file mode 100644
index 0000000..f6138c2
--- /dev/null
+++ b/net/xdp_flow/Makefile
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_XDP_FLOW) += xdp_flow_core.o
+
+ifeq ($(CONFIG_XDP_FLOW_UMH), y)
+# builtin xdp_flow_umh should be compiled with -static
+# since rootfs isn't mounted at the time of __init
+# function is called and do_execv won't find elf interpreter
+STATIC := -static
+endif
+
+quiet_cmd_cc_user = CC $@
+ cmd_cc_user = $(CC) -Wall -Wmissing-prototypes -O2 -std=gnu89 \
+ -I$(srctree)/tools/include/ \
+ -c -o $@ $<
+
+quiet_cmd_ld_user = LD $@
+ cmd_ld_user = $(CC) $(STATIC) -o $@ $^
+
+$(obj)/xdp_flow_umh.o: $(src)/xdp_flow_umh.c FORCE
+ $(call if_changed,cc_user)
+
+$(obj)/xdp_flow_umh: $(obj)/xdp_flow_umh.o
+ $(call if_changed,ld_user)
+
+clean-files := xdp_flow_umh
+
+$(obj)/xdp_flow_umh_blob.o: $(obj)/xdp_flow_umh
+
+obj-$(CONFIG_XDP_FLOW_UMH) += xdp_flow.o
+xdp_flow-objs += xdp_flow_kern_mod.o xdp_flow_umh_blob.o
diff --git a/net/xdp_flow/msgfmt.h b/net/xdp_flow/msgfmt.h
new file mode 100644
index 0000000..97d8490
--- /dev/null
+++ b/net/xdp_flow/msgfmt.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_XDP_FLOW_MSGFMT_H
+#define _NET_XDP_FLOW_MSGFMT_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+
+#define MAX_XDP_FLOW_ACTIONS 32
+
+/* Action identifiers stored in xdp_flow_action.id.
+ * Numbering starts at 1 so that a zeroed action is never a valid one.
+ */
+enum xdp_flow_action_id {
+ /* ABORT if 0, i.e. uninitialized */
+ XDP_FLOW_ACTION_ACCEPT = 1,
+ XDP_FLOW_ACTION_DROP,
+ XDP_FLOW_ACTION_REDIRECT,
+ XDP_FLOW_ACTION_VLAN_PUSH,
+ XDP_FLOW_ACTION_VLAN_POP,
+ XDP_FLOW_ACTION_VLAN_MANGLE,
+ XDP_FLOW_ACTION_MANGLE,
+ XDP_FLOW_ACTION_CSUM,
+ /* One past the last action id; not an action itself */
+ NR_XDP_FLOW_ACTION,
+};
+
+/* One action: its id plus the argument for that action type.
+ * Only the union member named in the per-member comment is meaningful for a
+ * given id.
+ */
+struct xdp_flow_action {
+ enum xdp_flow_action_id id;
+ union {
+ int ifindex; /* REDIRECT */
+ struct { /* VLAN */
+ __be16 proto;
+ __be16 tci;
+ } vlan;
+ };
+};
+
+/* Fixed-capacity action list holding up to MAX_XDP_FLOW_ACTIONS entries.
+ * NOTE(review): num_actions presumably counts the valid leading entries of
+ * actions[] - confirm against the code that fills and walks the list.
+ */
+struct xdp_flow_actions {
+ unsigned int num_actions;
+ struct xdp_flow_action actions[MAX_XDP_FLOW_ACTIONS];
+};
+
+/* Packet header fields grouped per protocol layer.
+ * struct xdp_flow uses this layout twice: once for the key and once for the
+ * mask. The whole structure is aligned to the size of a long.
+ */
+struct xdp_flow_key {
+ /* Ethernet header */
+ struct {
+ __u8 dst[ETH_ALEN] __aligned(2);
+ __u8 src[ETH_ALEN] __aligned(2);
+ __be16 type;
+ } eth;
+ /* VLAN tag */
+ struct {
+ __be16 tpid;
+ __be16 tci;
+ } vlan;
+ /* Fields common to IPv4 and IPv6 */
+ struct {
+ __u8 proto;
+ __u8 ttl;
+ __u8 tos;
+ __u8 frag;
+ } ip;
+ /* Addresses; IPv4 and IPv6 share the same storage */
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ipv4;
+ struct {
+ struct in6_addr src;
+ struct in6_addr dst;
+ } ipv6;
+ };
+ /* L4 ports */
+ struct {
+ __be16 src;
+ __be16 dst;
+ } l4port;
+ struct {
+ __be16 flags;
+ } tcp;
+} __aligned(BITS_PER_LONG / 8);
+
+/* A flow entry: match key and mask, the actions attached to it, and its
+ * priority. Embedded in struct mbox_request.
+ */
+struct xdp_flow {
+ struct xdp_flow_key key;
+ struct xdp_flow_key mask;
+ struct xdp_flow_actions actions;
+ __u16 priority;
+};
+
+/* Commands carried in mbox_request.cmd from the kernel module to the UMH.
+ * NOOP is used by the kernel module as a health check after starting the UMH.
+ */
+enum xdp_flow_cmd {
+ XDP_FLOW_CMD_NOOP = 0,
+ XDP_FLOW_CMD_LOAD,
+ XDP_FLOW_CMD_UNLOAD,
+ XDP_FLOW_CMD_REPLACE,
+ XDP_FLOW_CMD_DELETE,
+};
+
+/* Request message written by the kernel module to the UMH pipe.
+ * It is always transferred as one fixed-size struct.
+ */
+struct mbox_request {
+ int ifindex;
+ __u8 cmd; /* one of enum xdp_flow_cmd */
+ struct xdp_flow flow;
+};
+
+/* Reply message written by the UMH for every request it handled. */
+struct mbox_reply {
+ int status; /* result of the command, passed back to the caller */
+ __u32 id; /* filled in by the LOAD handler (bpf prog id) */
+};
+
+#endif
diff --git a/net/xdp_flow/xdp_flow_core.c b/net/xdp_flow/xdp_flow_core.c
new file mode 100644
index 0000000..ab84863
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_core.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/flow_offload_xdp.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <net/pkt_cls.h>
+#include <linux/netdevice.h>
+
+struct xdp_flow_umh_ops xdp_flow_ops;
+EXPORT_SYMBOL_GPL(xdp_flow_ops);
+
+static LIST_HEAD(xdp_block_cb_list);
+
+/* Release callback handed to flow_block_cb_alloc().
+ *
+ * @cb_priv is the net_device that was bound. Asks the setup callback to
+ * unbind it and drops the module reference taken at bind time. The result of
+ * the unbind request is not checked.
+ */
+static void xdp_flow_block_release(void *cb_priv)
+{
+	struct net_device *dev = cb_priv;
+	/* Zero-initialized so the callee never sees stack garbage. Nobody
+	 * reads the extack back here, so any message set is discarded.
+	 */
+	struct netlink_ext_ack extack = {};
+
+	mutex_lock(&xdp_flow_ops.lock);
+	xdp_flow_ops.setup(dev, false, &extack);
+	module_put(xdp_flow_ops.module);
+	mutex_unlock(&xdp_flow_ops.lock);
+}
+
+/* Handle a flow block bind/unbind request for @dev.
+ *
+ * Only clsact ingress blocks requested from the initial network namespace
+ * are accepted. The xdp_flow module is requested when it has not registered
+ * itself yet, and the UMH is started when it is marked as stopped.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int xdp_flow_setup_block(struct net_device *dev, struct flow_block_offload *f)
+{
+ struct flow_block_cb *block_cb;
+ int err = 0;
+
+ /* TODO: Remove this limitation */
+ if (!net_eq(current->nsproxy->net_ns, &init_net))
+ return -EOPNOTSUPP;
+
+ if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&xdp_flow_ops.lock);
+ if (!xdp_flow_ops.module) {
+ /* The lock is dropped around request_module(): load_umh(), the
+ * init function of the xdp_flow module, takes it as well.
+ */
+ mutex_unlock(&xdp_flow_ops.lock);
+ err = request_module("xdp_flow");
+ if (err)
+ return err;
+ mutex_lock(&xdp_flow_ops.lock);
+ /* Still unset: the module did not register its ops */
+ if (!xdp_flow_ops.module) {
+ err = -ECHILD;
+ goto out;
+ }
+ }
+ /* Restart the UMH if it is currently marked as stopped */
+ if (xdp_flow_ops.stop) {
+ err = xdp_flow_ops.start();
+ if (err)
+ goto out;
+ }
+
+ f->driver_block_list = &xdp_block_cb_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(xdp_flow_ops.setup_cb, dev,
+ &xdp_block_cb_list)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ /* Reference is dropped in xdp_flow_block_release() or on the
+ * error paths below.
+ */
+ if (!try_module_get(xdp_flow_ops.module)) {
+ err = -ECHILD;
+ goto out;
+ }
+
+ err = xdp_flow_ops.setup(dev, true, f->extack);
+ if (err) {
+ module_put(xdp_flow_ops.module);
+ goto out;
+ }
+
+ block_cb = flow_block_cb_alloc(xdp_flow_ops.setup_cb, dev, dev,
+ xdp_flow_block_release);
+ if (IS_ERR(block_cb)) {
+ /* Undo the bind done just above */
+ xdp_flow_ops.setup(dev, false, f->extack);
+ module_put(xdp_flow_ops.module);
+ err = PTR_ERR(block_cb);
+ goto out;
+ }
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, &xdp_block_cb_list);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f->block, xdp_flow_ops.setup_cb,
+ dev);
+ if (!block_cb) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /* NOTE(review): the unbind request and module_put() are not done
+ * here; presumably xdp_flow_block_release() runs when the
+ * block_cb is freed by the caller - confirm.
+ */
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ }
+out:
+ mutex_unlock(&xdp_flow_ops.lock);
+
+ return err;
+}
+
+/* Cleanup callback installed as xdp_flow_ops.info.cleanup: marks the UMH as
+ * stopped, clears its pid and releases both pipe files.
+ */
+static void xdp_flow_umh_cleanup(struct umh_info *info)
+{
+	mutex_lock(&xdp_flow_ops.lock);
+
+	/* Forget the helper ... */
+	info->pid = 0;
+	xdp_flow_ops.stop = true;
+
+	/* ... and drop the references to its pipes */
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+
+	mutex_unlock(&xdp_flow_ops.lock);
+}
+
+/* Initialize the shared xdp_flow_ops: lock, UMH command line and cleanup
+ * callback. The UMH starts out marked as stopped.
+ */
+static int __init xdp_flow_init(void)
+{
+	mutex_init(&xdp_flow_ops.lock);
+
+	xdp_flow_ops.info.cleanup = &xdp_flow_umh_cleanup;
+	xdp_flow_ops.info.cmdline = "xdp_flow_umh";
+	xdp_flow_ops.stop = true;
+
+	return 0;
+}
+device_initcall(xdp_flow_init);
diff --git a/net/xdp_flow/xdp_flow_kern_mod.c b/net/xdp_flow/xdp_flow_kern_mod.c
new file mode 100644
index 0000000..823ab65
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_kern_mod.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/sched/signal.h>
+#include <net/pkt_cls.h>
+#include <net/flow_offload_xdp.h>
+#include "msgfmt.h"
+
+extern char xdp_flow_umh_start;
+extern char xdp_flow_umh_end;
+
+/* Send SIGKILL to the UMH process unless it is already marked as stopped. */
+static void shutdown_umh(void)
+{
+	struct task_struct *umh_task;
+
+	if (xdp_flow_ops.stop)
+		return;
+
+	umh_task = get_pid_task(find_vpid(xdp_flow_ops.info.pid), PIDTYPE_PID);
+	if (!umh_task)
+		return;
+
+	send_sig(SIGKILL, umh_task, 1);
+	put_task_struct(umh_task);
+}
+
+/* Send @req to the UMH over the pipe and read back its reply.
+ *
+ * Returns the status reported by the UMH. Returns -EFAULT when no UMH pid is
+ * recorded or when either transfer is short; in the latter case
+ * shutdown_umh() is called. When @id is non-NULL it receives the id field of
+ * the reply.
+ */
+static int transact_umh(struct mbox_request *req, u32 *id)
+{
+	struct mbox_reply reply;
+	int ret = -EFAULT;
+	/* Initialized: it used to be handed to __kernel_write() unset */
+	loff_t pos = 0;
+	ssize_t n;
+
+	if (!xdp_flow_ops.info.pid)
+		goto out;
+
+	n = __kernel_write(xdp_flow_ops.info.pipe_to_umh, req, sizeof(*req),
+			   &pos);
+	if (n != sizeof(*req)) {
+		pr_err("write fail %zd\n", n);
+		shutdown_umh();
+		goto out;
+	}
+
+	pos = 0;
+	n = kernel_read(xdp_flow_ops.info.pipe_from_umh, &reply,
+			sizeof(reply), &pos);
+	if (n != sizeof(reply)) {
+		pr_err("read fail %zd\n", n);
+		shutdown_umh();
+		goto out;
+	}
+
+	ret = reply.status;
+	if (id)
+		*id = reply.id;
+out:
+	return ret;
+}
+
+/* FLOW_CLS_REPLACE handler. Placeholder: always returns -EOPNOTSUPP. */
+static int xdp_flow_replace(struct net_device *dev, struct flow_cls_offload *f)
+{
+ return -EOPNOTSUPP;
+}
+
+/* FLOW_CLS_DESTROY handler. Placeholder: always returns -EOPNOTSUPP.
+ * NOTE(review): unlike xdp_flow_replace() this is not static - confirm
+ * whether external linkage is intended.
+ */
+int xdp_flow_destroy(struct net_device *dev, struct flow_cls_offload *f)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Dispatch a flower classifier offload command for @dev.
+ * Only REPLACE and DESTROY are routed to a handler; every other command
+ * yields -EOPNOTSUPP.
+ */
+static int xdp_flow_setup_flower(struct net_device *dev,
+				 struct flow_cls_offload *f)
+{
+	if (f->command == FLOW_CLS_REPLACE)
+		return xdp_flow_replace(dev, f);
+
+	if (f->command == FLOW_CLS_DESTROY)
+		return xdp_flow_destroy(dev, f);
+
+	/* FLOW_CLS_STATS, FLOW_CLS_TMPLT_CREATE, FLOW_CLS_TMPLT_DESTROY and
+	 * anything unknown are not handled.
+	 */
+	return -EOPNOTSUPP;
+}
+
+/* Flow block callback: @cb_priv is the net_device, @type_data the offload
+ * request. Only flower requests on chain 0 are accepted. The UMH is started
+ * first when it is marked as stopped.
+ */
+static int xdp_flow_setup_block_cb(enum tc_setup_type type, void *type_data,
+				   void *cb_priv)
+{
+	struct flow_cls_common_offload *common = type_data;
+	struct net_device *dev = cb_priv;
+	int ret;
+
+	if (common->chain_index) {
+		NL_SET_ERR_MSG(common->extack,
+			       "xdp_flow supports only offload of chain 0");
+		return -EOPNOTSUPP;
+	}
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&xdp_flow_ops.lock);
+
+	ret = xdp_flow_ops.stop ? xdp_flow_ops.start() : 0;
+	if (!ret)
+		ret = xdp_flow_setup_flower(dev, type_data);
+
+	mutex_unlock(&xdp_flow_ops.lock);
+
+	return ret;
+}
+
+/* Send a LOAD request for @dev to the UMH.
+ * Returns the transaction status, or -ENOMEM when the request cannot be
+ * allocated. @extack is currently unused.
+ */
+static int xdp_flow_setup_bind(struct net_device *dev,
+			       struct netlink_ext_ack *extack)
+{
+	struct mbox_request *request;
+	u32 prog_id = 0;
+	int ret;
+
+	request = kzalloc(sizeof(*request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->ifindex = dev->ifindex;
+	request->cmd = XDP_FLOW_CMD_LOAD;
+
+	/* Load bpf in UMH and get prog id */
+	ret = transact_umh(request, &prog_id);
+	kfree(request);
+
+	/* TODO: prog_id will be used to attach bpf prog to XDP
+	 * As we have rtnl_lock, UMH cannot attach prog to XDP
+	 */
+
+	return ret;
+}
+
+/* Send an UNLOAD request for @dev to the UMH.
+ * Returns the transaction status, or -ENOMEM when the request cannot be
+ * allocated. @extack is currently unused.
+ */
+static int xdp_flow_setup_unbind(struct net_device *dev,
+				 struct netlink_ext_ack *extack)
+{
+	struct mbox_request *request;
+	int ret;
+
+	request = kzalloc(sizeof(*request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->ifindex = dev->ifindex;
+	request->cmd = XDP_FLOW_CMD_UNLOAD;
+
+	ret = transact_umh(request, NULL);
+	kfree(request);
+
+	return ret;
+}
+
+/* Bind (@do_bind true) or unbind (@do_bind false) @dev via the UMH.
+ * Must be called with RTNL held. Devices outside the initial network
+ * namespace are rejected with -EINVAL.
+ */
+static int xdp_flow_setup(struct net_device *dev, bool do_bind,
+			  struct netlink_ext_ack *extack)
+{
+	ASSERT_RTNL();
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return -EINVAL;
+
+	if (do_bind)
+		return xdp_flow_setup_bind(dev, extack);
+
+	return xdp_flow_setup_unbind(dev, extack);
+}
+
+/* Round-trip a NOOP request through the UMH.
+ * Returns the transaction status, or -ENOMEM when the request cannot be
+ * allocated.
+ */
+static int xdp_flow_test(void)
+{
+	struct mbox_request *request;
+	int ret;
+
+	request = kzalloc(sizeof(*request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->cmd = XDP_FLOW_CMD_NOOP;
+
+	ret = transact_umh(request, NULL);
+	kfree(request);
+
+	return ret;
+}
+
+/* Fork the embedded UMH blob and verify that it answers a NOOP request.
+ * Returns 0 on success, the fork error, or -EFAULT when the health check
+ * fails (the UMH is then shut down again).
+ */
+static int start_umh(void)
+{
+	int ret;
+
+	/* fork usermode process */
+	ret = fork_usermode_blob(&xdp_flow_umh_start,
+				 &xdp_flow_umh_end - &xdp_flow_umh_start,
+				 &xdp_flow_ops.info);
+	if (ret)
+		return ret;
+
+	xdp_flow_ops.stop = false;
+	pr_info("Loaded xdp_flow_umh pid %d\n", xdp_flow_ops.info.pid);
+
+	/* health check that usermode process started correctly */
+	if (!xdp_flow_test())
+		return 0;
+
+	shutdown_umh();
+	return -EFAULT;
+}
+
+/* Module init: start the UMH and publish this module's callbacks in
+ * xdp_flow_ops. Fails with -EFAULT when the UMH is not marked as stopped.
+ */
+static int __init load_umh(void)
+{
+	int ret;
+
+	mutex_lock(&xdp_flow_ops.lock);
+
+	if (!xdp_flow_ops.stop) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	ret = start_umh();
+	if (ret)
+		goto unlock;
+
+	/* Publish the ops only after the UMH passed its health check */
+	xdp_flow_ops.module = THIS_MODULE;
+	xdp_flow_ops.start = &start_umh;
+	xdp_flow_ops.setup = &xdp_flow_setup;
+	xdp_flow_ops.setup_cb = &xdp_flow_setup_block_cb;
+unlock:
+	mutex_unlock(&xdp_flow_ops.lock);
+	return ret;
+}
+
+/* Module exit: kill the UMH and withdraw this module's callbacks from
+ * xdp_flow_ops.
+ */
+static void __exit fini_umh(void)
+{
+	mutex_lock(&xdp_flow_ops.lock);
+
+	shutdown_umh();
+
+	xdp_flow_ops.setup_cb = NULL;
+	xdp_flow_ops.setup = NULL;
+	xdp_flow_ops.start = NULL;
+	xdp_flow_ops.module = NULL;
+
+	mutex_unlock(&xdp_flow_ops.lock);
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/xdp_flow/xdp_flow_umh.c b/net/xdp_flow/xdp_flow_umh.c
new file mode 100644
index 0000000..6729bdf
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_umh.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <sys/types.h>
+#include "msgfmt.h"
+
+/* FIXME: syslog is used for easy debugging. As writing /dev/log can be stuck
+ * due to reader side, should use another log mechanism like kmsg.
+ */
+#define pr_debug(fmt, ...) syslog(LOG_DAEMON | LOG_DEBUG, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) syslog(LOG_DAEMON | LOG_INFO, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) syslog(LOG_DAEMON | LOG_WARNING, fmt, ##__VA_ARGS__)
+#define pr_err(fmt, ...) syslog(LOG_DAEMON | LOG_ERR, fmt, ##__VA_ARGS__)
+
+/* XDP_FLOW_CMD_LOAD handler. Placeholder: reports prog id 0 and success. */
+static int handle_load(const struct mbox_request *req, __u32 *prog_id)
+{
+ *prog_id = 0;
+
+ return 0;
+}
+
+/* XDP_FLOW_CMD_UNLOAD handler. Placeholder: always succeeds. */
+static int handle_unload(const struct mbox_request *req)
+{
+ return 0;
+}
+
+/* XDP_FLOW_CMD_REPLACE handler. Placeholder: always returns -EOPNOTSUPP. */
+static int handle_replace(struct mbox_request *req)
+{
+ return -EOPNOTSUPP;
+}
+
+/* XDP_FLOW_CMD_DELETE handler. Placeholder: always returns -EOPNOTSUPP. */
+static int handle_delete(const struct mbox_request *req)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Request loop of the UMH.
+ *
+ * Reads one struct mbox_request at a time from fd 0, dispatches it on its
+ * cmd field and writes one struct mbox_reply to fd 1. The loop ends on any
+ * read/write error or on a transfer that is not exactly one message long.
+ */
+static void loop(void)
+{
+	struct mbox_request *req;
+
+	req = malloc(sizeof(*req));
+	if (!req) {
+		pr_err("Memory allocation for mbox_request failed\n");
+		return;
+	}
+
+	while (1) {
+		/* Zeroed for every request: only the LOAD handler fills in
+		 * reply.id, and the whole struct is written to the pipe.
+		 */
+		struct mbox_reply reply = { 0 };
+		int n;
+
+		n = read(0, req, sizeof(*req));
+		if (n < 0) {
+			pr_err("read for mbox_request failed: %s\n",
+			       strerror(errno));
+			break;
+		}
+		if (n != sizeof(*req)) {
+			pr_err("Invalid request size %d\n", n);
+			break;
+		}
+
+		switch (req->cmd) {
+		case XDP_FLOW_CMD_NOOP:
+			reply.status = 0;
+			break;
+		case XDP_FLOW_CMD_LOAD:
+			reply.status = handle_load(req, &reply.id);
+			break;
+		case XDP_FLOW_CMD_UNLOAD:
+			reply.status = handle_unload(req);
+			break;
+		case XDP_FLOW_CMD_REPLACE:
+			reply.status = handle_replace(req);
+			break;
+		case XDP_FLOW_CMD_DELETE:
+			reply.status = handle_delete(req);
+			break;
+		default:
+			pr_err("Invalid command %d\n", req->cmd);
+			reply.status = -EOPNOTSUPP;
+		}
+
+		n = write(1, &reply, sizeof(reply));
+		if (n < 0) {
+			pr_err("write for mbox_reply failed: %s\n",
+			       strerror(errno));
+			break;
+		}
+		if (n != sizeof(reply)) {
+			pr_err("reply written too short: %d\n", n);
+			break;
+		}
+	}
+
+	free(req);
+}
+
+/* UMH entry point: log startup, then serve requests until loop() returns. */
+int main(void)
+{
+ pr_info("Started xdp_flow\n");
+ loop();
+
+ return 0;
+}
diff --git a/net/xdp_flow/xdp_flow_umh_blob.S b/net/xdp_flow/xdp_flow_umh_blob.S
new file mode 100644
index 0000000..6edcb0e
--- /dev/null
+++ b/net/xdp_flow/xdp_flow_umh_blob.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .section .rodata, "a"
+ .global xdp_flow_umh_start
+xdp_flow_umh_start:
+ .incbin "net/xdp_flow/xdp_flow_umh"
+ .global xdp_flow_umh_end
+xdp_flow_umh_end:
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Toshiaki Makita @ 2019-08-13 12:05 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko
Cc: Toshiaki Makita, netdev, bpf, William Tu
This is a rough PoC for an idea to offload TC flower to XDP.
* Motivation
The purpose is to speed up software TC flower by using XDP.
I chose TC flower because my current interest is in OVS. OVS uses TC to
offload flow tables to hardware, so if TC can offload flows to XDP, OVS
also can be offloaded to XDP.
When TC flower filter is offloaded to XDP, the received packets are
handled by XDP first, and if their protocol or something is not
supported by the eBPF program, the program returns XDP_PASS and packets
are passed to upper layer TC.
The packet processing flow will be like this when this mechanism,
xdp_flow, is used with OVS.
+-------------+
| openvswitch |
| kmod |
+-------------+
^
| if not match in filters (flow key or action not supported by TC)
+-------------+
| TC flower |
+-------------+
^
| if not match in flow tables (flow key or action not supported by XDP)
+-------------+
| XDP prog |
+-------------+
^
| incoming packets
Of course we can directly use TC flower without OVS to speed up TC.
This is useful especially when the device does not support HW-offload.
Such interfaces include virtual interfaces like veth.
* How to use
It only supports ingress (clsact) flower filter at this point.
Enable the feature via ethtool before adding ingress/clsact qdisc.
$ ethtool -K eth0 tc-offload-xdp on
Then add qdisc/filters as normal.
$ tc qdisc add dev eth0 clsact
$ tc filter add dev eth0 ingress protocol ip flower skip_sw ...
Alternatively, when using OVS, adding qdisc and filters will be
automatically done by setting hw-offload.
$ ovs-vsctl set Open_vSwitch . other_config:hw-offload=true
$ systemctl stop openvswitch
$ tc qdisc del dev eth0 ingress # or reboot
$ ethtool -K eth0 tc-offload-xdp on
$ systemctl start openvswitch
* Performance
I measured drop rate at veth interface with redirect action from physical
interface (i40e 25G NIC, XXV 710) to veth. The CPU is Xeon Silver 4114
(2.20 GHz).
XDP_DROP
+------+ +-------+ +-------+
pktgen -- wire --> | eth0 | -- TC/OVS redirect --> | veth0 |----| veth1 |
+------+ (offloaded to XDP) +-------+ +-------+
The setup for redirect is done by OVS like this.
$ ovs-vsctl add-br ovsbr0
$ ovs-vsctl add-port ovsbr0 eth0
$ ovs-vsctl add-port ovsbr0 veth0
$ ovs-vsctl set Open_vSwitch . other_config:hw-offload=true
$ systemctl stop openvswitch
$ tc qdisc del dev eth0 ingress
$ tc qdisc del dev veth0 ingress
$ ethtool -K eth0 tc-offload-xdp on
$ ethtool -K veth0 tc-offload-xdp on
$ systemctl start openvswitch
Tested single core/single flow with 3 configurations.
- xdp_flow: hw-offload=true, tc-offload-xdp on
- TC: hw-offload=true, tc-offload-xdp off (software TC)
- ovs kmod: hw-offload=false
xdp_flow TC ovs kmod
-------- -------- --------
4.0 Mpps 1.1 Mpps 1.1 Mpps
So xdp_flow drop rate is roughly 4x faster than software TC or ovs kmod.
OTOH the time to add a flow increases with xdp_flow.
ping latency of first packet when veth1 does XDP_PASS instead of DROP:
xdp_flow TC ovs kmod
-------- -------- --------
25ms 12ms 0.6ms
xdp_flow does a lot of work to emulate TC behavior including UMH
transaction and multiple bpf map update from UMH which I think increases
the latency.
* Implementation
xdp_flow makes use of UMH to load an eBPF program for XDP, similar to
bpfilter. The difference is that xdp_flow does not generate the eBPF
program dynamically but a prebuilt program is embedded in UMH. This is
mainly because flow insertion is considerably frequent. If we generate
and load an eBPF program on each insertion of a flow, the latency of the
first packet of ping in the above test will increase, which I want to avoid.
+----------------------+
| xdp_flow_umh | load eBPF prog for XDP
| (eBPF prog embedded) | update maps for flow tables
+----------------------+
^ |
request | v eBPF prog id
+-----------+ offload +-----------------------+
| TC flower | --------> | xdp_flow kmod | attach the prog to XDP
+-----------+ | (flow offload driver) |
+-----------------------+
- When ingress/clsact qdisc is created, i.e. a device is bound to a flow
block, xdp_flow kmod requests xdp_flow_umh to load eBPF prog.
xdp_flow_umh returns the prog id and xdp_flow kmod attaches the prog to XDP
(the reason of attaching XDP from kmod is that rtnl_lock is held here).
- When flower filter is added, xdp_flow kmod requests xdp_flow_umh to
update maps for flow tables.
* Patches
- patch 1
Basic framework for xdp_flow kmod and UMH.
- patch 2
Add prebuilt eBPF program embedded in UMH.
- patch 3, 4
Attach the prog to XDP in kmod after using the prog id returned from
UMH.
- patch 5, 6
Add maps for flow tables and flow table manipulation logic in UMH.
- patch 7
Implement flow lookup and basic actions in eBPF prog.
- patch 8
Implement flow manipulation logic, serialize flow key and actions from
TC flower and make requests to UMH in kmod.
- patch 9
Add tc-offload-xdp netdev feature and hooks to call xdp_flow kmod in
TC flower offload code.
- patch 10, 11
Add example actions, redirect and vlan_push.
- patch 12
Add testcase for xdp_flow.
- patch 13, 14
These are unrelated patches. They just improve the XDP program's
performance. They are included to demonstrate to what extent xdp_flow
performance can increase. Without them, drop rate goes down from 4Mpps
to 3Mpps.
* About OVS AF_XDP netdev
Recently OVS has added AF_XDP netdev type support. This also makes use
of XDP, but in some ways different from this patch set.
- AF_XDP work originally started in order to bring BPF's flexibility to
OVS, which enables us to upgrade datapath without updating kernel.
AF_XDP solution uses userland datapath so it achieved its goal.
xdp_flow will not replace OVS datapath completely, but offload it
partially just for speed up.
- OVS AF_XDP requires PMD for the best performance so consumes 100% CPU.
- OVS AF_XDP needs packet copy when forwarding packets.
- xdp_flow can be used not only for OVS. It works for direct use of TC
flower. nftables also can be offloaded by the same mechanism in the
future.
* About alternative userland (ovs-vswitchd etc.) implementation
Maybe a similar logic can be implemented in ovs-vswitchd offload
mechanism, instead of adding code to kernel. I just thought offloading
TC is more generic and allows wider usage with direct TC command.
For example, considering that OVS inserts a flow to kernel only when
flow miss happens in kernel, we can in advance add offloaded flows via
tc filter to avoid flow insertion latency for certain sensitive flows.
TC flower usage without using OVS is also possible.
Also as written above nftables can be offloaded to XDP with this
mechanism as well.
* Note
This patch set is based on top of commit a664a834579a ("tools: bpftool:
fix reading from /proc/config.gz").
Any feedback is welcome.
Thanks!
Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
Toshiaki Makita (14):
xdp_flow: Add skeleton of XDP based TC offload driver
xdp_flow: Add skeleton bpf program for XDP
bpf: Add API to get program from id
xdp_flow: Attach bpf prog to XDP in kernel after UMH loaded program
xdp_flow: Prepare flow tables in bpf
xdp_flow: Add flow entry insertion/deletion logic in UMH
xdp_flow: Add flow handling and basic actions in bpf prog
xdp_flow: Implement flow replacement/deletion logic in xdp_flow kmod
xdp_flow: Add netdev feature for enabling TC flower offload to XDP
xdp_flow: Implement redirect action
xdp_flow: Implement vlan_push action
bpf, selftest: Add test for xdp_flow
i40e: prefetch xdp->data before running XDP prog
bpf, hashtab: Compare keys in long
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 +
include/linux/bpf.h | 6 +
include/linux/netdev_features.h | 2 +
include/linux/netdevice.h | 4 +
include/net/flow_offload_xdp.h | 33 +
include/net/pkt_cls.h | 5 +
include/net/sch_generic.h | 1 +
kernel/bpf/hashtab.c | 27 +-
kernel/bpf/syscall.c | 26 +-
net/Kconfig | 1 +
net/Makefile | 1 +
net/core/dev.c | 13 +-
net/core/ethtool.c | 1 +
net/sched/cls_api.c | 67 +-
net/xdp_flow/.gitignore | 1 +
net/xdp_flow/Kconfig | 16 +
net/xdp_flow/Makefile | 112 +++
net/xdp_flow/msgfmt.h | 102 +++
net/xdp_flow/umh_bpf.h | 34 +
net/xdp_flow/xdp_flow_core.c | 126 ++++
net/xdp_flow/xdp_flow_kern_bpf.c | 358 +++++++++
net/xdp_flow/xdp_flow_kern_bpf_blob.S | 7 +
net/xdp_flow/xdp_flow_kern_mod.c | 645 ++++++++++++++++
net/xdp_flow/xdp_flow_umh.c | 1034 ++++++++++++++++++++++++++
net/xdp_flow/xdp_flow_umh_blob.S | 7 +
tools/testing/selftests/bpf/Makefile | 1 +
tools/testing/selftests/bpf/test_xdp_flow.sh | 103 +++
27 files changed, 2716 insertions(+), 18 deletions(-)
create mode 100644 include/net/flow_offload_xdp.h
create mode 100644 net/xdp_flow/.gitignore
create mode 100644 net/xdp_flow/Kconfig
create mode 100644 net/xdp_flow/Makefile
create mode 100644 net/xdp_flow/msgfmt.h
create mode 100644 net/xdp_flow/umh_bpf.h
create mode 100644 net/xdp_flow/xdp_flow_core.c
create mode 100644 net/xdp_flow/xdp_flow_kern_bpf.c
create mode 100644 net/xdp_flow/xdp_flow_kern_bpf_blob.S
create mode 100644 net/xdp_flow/xdp_flow_kern_mod.c
create mode 100644 net/xdp_flow/xdp_flow_umh.c
create mode 100644 net/xdp_flow/xdp_flow_umh_blob.S
create mode 100755 tools/testing/selftests/bpf/test_xdp_flow.sh
--
1.8.3.1
^ permalink raw reply
* [PATCHv3] zd1211rw: remove false assertion from zd_mac_clear()
From: Oliver Neukum @ 2019-08-13 12:04 UTC (permalink / raw)
To: davem, netdev, dsd, kune, kvalo, linux-wireless; +Cc: Oliver Neukum
The function is called before the lock which is asserted was ever used.
Just remove it.
V2: correct CCs
V3: correct name
Reported-by: syzbot+74c65761783d66a9c97c@syzkaller.appspotmail.com
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
drivers/net/wireless/zydas/zd1211rw/zd_mac.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index da7e63fca9f5..a9999d10ae81 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -223,7 +223,6 @@ void zd_mac_clear(struct zd_mac *mac)
{
flush_workqueue(zd_workqueue);
zd_chip_clear(&mac->chip);
- lockdep_assert_held(&mac->lock);
ZD_MEMCLEAR(mac, sizeof(struct zd_mac));
}
--
2.16.4
^ permalink raw reply related
* [PATCHv2] zdnet: remove false assertion from zd_mac_clear()
From: Oliver Neukum @ 2019-08-13 12:01 UTC (permalink / raw)
To: davem, netdev, dsd, kune, kvalo, linux-wireless; +Cc: Oliver Neukum
The function is called before the lock which is asserted was ever used.
Just remove it.
V2: correct CCs
Reported-by: syzbot+74c65761783d66a9c97c@syzkaller.appspotmail.com
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
drivers/net/wireless/zydas/zd1211rw/zd_mac.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index da7e63fca9f5..a9999d10ae81 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -223,7 +223,6 @@ void zd_mac_clear(struct zd_mac *mac)
{
flush_workqueue(zd_workqueue);
zd_chip_clear(&mac->chip);
- lockdep_assert_held(&mac->lock);
ZD_MEMCLEAR(mac, sizeof(struct zd_mac));
}
--
2.16.4
^ permalink raw reply related
* Re: [PATCH V5 0/9] Fixes for vhost metadata acceleration
From: Jason Gunthorpe @ 2019-08-13 11:57 UTC (permalink / raw)
To: Jason Wang
Cc: Michael S. Tsirkin, kvm, virtualization, netdev, linux-kernel,
linux-mm
In-Reply-To: <9a9641fe-b48f-f32a-eecc-af9c2f4fbe0e@redhat.com>
On Tue, Aug 13, 2019 at 04:31:07PM +0800, Jason Wang wrote:
> What kind of issues do you see? Spinlock is to synchronize GUP with MMU
> notifier in this series.
A GUP that can't sleep can't pagefault which makes it a really weird
pattern
> Btw, back to the original question. May I know why synchronize_rcu() is not
> suitable? Consider:
We already went over this. You'd need to determine it doesn't somehow
deadlock the mm on reclaim paths. Maybe it is OK, the rcu_gp_wq is
marked WQ_MEM_RECLAIM at least..
I also think Michael was concerned about the latency spikes a long RCU
delay would cause.
Jason
^ permalink raw reply
* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Igor Russkikh @ 2019-08-13 11:46 UTC (permalink / raw)
To: Andrew Lunn, Antoine Tenart
Cc: davem@davemloft.net, sd@queasysnail.net, f.fainelli@gmail.com,
hkallweit1@gmail.com, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, thomas.petazzoni@bootlin.com,
alexandre.belloni@bootlin.com, allan.nielsen@microchip.com,
camelia.groza@nxp.com, Simon Edelhaus
In-Reply-To: <20190810163423.GA30120@lunn.ch>
On 10.08.2019 19:34, Andrew Lunn wrote:
> On Thu, Aug 08, 2019 at 04:05:57PM +0200, Antoine Tenart wrote:
>> The MACsec configuration is passed to device drivers supporting it
>> through macsec_hw_offload() which is called from the MACsec genl
>> helpers. This function calls the macsec ops of PHY and Ethernet
>> drivers in two steps
>
> Hi Antoine, Igor
>
> It is great that you are thinking how a MAC driver would make use of
> this. But on the flip side, we don't usually add an API unless there is
> a user. And as far as i see, you only add a PHY level implementation,
> not a MAC level.
>
> Igor, what is your interest here? I know the Aquantia PHY can do
> MACsec, but i guess you are more interested in the atlantic and AQC111
> MAC drivers which hide the PHY behind firmware rather than make use of
> the Linux aquantia PHY driver. Are you likely to be contributing a MAC
> driver level implementation of MACsec soon?
Hi Andrew,
Yes, we are interested in MAC level MACSec offload implementation.
Although in our solution macsec engine itself is in Phy, we do
actively use firmware support in areas of configuration, interrupt management.
So from SW perspective that'll be MAC driver level macsec.
Regards,
Igor
^ permalink raw reply
* Re: [PATCH] MAINTAINERS: PHY LIBRARY: Remove sysfs-bus-mdio record
From: Andrew Lunn @ 2019-08-13 11:45 UTC (permalink / raw)
To: Denis Efremov
Cc: linux-kernel, joe, Florian Fainelli, David S . Miller,
Heiner Kallweit, netdev
In-Reply-To: <20190813061439.17529-1-efremov@linux.com>
On Tue, Aug 13, 2019 at 09:14:39AM +0300, Denis Efremov wrote:
> Update MAINTAINERS to reflect that sysfs-bus-mdio documentation
> was removed.
>
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Andrew Lunn <andrew@lunn.ch>
> Cc: Heiner Kallweit <hkallweit1@gmail.com>
> Cc: netdev@vger.kernel.org
> Fixes: a6cd0d2d493a ("Documentation: net-sysfs: Remove duplicate PHY device documentation")
> Signed-off-by: Denis Efremov <efremov@linux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Andrew
^ permalink raw reply
* Re: [PATCH net-next v1 0/8] netfilter: header compilation fixes
From: Jeremy Sowden @ 2019-08-13 11:36 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: Netfilter Devel, Net Dev, Masahiro Yamada
In-Reply-To: <20190813101403.ly5z5q6xvyno3xdd@salvia>
[-- Attachment #1: Type: text/plain, Size: 1253 bytes --]
On 2019-08-13, at 12:14:03 +0200, Pablo Neira Ayuso wrote:
> On Tue, Aug 13, 2019 at 11:04:24AM +0100, Jeremy Sowden wrote:
> > On 2019-08-13, at 11:55:29 +0200, Pablo Neira Ayuso wrote:
> > > Would you mind if - before pushing this out - I do this string
> > > replacement for the patch subject?
> > >
> > > s/added/add
> > > s/removed/remove
> > > s/inlined/inline
> > >
> > > I was told present tense is preferred for description. Otherwise, I'll
> > > leave them as is.
> >
> > I adopted past tenses because at the point at which one is reading
> > the description of a commit, one is usually reading about old
> > behaviour and what has been done to change it. However, I wasn't
> > aware that there was a preference and I am happy to switch to the
> > present tense instead, so by all means feel free to change them.
>
> This is not in the Documentation tree, or I could not find this in a
> quick git grep:
>
> https://kernelnewbies.org/PatchPhilosophy
>
> "In patch descriptions and in the subject, it is common and preferable
> to use present-tense, imperative language. Write as if you are telling
> git what to do with your patch."
>
> I remember though that maintainers have been asking for this in the
> past.
Thanks for the pointer.
J.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [PATCH net] netlink: Fix nlmsg_parse as a wrapper for strict message parsing
From: Eric Dumazet @ 2019-08-13 11:31 UTC (permalink / raw)
To: David Ahern, davem; +Cc: netdev, johannes.berg, edumazet, David Ahern
In-Reply-To: <20190812200707.25587-1-dsahern@kernel.org>
On 8/12/19 10:07 PM, David Ahern wrote:
> From: David Ahern <dsahern@gmail.com>
>
> Eric reported a syzbot warning:
>
>
> The root cause is nlmsg_parse calling __nla_parse which means the
> header struct size is not checked.
>
> nlmsg_parse should be a wrapper around __nlmsg_parse with
> NL_VALIDATE_STRICT for the validate argument very much like
> nlmsg_parse_deprecated is for NL_VALIDATE_LIBERAL.
>
> Fixes: 3de6440354465 ("netlink: re-add parse/validate functions in strict mode")
> Reported-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
Thanks !
^ permalink raw reply
* Re: [PATCH 2/2] net: gmii2rgmii: Switch priv field in mdio device structure
From: Harini Katakam @ 2019-08-13 11:16 UTC (permalink / raw)
To: Andrew Lunn
Cc: Harini Katakam, Florian Fainelli, Heiner Kallweit, David Miller,
Michal Simek, netdev, linux-arm-kernel, linux-kernel,
radhey.shyam.pandey
In-Reply-To: <20190801040648.GJ2713@lunn.ch>
Hi Andrew,
On Thu, Aug 1, 2019 at 9:36 AM Andrew Lunn <andrew@lunn.ch> wrote:
>
> On Wed, Jul 31, 2019 at 03:06:19PM +0530, Harini Katakam wrote:
> > Use the priv field in mdio device structure instead of the one in
> > phy device structure. The phy device priv field may be used by the
> > external phy driver and should not be overwritten.
>
> Hi Harini
>
> I _think_ you could use dev_set_drvdata(&mdiodev->dev) in xgmiitorgmii_probe() and
> dev_get_drvdata(&phydev->mdio.dev) in _read_status()
Thanks for the review. This works if I do:
dev_set_drvdata(&priv->phy_dev->mdio.dev->dev) in probe
and then
dev_get_drvdata(&phydev->mdio.dev) in _read_status()
i.e mdiodev in gmii2rgmii probe and priv->phy_dev->mdio are not the same.
If this is acceptable, I can send a v2.
Regards,
Harini
^ permalink raw reply
* Re: [PATCH] virtio-net: parameterize min ring num_free for virtio receive
From: Michael S. Tsirkin @ 2019-08-13 10:55 UTC (permalink / raw)
To: 冉 jiang
Cc: Jason Wang, davem@davemloft.net, ast@kernel.org,
daniel@iogearbox.net, jakub.kicinski@netronome.com,
hawk@kernel.org, john.fastabend@gmail.com, kafai@fb.com,
songliubraving@fb.com, yhs@fb.com,
virtualization@lists.linux-foundation.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, xdp-newbies@vger.kernel.org,
bpf@vger.kernel.org, jiangran.jr@alibaba-inc.com
In-Reply-To: <DM6PR14MB3212E9CD5E95249564B8FBCFA6C70@DM6PR14MB3212.namprd14.prod.outlook.com>
On Tue, Jul 23, 2019 at 12:05:03PM +0000, 冉 jiang wrote:
>
> On 2019/7/20 0:13, Michael S. Tsirkin wrote:
> > On Fri, Jul 19, 2019 at 03:31:29PM +0000, 冉 jiang wrote:
> >> On 2019/7/19 22:29, Jiang wrote:
> >>> On 2019/7/19 10:36, Jason Wang wrote:
> >>>> On 2019/7/18 下午10:43, Michael S. Tsirkin wrote:
> >>>>> On Thu, Jul 18, 2019 at 10:42:47AM -0400, Michael S. Tsirkin wrote:
> >>>>>> On Thu, Jul 18, 2019 at 10:01:05PM +0800, Jason Wang wrote:
> >>>>>>> On 2019/7/18 下午9:04, Michael S. Tsirkin wrote:
> >>>>>>>> On Thu, Jul 18, 2019 at 12:55:50PM +0000, ? jiang wrote:
> >>>>>>>>> This change makes ring buffer reclaim threshold num_free
> >>>>>>>>> configurable
> >>>>>>>>> for better performance, while it's hard coded as 1/2 * queue now.
> >>>>>>>>> According to our test with qemu + dpdk, packet dropping happens
> >>>>>>>>> when
> >>>>>>>>> the guest is not able to provide free buffer in avail ring timely.
> >>>>>>>>> Smaller value of num_free does decrease the number of packet
> >>>>>>>>> dropping
> >>>>>>>>> during our test as it makes virtio_net reclaim buffer earlier.
> >>>>>>>>>
> >>>>>>>>> At least, we should leave the value changeable to user while the
> >>>>>>>>> default value as 1/2 * queue is kept.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: jiangkidd<jiangkidd@hotmail.com>
> >>>>>>>> That would be one reason, but I suspect it's not the
> >>>>>>>> true one. If you need more buffer due to jitter
> >>>>>>>> then just increase the queue size. Would be cleaner.
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> However are you sure this is the reason for
> >>>>>>>> packet drops? Do you see them dropped by dpdk
> >>>>>>>> due to lack of space in the ring? As opposed to
> >>>>>>>> by guest?
> >>>>>>>>
> >>>>>>>>
> >>>>>>> Besides those, this patch depends on the user to choose a suitable
> >>>>>>> threshold
> >>>>>>> which is not good. You need either a good value with demonstrated
> >>>>>>> numbers or
> >>>>>>> something smarter.
> >>>>>>>
> >>>>>>> Thanks
> >>>>>> I do however think that we have a problem right now: try_fill_recv can
> >>>>>> take up a long time during which net stack does not run at all.
> >>>>>> Imagine
> >>>>>> a 1K queue - we are talking 512 packets. That's excessive.
> >>>>
> >>>> Yes, we will starve a fast host in this case.
> >>>>
> >>>>
> >>>>>> napi poll
> >>>>>> weight solves a similar problem, so it might make sense to cap this at
> >>>>>> napi_poll_weight.
> >>>>>>
> >>>>>> Which will allow tweaking it through a module parameter as a
> >>>>>> side effect :) Maybe just do NAPI_POLL_WEIGHT.
> >>>>> Or maybe NAPI_POLL_WEIGHT/2 like we do at half the queue ;). Please
> >>>>> experiment, measure performance and let the list know
> >>>>>
> >>>>>> Need to be careful though: queues can also be small and I don't
> >>>>>> think we
> >>>>>> want to exceed queue size / 2, or maybe queue size - napi_poll_weight.
> >>>>>> Definitely must not exceed the full queue size.
> >>>>
> >>>> Looking at intel, it uses 16 and i40e uses 32. It looks to me
> >>>> NAPI_POLL_WEIGHT/2 is better.
> >>>>
> >>>> Jiang, want to try that and post a new patch?
> >>>>
> >>>> Thanks
> >>>>
> >>>>
> >>>>>> --
> >>>>>> MST
> >>> We did have completed several rounds of test with setting the value to
> >>> budget (64 as the default value). It does improve a lot with pps is
> >>> below 400pps for a single stream. Let me consolidate the data and will
> >>> send it soon. Actually, we are confident that it runs out of free
> >>> buffer in avail ring when packet dropping happens with below systemtap:
> >>>
> >>> Just a snippet:
> >>>
> >>> probe module("virtio_ring").function("virtqueue_get_buf")
> >>> {
> >>> x = (@cast($_vq, "vring_virtqueue")->vring->used->idx)-
> >>> (@cast($_vq, "vring_virtqueue")->last_used_idx) ---> we use this one
> >>> to verify if the queue is full, which means guest is not able to take
> >>> buffer from the queue timely
> >>>
> >>> if (x<0 && (x+65535)<4096)
> >>> x = x+65535
> >>>
> >>> if((x==1024) && @cast($_vq, "vring_virtqueue")->vq->callback ==
> >>> callback_addr)
> >>> netrxcount[x] <<< gettimeofday_s()
> >>> }
> >>>
> >>>
> >>> probe module("virtio_ring").function("virtqueue_add_inbuf")
> >>> {
> >>> y = (@cast($vq, "vring_virtqueue")->vring->avail->idx)-
> >>> (@cast($vq, "vring_virtqueue")->vring->used->idx) ---> we use this one
> >>> to verify if we run out of free buffer in avail ring
> >>> if (y<0 && (y+65535)<4096)
> >>> y = y+65535
> >>>
> >>> if(@2=="debugon")
> >>> {
> >>> if(y==0 && @cast($vq, "vring_virtqueue")->vq->callback ==
> >>> callback_addr)
> >>> {
> >>> netrxfreecount[y] <<< gettimeofday_s()
> >>>
> >>> printf("no avail ring left seen, printing most recent 5
> >>> num free, vq: %lx, current index: %d\n", $vq, recentfreecount)
> >>> for(i=recentfreecount; i!=((recentfreecount+4) % 5);
> >>> i=((i+1) % 5))
> >>> {
> >>> printf("index: %d, num free: %d\n", i, recentfree[$vq,
> >>> i])
> >>> }
> >>>
> >>> printf("index: %d, num free: %d\n", i, recentfree[$vq, i])
> >>> //exit()
> >>> }
> >>> }
> >>> }
> >>>
> >>>
> >>> probe
> >>> module("virtio_net").statement("virtnet_receive@drivers/net/virtio_net.c:732")
> >>> {
> >>> recentfreecount++
> >>> recentfreecount = recentfreecount % 5
> >>> recentfree[$rq->vq, recentfreecount] = $rq->vq->num_free --->
> >>> record the num_free for the last 5 calls to virtnet_receive, so we can
> >>> see if lowering the bar helps.
> >>> }
> >>>
> >>>
> >>> Here is the result:
> >>>
> >>> no avail ring left seen, printing most recent 5 num free, vq:
> >>> ffff9c13c1200000, current index: 1
> >>> index: 1, num free: 561
> >>> index: 2, num free: 305
> >>> index: 3, num free: 369
> >>> index: 4, num free: 433
> >>> index: 0, num free: 497
> >>> no avail ring left seen, printing most recent 5 num free, vq:
> >>> ffff9c13c1200000, current index: 1
> >>> index: 1, num free: 543
> >>> index: 2, num free: 463
> >>> index: 3, num free: 469
> >>> index: 4, num free: 476
> >>> index: 0, num free: 479
> >>> no avail ring left seen, printing most recent 5 num free, vq:
> >>> ffff9c13c1200000, current index: 2
> >>> index: 2, num free: 555
> >>> index: 3, num free: 414
> >>> index: 4, num free: 420
> >>> index: 0, num free: 427
> >>> index: 1, num free: 491
> >>>
> >>> You can see in the last 4 calls to virtnet_receive before we run out
> >>> of free buffer and start to reclaim, num_free is quite high. So if we
> >>> can do the reclaim earlier, it will certainly help.
> >>>
> >>> Meanwhile, the patch I proposed actually keeps the default value as
> >>> 1/2 * queue. So the default behavior remains and only leave the
> >>> interface to advanced users, who really understands what they are
> >>> doing. Also, the best value may vary in different environment. Do you
> >>> still think hardcoding this is better option?
> >>>
> >>>
> >>> Jiang
> >>>
> >> Here is the snippet from our test result. Test1 was done with default
> >> driver with the value of 1/2 * queue, while test2 is with my patch and
> >> min_numfree set to 64 (the default budget value). We can see average
> >> drop packets do decrease a lot in test2. Let me know if you need the
> >> full testing data.
> >>
> >> test1Time avgDropPackets test2Time avgDropPackets pps
> >>
> >>> 16:21.0 12.295 56:50.4 0 300k
> >>> 17:19.1 15.244 56:50.4 0 300k
> >>> 18:17.5 18.789 56:50.4 0 300k
> >>> 19:15.1 14.208 56:50.4 0 300k
> >>> 20:13.2 20.818 56:50.4 0.267 300k
> >>> 21:11.2 12.397 56:50.4 0 300k
> >>> 22:09.3 12.599 56:50.4 0 300k
> >>> 23:07.3 15.531 57:48.4 0 300k
> >>> 24:05.5 13.664 58:46.5 0 300k
> >>> 25:03.7 13.158 59:44.5 4.73 300k
> >>> 26:01.1 2.486 00:42.6 0 300k
> >>> 26:59.1 11.241 01:40.6 0 300k
> >>> 27:57.2 20.521 02:38.6 0 300k
> >>> 28:55.2 30.094 03:36.7 0 300k
> >>> 29:53.3 16.828 04:34.7 0.963 300k
> >>> 30:51.3 46.916 05:32.8 0 400k
> >>> 31:49.3 56.214 05:32.8 0 400k
> >>> 32:47.3 58.69 05:32.8 0 400k
> >>> 33:45.3 61.486 05:32.8 0 400k
> >>> 34:43.3 72.175 05:32.8 0.598 400k
> >>> 35:41.3 56.699 05:32.8 0 400k
> >>> 36:39.3 61.071 05:32.8 0 400k
> >>> 37:37.3 43.355 06:30.8 0 400k
> >>> 38:35.4 44.644 06:30.8 0 400k
> >>> 39:33.4 72.336 06:30.8 0 400k
> >>> 40:31.4 70.676 06:30.8 0 400k
> >>> 41:29.4 108.009 06:30.8 0 400k
> >>> 42:27.4 65.216 06:30.8 0 400k
> >>
> >> Jiang
> >
> > OK I find this surprising but I accept what you see.
> > I'm inclined not to add a tunable and just select
> > a value ourselves.
> > I'm also fine with using the napi poll module parameter
> > which will give you a bit of tunability.
>
> OK, kindly take a look if you prefer the below code change. I tested
> budget/2 and the result is almost the same as budget when pps below
> 400k, but a little better when it goes beyond 400k in my environment.
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>
> index 0d4115c9e20b..bc08be7925eb 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1331,7 +1331,7 @@ static int virtnet_receive(struct receive_queue
> *rq, int budget,
> }
> }
>
> - if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
> + if (rq->vq->num_free > min((unsigned int)budget,
> virtqueue_get_vring_size(rq->vq)) / 2) {
> if (!try_fill_recv(vi, rq, GFP_ATOMIC))
> schedule_delayed_work(&vi->refill, 0);
> }
>
>
> Jiang
>
Looks good to me.
Pls post for inclusion in -net.
--
MST
^ permalink raw reply
* Re: [PATCH iproute2-next v2 4/4] devlink: Add man page for devlink-trap
From: Jiri Pirko @ 2019-08-13 10:52 UTC (permalink / raw)
To: Ido Schimmel; +Cc: netdev, dsahern, stephen, jiri, mlxsw, Ido Schimmel
In-Reply-To: <20190813103904.GA16305@splinter>
Tue, Aug 13, 2019 at 12:39:04PM CEST, idosch@idosch.org wrote:
>On Tue, Aug 13, 2019 at 12:20:37PM +0200, Jiri Pirko wrote:
>> Tue, Aug 13, 2019 at 10:31:43AM CEST, idosch@idosch.org wrote:
>> >From: Ido Schimmel <idosch@mellanox.com>
>> >
>> >Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>> >---
>> > man/man8/devlink-monitor.8 | 3 +-
>> > man/man8/devlink-trap.8 | 138 +++++++++++++++++++++++++++++++++++++
>> > man/man8/devlink.8 | 11 ++-
>> > 3 files changed, 150 insertions(+), 2 deletions(-)
>> > create mode 100644 man/man8/devlink-trap.8
>> >
>> >diff --git a/man/man8/devlink-monitor.8 b/man/man8/devlink-monitor.8
>> >index 13fe641dc8f5..fffab3a4ce88 100644
>> >--- a/man/man8/devlink-monitor.8
>> >+++ b/man/man8/devlink-monitor.8
>> >@@ -21,7 +21,7 @@ command is the first in the command line and then the object list.
>> > .I OBJECT-LIST
>> > is the list of object types that we want to monitor.
>> > It may contain
>> >-.BR dev ", " port ".
>> >+.BR dev ", " port ", " trap ", " trap-group .
>>
>> Looks like "trap-group" is a leftover here, isn't it?
>
>You get events when traps and groups are created / destroyed. See below output
>when creating a new netdevsim device:
Ah! Makes sense. Thanks!
>
>$ devlink mon trap-group
>[trap-group,new] netdevsim/netdevsim20: name l2_drops generic true
>[trap-group,new] netdevsim/netdevsim20: name l3_drops generic true
>[trap-group,new] netdevsim/netdevsim20: name buffer_drops generic true
>
>$ devlink mon trap
>[trap,new] netdevsim/netdevsim10: name source_mac_is_multicast type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name vlan_tag_mismatch type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name ingress_vlan_filter type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name ingress_spanning_tree_filter type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name port_list_is_empty type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name port_loopback_filter type drop generic true action drop group l2_drops
>[trap,new] netdevsim/netdevsim10: name fid_miss type exception generic false action trap group l2_drops
>[trap,new] netdevsim/netdevsim10: name blackhole_route type drop generic true action drop group l3_drops
>[trap,new] netdevsim/netdevsim10: name ttl_value_is_too_small type exception generic true action trap group l3_drops
>[trap,new] netdevsim/netdevsim10: name tail_drop type drop generic true action drop group buffer_drops
^ permalink raw reply
* tc - mirred ingress not supported at the moment
From: Martin Olsson @ 2019-08-13 10:51 UTC (permalink / raw)
To: netdev
In-Reply-To: <CAAT+qEa6Yw-tf3L_R-phzSvLiGOdW9uLhFGNTz+i9eWhBT_+DA@mail.gmail.com>
Two questions regarding tc-mirred:
1)
The manual ( https://www.linux.org/docs/man8/tc-mirred.html ) states:
OPTIONS
ingress
egress
Specify the direction in which the packet shall appear on the
destination interface.
Currently only egress is implemented.
I verify to see if this is still true, and unfortunately it is:
# tc filter add dev eno2 parent ffff: prio 999 protocol all matchall
action mirred ingress redirect dev mon0
mirred ingress not supported at the moment
bad action parsing
parse_action: bad value (5:mirred)!
Illegal "action"
Q1: Why was 'ingress' not implemented at the same time as 'egress'?
2)
Ok, so I have to use 'egress':
# tc filter add dev eno2 parent ffff: prio 999 protocol all matchall
action mirred egress redirect dev mon0
Since the mirred action forces me to use 'egress' as the direction on
the dest interface, all kinds of network statistics tools show
incorrect counters. :-(
eno2 is a pure sniffer interface (it is connected to the SPAN dest
port of a switch).
All packets (matchall) on eno2 are mirrored to mon0.
# ip -s link show dev eno2
...
...
RX: bytes packets errors dropped overrun mcast
13660757 16329 0 0 0 0
TX: bytes packets errors dropped carrier collsns
0 0 0 0 0 0
# ip -s link show dev mon0
...
...
RX: bytes packets errors dropped overrun mcast
0 0 0 0 0 0
TX: bytes packets errors dropped carrier collsns
13660757 16329 0 0 0 0
eno2 and mon0 should be identical, but they are inverted.
When I graph all interfaces of the machine, the traffic graph for
'mon0' is incorrect since it shows 100% egress when the traffic really
is ingress.
As a human I can reinterpret the mon0 graph when looking at it, but
it is harder for automated tools to do the right thing without
explicit node configuration/exceptions in the tool. This is annoying
when you have tools that graph hundreds of different types of nodes
with different kinds of interface types, and want all graphs to be
visually similar for easy comparison.
Tool output that mon0 has sent 16329 packets is also plain wrong. It
has really *received* these packets.
Q2: So... Can the 'ingress' option please be implemented? (I'm no
programmer, so unfortunately I can't do it myself).
/Martin
^ permalink raw reply
* RE: [EXT] [PATCH] qed: Add cleanup in qed_slowpath_start()
From: Sudarsana Reddy Kalluru @ 2019-08-13 10:46 UTC (permalink / raw)
To: Wenwen Wang
Cc: Ariel Elior, GR-everest-linux-l2, David S. Miller,
open list:QLOGIC QL4xxx ETHERNET DRIVER, open list
In-Reply-To: <1565690709-3186-1-git-send-email-wenwen@cs.uga.edu>
> -----Original Message-----
> From: Wenwen Wang <wenwen@cs.uga.edu>
> Sent: Tuesday, August 13, 2019 3:35 PM
> To: Wenwen Wang <wenwen@cs.uga.edu>
> Cc: Ariel Elior <aelior@marvell.com>; GR-everest-linux-l2 <GR-everest-linux-
> l2@marvell.com>; David S. Miller <davem@davemloft.net>; open
> list:QLOGIC QL4xxx ETHERNET DRIVER <netdev@vger.kernel.org>; open list
> <linux-kernel@vger.kernel.org>
> Subject: [EXT] [PATCH] qed: Add cleanup in qed_slowpath_start()
>
> External Email
>
> ----------------------------------------------------------------------
> If qed_mcp_send_drv_version() fails, no cleanup is executed, leading to
> memory leaks. To fix this issue, redirect the execution to the label 'err3'
> before returning the error.
>
> Signed-off-by: Wenwen Wang <wenwen@cs.uga.edu>
> ---
> drivers/net/ethernet/qlogic/qed/qed_main.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c
> b/drivers/net/ethernet/qlogic/qed/qed_main.c
> index 829dd60..d16a251 100644
> --- a/drivers/net/ethernet/qlogic/qed/qed_main.c
> +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
> @@ -1325,7 +1325,7 @@ static int qed_slowpath_start(struct qed_dev
> *cdev,
> &drv_version);
> if (rc) {
> DP_NOTICE(cdev, "Failed sending drv version
> command\n");
> - return rc;
> + goto err3;
In this case, we might need to free the ll2-buf allocated at the below path (?),
1312 /* Allocate LL2 interface if needed */
1313 if (QED_LEADING_HWFN(cdev)->using_ll2) {
1314 rc = qed_ll2_alloc_if(cdev);
Maybe by adding a new goto label 'err4'.
> }
> }
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH iproute2-next v2 4/4] devlink: Add man page for devlink-trap
From: Ido Schimmel @ 2019-08-13 10:39 UTC (permalink / raw)
To: Jiri Pirko; +Cc: netdev, dsahern, stephen, jiri, mlxsw, Ido Schimmel
In-Reply-To: <20190813102037.GP2428@nanopsycho>
On Tue, Aug 13, 2019 at 12:20:37PM +0200, Jiri Pirko wrote:
> Tue, Aug 13, 2019 at 10:31:43AM CEST, idosch@idosch.org wrote:
> >From: Ido Schimmel <idosch@mellanox.com>
> >
> >Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> >---
> > man/man8/devlink-monitor.8 | 3 +-
> > man/man8/devlink-trap.8 | 138 +++++++++++++++++++++++++++++++++++++
> > man/man8/devlink.8 | 11 ++-
> > 3 files changed, 150 insertions(+), 2 deletions(-)
> > create mode 100644 man/man8/devlink-trap.8
> >
> >diff --git a/man/man8/devlink-monitor.8 b/man/man8/devlink-monitor.8
> >index 13fe641dc8f5..fffab3a4ce88 100644
> >--- a/man/man8/devlink-monitor.8
> >+++ b/man/man8/devlink-monitor.8
> >@@ -21,7 +21,7 @@ command is the first in the command line and then the object list.
> > .I OBJECT-LIST
> > is the list of object types that we want to monitor.
> > It may contain
> >-.BR dev ", " port ".
> >+.BR dev ", " port ", " trap ", " trap-group .
>
> Looks like "trap-group" is a leftover here, isn't it?
You get events when traps and groups are created / destroyed. See below output
when creating a new netdevsim device:
$ devlink mon trap-group
[trap-group,new] netdevsim/netdevsim20: name l2_drops generic true
[trap-group,new] netdevsim/netdevsim20: name l3_drops generic true
[trap-group,new] netdevsim/netdevsim20: name buffer_drops generic true
$ devlink mon trap
[trap,new] netdevsim/netdevsim10: name source_mac_is_multicast type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name vlan_tag_mismatch type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name ingress_vlan_filter type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name ingress_spanning_tree_filter type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name port_list_is_empty type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name port_loopback_filter type drop generic true action drop group l2_drops
[trap,new] netdevsim/netdevsim10: name fid_miss type exception generic false action trap group l2_drops
[trap,new] netdevsim/netdevsim10: name blackhole_route type drop generic true action drop group l3_drops
[trap,new] netdevsim/netdevsim10: name ttl_value_is_too_small type exception generic true action trap group l3_drops
[trap,new] netdevsim/netdevsim10: name tail_drop type drop generic true action drop group buffer_drops
^ permalink raw reply
* [PATCH bpf-next 0/3] xdpsock: allow mmap2 usage for 32bits
From: Ivan Khoronzhuk @ 2019-08-13 10:23 UTC (permalink / raw)
To: magnus.karlsson, bjorn.topel
Cc: davem, hawk, john.fastabend, jakub.kicinski, daniel, netdev, bpf,
xdp-newbies, linux-kernel, Ivan Khoronzhuk
This patchset contains several improvements for af_xdp socket umem
mappings for 32bit systems. Also, there is one more patch outside of
this series that can be applied to another tree and related to mmap2
af_xdp umem offsets:
"mm: mmap: increase sockets maximum memory size pgoff for 32bits"
https://lkml.org/lkml/2019/8/12/549
Based on bpf-next/master
Ivan Khoronzhuk (3):
libbpf: add asm/unistd.h to xsk to get __NR_mmap2
xdp: xdp_umem: replace kmap on vmap for umem map
samples: bpf: syscal_nrs: use mmap2 if defined
net/xdp/xdp_umem.c | 16 ++++++++++++----
samples/bpf/syscall_nrs.c | 5 +++++
samples/bpf/tracex5_kern.c | 11 +++++++++++
tools/lib/bpf/xsk.c | 1 +
4 files changed, 29 insertions(+), 4 deletions(-)
--
2.17.1
^ permalink raw reply
* [PATCH bpf-next 2/3] xdp: xdp_umem: replace kmap on vmap for umem map
From: Ivan Khoronzhuk @ 2019-08-13 10:23 UTC (permalink / raw)
To: magnus.karlsson, bjorn.topel
Cc: davem, hawk, john.fastabend, jakub.kicinski, daniel, netdev, bpf,
xdp-newbies, linux-kernel, Ivan Khoronzhuk
In-Reply-To: <20190813102318.5521-1-ivan.khoronzhuk@linaro.org>
For 64-bit there is no reason to use vmap/vunmap, so use page_address
as it was initially. For 32-bit, in some apps, such as the sample
xdpsock_user.c, when the number of pages in use is quite big, the kmap
memory may not be enough. Besides this, kmap appears to be
deprecated in such cases, as it can block and should rather be used
for dynamic mm.
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
net/xdp/xdp_umem.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index a0607969f8c0..907c9019fe21 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,7 +14,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
-#include <linux/highmem.h>
+#include <linux/vmalloc.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@@ -167,10 +167,12 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
static void xdp_umem_unmap_pages(struct xdp_umem *umem)
{
+#if BITS_PER_LONG == 32
unsigned int i;
for (i = 0; i < umem->npgs; i++)
- kunmap(umem->pgs[i]);
+ vunmap(umem->pages[i].addr);
+#endif
}
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -378,8 +380,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
goto out_account;
}
- for (i = 0; i < umem->npgs; i++)
- umem->pages[i].addr = kmap(umem->pgs[i]);
+ for (i = 0; i < umem->npgs; i++) {
+#if BITS_PER_LONG == 32
+ umem->pages[i].addr = vmap(&umem->pgs[i], 1, VM_MAP,
+ PAGE_KERNEL);
+#else
+ umem->pages[i].addr = page_address(umem->pgs[i]);
+#endif
+ }
return 0;
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 3/3] samples: bpf: syscal_nrs: use mmap2 if defined
From: Ivan Khoronzhuk @ 2019-08-13 10:23 UTC (permalink / raw)
To: magnus.karlsson, bjorn.topel
Cc: davem, hawk, john.fastabend, jakub.kicinski, daniel, netdev, bpf,
xdp-newbies, linux-kernel, Ivan Khoronzhuk
In-Reply-To: <20190813102318.5521-1-ivan.khoronzhuk@linaro.org>
For arm32 xdp sockets mmap2 is preferred, so use it if it's defined.
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
samples/bpf/syscall_nrs.c | 5 +++++
samples/bpf/tracex5_kern.c | 11 +++++++++++
2 files changed, 16 insertions(+)
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
index 516e255cbe8f..2dec94238350 100644
--- a/samples/bpf/syscall_nrs.c
+++ b/samples/bpf/syscall_nrs.c
@@ -9,5 +9,10 @@ void syscall_defines(void)
COMMENT("Linux system call numbers.");
SYSNR(__NR_write);
SYSNR(__NR_read);
+#ifdef __NR_mmap2
+ SYSNR(__NR_mmap2);
+#else
SYSNR(__NR_mmap);
+#endif
+
}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
index f57f4e1ea1ec..300350ad299a 100644
--- a/samples/bpf/tracex5_kern.c
+++ b/samples/bpf/tracex5_kern.c
@@ -68,12 +68,23 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
return 0;
}
+#ifdef __NR_mmap2
+PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
+{
+ char fmt[] = "mmap2\n";
+
+ bpf_trace_printk(fmt, sizeof(fmt));
+ return 0;
+}
+#else
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
+
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
+#endif
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
--
2.17.1
^ permalink raw reply related
* [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Ivan Khoronzhuk @ 2019-08-13 10:23 UTC (permalink / raw)
To: magnus.karlsson, bjorn.topel
Cc: davem, hawk, john.fastabend, jakub.kicinski, daniel, netdev, bpf,
xdp-newbies, linux-kernel, Ivan Khoronzhuk
In-Reply-To: <20190813102318.5521-1-ivan.khoronzhuk@linaro.org>
This is needed to get __NR_mmap2 when the mmap2 syscall is used.
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
tools/lib/bpf/xsk.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 5007b5d4fd2c..f2fc40f9804c 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -12,6 +12,7 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <asm/unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
--
2.17.1
^ permalink raw reply related
* Re: [PATCH iproute2-next v2 4/4] devlink: Add man page for devlink-trap
From: Jiri Pirko @ 2019-08-13 10:20 UTC (permalink / raw)
To: Ido Schimmel; +Cc: netdev, dsahern, stephen, jiri, mlxsw, Ido Schimmel
In-Reply-To: <20190813083143.13509-5-idosch@idosch.org>
Tue, Aug 13, 2019 at 10:31:43AM CEST, idosch@idosch.org wrote:
>From: Ido Schimmel <idosch@mellanox.com>
>
>Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>---
> man/man8/devlink-monitor.8 | 3 +-
> man/man8/devlink-trap.8 | 138 +++++++++++++++++++++++++++++++++++++
> man/man8/devlink.8 | 11 ++-
> 3 files changed, 150 insertions(+), 2 deletions(-)
> create mode 100644 man/man8/devlink-trap.8
>
>diff --git a/man/man8/devlink-monitor.8 b/man/man8/devlink-monitor.8
>index 13fe641dc8f5..fffab3a4ce88 100644
>--- a/man/man8/devlink-monitor.8
>+++ b/man/man8/devlink-monitor.8
>@@ -21,7 +21,7 @@ command is the first in the command line and then the object list.
> .I OBJECT-LIST
> is the list of object types that we want to monitor.
> It may contain
>-.BR dev ", " port ".
>+.BR dev ", " port ", " trap ", " trap-group .
Looks like "trap-group" is a leftover here, isn't it?
>
> .B devlink
> opens Devlink Netlink socket, listens on it and dumps state changes.
>@@ -31,6 +31,7 @@ opens Devlink Netlink socket, listens on it and dumps state changes.
> .BR devlink-dev (8),
> .BR devlink-sb (8),
> .BR devlink-port (8),
>+.BR devlink-trap (8),
> .br
>
> .SH AUTHOR
>diff --git a/man/man8/devlink-trap.8 b/man/man8/devlink-trap.8
>new file mode 100644
>index 000000000000..4f079eb86d7b
>--- /dev/null
>+++ b/man/man8/devlink-trap.8
>@@ -0,0 +1,138 @@
>+.TH DEVLINK\-TRAP 8 "2 August 2019" "iproute2" "Linux"
>+.SH NAME
>+devlink-trap \- devlink trap configuration
>+.SH SYNOPSIS
>+.sp
>+.ad l
>+.in +8
>+.ti -8
>+.B devlink
>+.RI "[ " OPTIONS " ]"
>+.B trap
>+.RI "{ " COMMAND " |"
>+.BR help " }"
>+.sp
>+
>+.ti -8
>+.IR OPTIONS " := { "
>+\fB\-v\fR[\fIerbose\fR] |
>+\fB\-s\fR[\fItatistics\fR] }
Not sure you need to put the generic options here. But I don't mind much.
Otherwise this looks fine.
Acked-by: Jiri Pirko <jiri@mellanox.com>
[...]
^ permalink raw reply
* Re: [PATCH iproute2-next v2 3/4] devlink: Add devlink trap group set and show commands
From: Jiri Pirko @ 2019-08-13 10:15 UTC (permalink / raw)
To: Ido Schimmel; +Cc: netdev, dsahern, stephen, jiri, mlxsw, Ido Schimmel
In-Reply-To: <20190813083143.13509-4-idosch@idosch.org>
Tue, Aug 13, 2019 at 10:31:42AM CEST, idosch@idosch.org wrote:
>From: Ido Schimmel <idosch@mellanox.com>
>
>These commands are similar to the trap set and show commands, but
>operate on a trap group and not individual traps. Example:
>
># devlink trap group set netdevsim/netdevsim10 group l3_drops action trap
># devlink -jps trap group show netdevsim/netdevsim10 group l3_drops
>{
> "trap_group": {
> "netdevsim/netdevsim10": [ {
> "name": "l3_drops",
> "generic": true,
> "stats": {
> "rx": {
> "bytes": 0,
> "packets": 0
> }
> }
> } ]
> }
>}
>
>Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox