* [PATCH bpf-next 10/16] nfp: bpf: add map data structure
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
To be able to split code into reasonable chunks we need to add
the map data structures already. Later patches will add code
piece by piece.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/main.c | 7 ++++++-
drivers/net/ethernet/netronome/nfp/bpf/main.h | 18 ++++++++++++++++++
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 684b16cb6a20..2506d1139fa9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -288,6 +288,8 @@ static int nfp_bpf_init(struct nfp_app *app)
bpf->app = app;
app->priv = bpf;
+ INIT_LIST_HEAD(&bpf->map_list);
+
err = nfp_bpf_parse_capabilities(app);
if (err)
goto err_free_bpf;
@@ -301,7 +303,10 @@ static int nfp_bpf_init(struct nfp_app *app)
static void nfp_bpf_clean(struct nfp_app *app)
{
- kfree(app->priv);
+ struct nfp_app_bpf *bpf = app->priv;
+
+ WARN_ON(!list_empty(&bpf->map_list));
+ kfree(bpf);
}
const struct nfp_app_type app_bpf = {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index c7d815455cb0..71c58d25858b 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -82,6 +82,8 @@ enum pkt_vec {
* struct nfp_app_bpf - bpf app priv structure
* @app: backpointer to the app
*
+ * @map_list: list of offloaded maps
+ *
* @adjust_head: adjust head capability
* @flags: extra flags for adjust head
* @off_min: minimal packet offset within buffer required
@@ -92,6 +94,8 @@ enum pkt_vec {
struct nfp_app_bpf {
struct nfp_app *app;
+ struct list_head map_list;
+
struct nfp_bpf_cap_adjust_head {
u32 flags;
int off_min;
@@ -101,6 +105,20 @@ struct nfp_app_bpf {
} adjust_head;
};
+/**
+ * struct nfp_bpf_map - private per-map data attached to BPF maps for offload
+ * @offmap: pointer to the offloaded BPF map
+ * @bpf: back pointer to bpf app private structure
+ * @tid: table id identifying map on datapath
+ * @l: link on the nfp_app_bpf->map_list list
+ */
+struct nfp_bpf_map {
+ struct bpf_offloaded_map *offmap;
+ struct nfp_app_bpf *bpf;
+ u32 tid;
+ struct list_head l;
+};
+
struct nfp_prog;
struct nfp_insn_meta;
typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 09/16] nfp: hand over to BPF offload app at coarser granularity
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
Instead of having an app callback per message type hand off
all offload-related handling to apps with one "rest of ndo_bpf"
callback.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 +--
drivers/net/ethernet/netronome/nfp/bpf/main.h | 8 +---
drivers/net/ethernet/netronome/nfp/bpf/offload.c | 25 +++++++++---
drivers/net/ethernet/netronome/nfp/nfp_app.h | 47 +++++-----------------
.../net/ethernet/netronome/nfp/nfp_net_common.c | 10 +----
5 files changed, 34 insertions(+), 61 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 4b63167906ca..684b16cb6a20 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -318,9 +318,6 @@ const struct nfp_app_type app_bpf = {
.setup_tc = nfp_bpf_setup_tc,
.tc_busy = nfp_bpf_tc_busy,
+ .bpf = nfp_ndo_bpf,
.xdp_offload = nfp_bpf_xdp_offload,
-
- .bpf_verifier_prep = nfp_bpf_verifier_prep,
- .bpf_translate = nfp_bpf_translate,
- .bpf_destroy = nfp_bpf_destroy,
};
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 89a9b6393882..c7d815455cb0 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -244,15 +244,11 @@ struct netdev_bpf;
struct nfp_app;
struct nfp_net;
+int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn,
+ struct netdev_bpf *bpf);
int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog,
bool old_prog);
-int nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
- struct netdev_bpf *bpf);
-int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog);
-int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog);
struct nfp_insn_meta *
nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
unsigned int insn_idx, unsigned int n_insns);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 1a67673261b7..12a48b7583a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -102,8 +102,9 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
kfree(nfp_prog);
}
-int nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
- struct netdev_bpf *bpf)
+static int
+nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
+ struct netdev_bpf *bpf)
{
struct bpf_prog *prog = bpf->verifier.prog;
struct nfp_prog *nfp_prog;
@@ -133,8 +134,7 @@ int nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
return ret;
}
-int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog)
+static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
{
struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
unsigned int stack_size;
@@ -161,8 +161,7 @@ int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
return nfp_bpf_jit(nfp_prog);
}
-int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog)
+static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
{
struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
@@ -172,6 +171,20 @@ int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
return 0;
}
+int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
+{
+ switch (bpf->command) {
+ case BPF_OFFLOAD_VERIFIER_PREP:
+ return nfp_bpf_verifier_prep(app, nn, bpf);
+ case BPF_OFFLOAD_TRANSLATE:
+ return nfp_bpf_translate(nn, bpf->offload.prog);
+ case BPF_OFFLOAD_DESTROY:
+ return nfp_bpf_destroy(nn, bpf->offload.prog);
+ default:
+ return -EINVAL;
+ }
+}
+
static int nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog)
{
struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 3af1943a8521..ee67a7202819 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -87,10 +87,8 @@ extern const struct nfp_app_type app_flower;
* @ctrl_msg_rx: control message handler
* @setup_tc: setup TC ndo
* @tc_busy: TC HW offload busy (rules loaded)
+ * @bpf: BPF ndo offload-related calls
* @xdp_offload: offload an XDP program
- * @bpf_verifier_prep: verifier prep for dev-specific BPF programs
- * @bpf_translate: translate call for dev-specific BPF programs
- * @bpf_destroy: destroy for dev-specific BPF programs
* @eswitch_mode_get: get SR-IOV eswitch mode
* @sriov_enable: app-specific sriov initialisation
* @sriov_disable: app-specific sriov clean-up
@@ -128,14 +126,10 @@ struct nfp_app_type {
int (*setup_tc)(struct nfp_app *app, struct net_device *netdev,
enum tc_setup_type type, void *type_data);
bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn);
+ int (*bpf)(struct nfp_app *app, struct nfp_net *nn,
+ struct netdev_bpf *xdp);
int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn,
struct bpf_prog *prog);
- int (*bpf_verifier_prep)(struct nfp_app *app, struct nfp_net *nn,
- struct netdev_bpf *bpf);
- int (*bpf_translate)(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog);
- int (*bpf_destroy)(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog);
int (*sriov_enable)(struct nfp_app *app, int num_vfs);
void (*sriov_disable)(struct nfp_app *app);
@@ -303,6 +297,14 @@ static inline int nfp_app_setup_tc(struct nfp_app *app,
return app->type->setup_tc(app, netdev, type, type_data);
}
+static inline int nfp_app_bpf(struct nfp_app *app, struct nfp_net *nn,
+ struct netdev_bpf *bpf)
+{
+ if (!app || !app->type->bpf)
+ return -EINVAL;
+ return app->type->bpf(app, nn, bpf);
+}
+
static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
struct bpf_prog *prog)
{
@@ -311,33 +313,6 @@ static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
return app->type->xdp_offload(app, nn, prog);
}
-static inline int
-nfp_app_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
- struct netdev_bpf *bpf)
-{
- if (!app || !app->type->bpf_verifier_prep)
- return -EOPNOTSUPP;
- return app->type->bpf_verifier_prep(app, nn, bpf);
-}
-
-static inline int
-nfp_app_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog)
-{
- if (!app || !app->type->bpf_translate)
- return -EOPNOTSUPP;
- return app->type->bpf_translate(app, nn, prog);
-}
-
-static inline int
-nfp_app_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
- struct bpf_prog *prog)
-{
- if (!app || !app->type->bpf_destroy)
- return -EOPNOTSUPP;
- return app->type->bpf_destroy(app, nn, prog);
-}
-
static inline bool nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb)
{
trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 0add4870ce2e..773089442b64 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3394,16 +3394,8 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0;
xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0;
return 0;
- case BPF_OFFLOAD_VERIFIER_PREP:
- return nfp_app_bpf_verifier_prep(nn->app, nn, xdp);
- case BPF_OFFLOAD_TRANSLATE:
- return nfp_app_bpf_translate(nn->app, nn,
- xdp->offload.prog);
- case BPF_OFFLOAD_DESTROY:
- return nfp_app_bpf_destroy(nn->app, nn,
- xdp->offload.prog);
default:
- return -EINVAL;
+ return nfp_app_bpf(nn->app, nn, xdp);
}
}
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 08/16] bpf: offload: add map offload infrastructure
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
BPF map offload follow similar path to program offload. At creation
time users may specify ifindex of the device on which they want to
create the map. Map will be validated by the kernel's
.map_alloc_check callback and device driver will be called for the
actual allocation. Map will have an empty set of operations
associated with it (save for alloc and free callbacks). The real
device callbacks are kept in map->offload->dev_ops because they
have slightly different signatures. Map operations are called in
process context so the driver may communicate with HW freely,
msleep(), wait() etc.
Map alloc and free callbacks are muxed via existing .ndo_bpf, and
are always called with rtnl lock held. Maps and programs are
guaranteed to be destroyed before .ndo_uninit (i.e. before
unregister_netdev() returns). Map callbacks are invoked with
bpf_devs_lock *read* locked, drivers must take care of exclusive
locking if necessary.
All offload-specific branches are marked with unlikely() (through
bpf_map_is_dev_bound()), given that branch penalty will be
negligible compared to IO anyway, and we don't want to penalize
SW path unnecessarily.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
include/linux/bpf.h | 59 +++++++++++++
include/linux/netdevice.h | 6 ++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/offload.c | 189 +++++++++++++++++++++++++++++++++++++++--
kernel/bpf/syscall.c | 42 +++++++--
kernel/bpf/verifier.c | 7 ++
tools/include/uapi/linux/bpf.h | 1 +
7 files changed, 293 insertions(+), 12 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 425ca3cd1e32..d9a0beb78825 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -65,6 +65,33 @@ struct bpf_map {
#endif
};
+struct bpf_offloaded_map;
+
+struct bpf_map_dev_ops {
+ int (*map_get_next_key)(struct bpf_offloaded_map *map,
+ void *key, void *next_key);
+ int (*map_lookup_elem)(struct bpf_offloaded_map *map,
+ void *key, void *value);
+ int (*map_update_elem)(struct bpf_offloaded_map *map,
+ void *key, void *value, u64 flags);
+ int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
+};
+
+struct bpf_offloaded_map {
+ struct bpf_map map;
+ struct net_device *netdev;
+ const struct bpf_map_dev_ops *dev_ops;
+ void *dev_priv;
+ struct list_head offloads;
+};
+
+static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_offloaded_map, map);
+}
+
+extern const struct bpf_map_ops bpf_map_offload_ops;
+
/* function argument constraints */
enum bpf_arg_type {
ARG_DONTCARE = 0, /* unused argument in helper function */
@@ -359,6 +386,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages);
void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
@@ -536,6 +564,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
struct bpf_prog *prog);
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags);
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
+int bpf_map_offload_get_next_key(struct bpf_map *map,
+ void *key, void *next_key);
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+
#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
@@ -543,6 +580,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
{
return aux->offload_requested;
}
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+ return unlikely(map->ops == &bpf_map_offload_ops);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
+void bpf_map_offload_map_free(struct bpf_map *map);
#else
static inline int bpf_prog_offload_init(struct bpf_prog *prog,
union bpf_attr *attr)
@@ -554,6 +599,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
{
return false;
}
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+ return false;
+}
+
+static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_map_offload_map_free(struct bpf_map *map)
+{
+}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49bfc6eec74c..ecbdb5d519d6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,6 +802,8 @@ enum bpf_netdev_command {
BPF_OFFLOAD_VERIFIER_PREP,
BPF_OFFLOAD_TRANSLATE,
BPF_OFFLOAD_DESTROY,
+ BPF_OFFLOAD_MAP_ALLOC,
+ BPF_OFFLOAD_MAP_FREE,
};
struct bpf_prog_offload_ops;
@@ -832,6 +834,10 @@ struct netdev_bpf {
struct {
struct bpf_prog *prog;
} offload;
+ /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
+ struct {
+ struct bpf_offloaded_map *offmap;
+ };
};
};
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36e2ad4..9910ab632652 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
+ __u32 map_ifindex; /* ifindex of netdev to create on */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index cdd1e19a668b..dddc172c8818 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,11 +24,13 @@
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
* RTNL lock cannot be taken when holding this lock.
*/
static DECLARE_RWSEM(bpf_devs_lock);
static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -250,11 +252,187 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_offloaded_map *offmap;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_type != BPF_MAP_TYPE_HASH)
+ return ERR_PTR(-EINVAL);
+
+ offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ if (!offmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&offmap->map, attr);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+ err = bpf_dev_offload_check(offmap->netdev);
+ if (err)
+ goto err_unlock;
+
+ err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return &offmap->map;
+
+err_unlock:
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+ kfree(offmap);
+ return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (offmap->netdev)
+ __bpf_map_offload_destroy(offmap);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+ flags);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_delete_elem(offmap, key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap;
+ struct bpf_prog_offload *offload;
+ bool ret;
+
+ if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map))
+ return false;
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ return true;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ offmap = map_to_offmap(map);
+
+ ret = offload && offload->netdev == offmap->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+ struct bpf_prog_offload *offload, *tmp;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+ struct bpf_offloaded_map *offmap, *tmp;
+
+ list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+ if (offmap->netdev == netdev)
+ __bpf_map_offload_destroy(offmap);
+}
+
static int bpf_offload_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct bpf_prog_offload *offload, *tmp;
ASSERT_RTNL();
@@ -265,11 +443,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
break;
down_write(&bpf_devs_lock);
- list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
- offloads) {
- if (offload->netdev == netdev)
- __bpf_prog_offload_destroy(offload->prog);
- }
+ bpf_offload_orphan_all_progs(netdev);
+ bpf_offload_orphan_all_maps(netdev);
up_write(&bpf_devs_lock);
break;
default:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 14adf29fc0f2..d5b773f04da1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr,
return 0;
}
+const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_alloc = bpf_map_offload_map_alloc,
+ .map_free = bpf_map_offload_map_free,
+};
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
@@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
if (err)
return ERR_PTR(err);
}
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
@@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
unsigned long flags;
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
if (do_idr_lock)
spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
+ map->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -676,6 +694,9 @@ static int map_update_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
+ } else if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+ goto out;
}
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ goto out;
+ }
+
preempt_disable();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock();
@@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
@@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_get_next_key(map, key, next_key);
+ goto out;
+ }
+
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
+out:
if (err)
goto free_next_key;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2b211262c25..187f245a3338 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4795,6 +4795,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
}
+
+ if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ !bpf_offload_dev_match(prog, map)) {
+ verbose(env, "offload device mismatch between prog and map\n");
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e8c60acfa32..69f96af4a569 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
+ __u32 map_ifindex; /* ifindex of netdev to create on */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 07/16] bpf: offload: factor out netdev checking at allocation time
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
Add a helper to check if netdev could be found and whether it
has .ndo_bpf callback. There is no need to check the callback
every time it's invoked, ndos can't reasonably be swapped for
a set without .ndp_bpf while program is loaded.
bpf_dev_offload_check() will also be used by map offload.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
kernel/bpf/offload.c | 28 ++++++++++++++++++++--------
1 file changed, 20 insertions(+), 8 deletions(-)
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 001ddfde7874..cdd1e19a668b 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -30,9 +30,19 @@
static DECLARE_RWSEM(bpf_devs_lock);
static LIST_HEAD(bpf_prog_offload_devs);
+static int bpf_dev_offload_check(struct net_device *netdev)
+{
+ if (!netdev)
+ return -EINVAL;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
{
struct bpf_prog_offload *offload;
+ int err;
if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -49,12 +59,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
attr->prog_ifindex);
- if (!offload->netdev)
- goto err_free;
+ err = bpf_dev_offload_check(offload->netdev);
+ if (err)
+ goto err_maybe_put;
down_write(&bpf_devs_lock);
- if (offload->netdev->reg_state != NETREG_REGISTERED)
+ if (offload->netdev->reg_state != NETREG_REGISTERED) {
+ err = -EINVAL;
goto err_unlock;
+ }
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
dev_put(offload->netdev);
@@ -63,10 +76,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
return 0;
err_unlock:
up_write(&bpf_devs_lock);
- dev_put(offload->netdev);
-err_free:
+err_maybe_put:
+ if (offload->netdev)
+ dev_put(offload->netdev);
kfree(offload);
- return -EINVAL;
+ return err;
}
static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
@@ -80,8 +94,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
if (!offload)
return -ENODEV;
netdev = offload->netdev;
- if (!netdev->netdev_ops->ndo_bpf)
- return -EOPNOTSUPP;
data->command = cmd;
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 06/16] bpf: rename bpf_dev_offload -> bpf_prog_offload
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
With map offload coming, we need to call program offload structure
something less ambiguous. Pure rename, no functional changes.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 +-
include/linux/bpf.h | 4 ++--
kernel/bpf/offload.c | 10 +++++-----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index fa2905e67b07..1a67673261b7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -234,7 +234,7 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog,
int err;
if (prog) {
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
if (!offload)
return -EINVAL;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2146520e5214..425ca3cd1e32 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -191,7 +191,7 @@ struct bpf_prog_offload_ops {
int insn_idx, int prev_insn_idx);
};
-struct bpf_dev_offload {
+struct bpf_prog_offload {
struct bpf_prog *prog;
struct net_device *netdev;
void *dev_priv;
@@ -221,7 +221,7 @@ struct bpf_prog_aux {
#ifdef CONFIG_SECURITY
void *security;
#endif
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
union {
struct work_struct work;
struct rcu_head rcu;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 040d4e0edf3f..001ddfde7874 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -32,7 +32,7 @@ static LIST_HEAD(bpf_prog_offload_devs);
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
{
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -72,7 +72,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
struct netdev_bpf *data)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
struct net_device *netdev;
ASSERT_RTNL();
@@ -110,7 +110,7 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
int ret = -ENODEV;
down_read(&bpf_devs_lock);
@@ -124,7 +124,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
data.offload.prog = prog;
@@ -242,7 +242,7 @@ static int bpf_offload_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct bpf_dev_offload *offload, *tmp;
+ struct bpf_prog_offload *offload, *tmp;
ASSERT_RTNL();
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 05/16] bpf: add helper for copying attrs to struct bpf_map
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
All map types reimplement the field-by-field copy of union bpf_attr
members into struct bpf_map. Add a helper to perform this operation.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
include/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 7 +------
kernel/bpf/cpumap.c | 8 +-------
kernel/bpf/devmap.c | 8 +-------
kernel/bpf/hashtab.c | 9 +--------
kernel/bpf/lpm_trie.c | 7 +------
kernel/bpf/sockmap.c | 8 +-------
kernel/bpf/stackmap.c | 6 +-----
kernel/bpf/syscall.c | 10 ++++++++++
9 files changed, 18 insertions(+), 46 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ab8988d5e6c5..2146520e5214 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -368,6 +368,7 @@ void bpf_map_put(struct bpf_map *map);
int bpf_map_precharge_memlock(u32 pages);
void *bpf_map_area_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
extern int sysctl_unprivileged_bpf_disabled;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7f9a684e4947..e25984885d27 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -96,12 +96,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
- array->map.map_type = attr->map_type;
- array->map.key_size = attr->key_size;
- array->map.value_size = attr->value_size;
- array->map.max_entries = attr->max_entries;
- array->map.map_flags = attr->map_flags;
- array->map.numa_node = numa_node;
+ bpf_map_init_from_attr(&array->map, attr);
array->elem_size = elem_size;
if (!percpu)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..192151ec9d12 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
if (!cmap)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- cmap->map.map_type = attr->map_type;
- cmap->map.key_size = attr->key_size;
- cmap->map.value_size = attr->value_size;
- cmap->map.max_entries = attr->max_entries;
- cmap->map.map_flags = attr->map_flags;
- cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&cmap->map, attr);
/* Pre-limit array size based on NR_CPUS, not final CPU check */
if (cmap->map.max_entries > NR_CPUS) {
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- dtab->map.map_type = attr->map_type;
- dtab->map.key_size = attr->key_size;
- dtab->map.value_size = attr->value_size;
- dtab->map.max_entries = attr->max_entries;
- dtab->map.map_flags = attr->map_flags;
- dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&dtab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 7fd6519444d3..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -304,7 +304,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
- int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_htab *htab;
int err, i;
u64 cost;
@@ -313,13 +312,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- htab->map.map_type = attr->map_type;
- htab->map.key_size = attr->key_size;
- htab->map.value_size = attr->value_size;
- htab->map.max_entries = attr->max_entries;
- htab->map.map_flags = attr->map_flags;
- htab->map.numa_node = numa_node;
+ bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
/* ensure each CPU's lru list has >=1 elements.
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..584e02227671 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
- trie->map.map_type = attr->map_type;
- trie->map.key_size = attr->key_size;
- trie->map.value_size = attr->value_size;
- trie->map.max_entries = attr->max_entries;
- trie->map.map_flags = attr->map_flags;
- trie->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..2ccd4b1a8683 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -521,13 +521,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (!stab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- stab->map.map_type = attr->map_type;
- stab->map.key_size = attr->key_size;
- stab->map.value_size = attr->value_size;
- stab->map.max_entries = attr->max_entries;
- stab->map.map_flags = attr->map_flags;
- stab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&stab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc636cc98..6039bf1aa945 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
goto free_smap;
- smap->map.map_type = attr->map_type;
- smap->map.key_size = attr->key_size;
+ bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
- smap->map.max_entries = attr->max_entries;
- smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 297677c27095..14adf29fc0f2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -143,6 +143,16 @@ void bpf_map_area_free(void *area)
kvfree(area);
}
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
+{
+ map->map_type = attr->map_type;
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ map->map_flags = attr->map_flags;
+ map->numa_node = bpf_map_attr_numa_node(attr);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 04/16] bpf: hashtab: move checks out of alloc function
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
Use the new callback to perform allocation checks for hash maps.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
kernel/bpf/hashtab.c | 55 +++++++++++++++++++++++++++++++++++++---------------
1 file changed, 39 insertions(+), 16 deletions(-)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b80f42adf068..7fd6519444d3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
}
/* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_htab *htab;
- int err, i;
- u64 cost;
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -254,33 +251,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
*/
- return ERR_PTR(-EPERM);
+ return -EPERM;
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (!lru && percpu_lru)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (lru && !prealloc)
- return ERR_PTR(-ENOTSUPP);
+ return -ENOTSUPP;
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
*/
if (attr->max_entries == 0 || attr->key_size == 0 ||
attr->value_size == 0)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (attr->key_size > MAX_BPF_STACK)
/* eBPF programs initialize keys on stack, so they cannot be
* larger than max stack size
*/
- return ERR_PTR(-E2BIG);
+ return -E2BIG;
if (attr->value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct htab_elem))
@@ -289,7 +286,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
* sure that the elem_size doesn't overflow and it's
* kmalloc-able later in htab_map_update_elem()
*/
- return ERR_PTR(-E2BIG);
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_htab *htab;
+ int err, i;
+ u64 cost;
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
@@ -1142,6 +1160,7 @@ static void htab_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1152,6 +1171,7 @@ const struct bpf_map_ops htab_map_ops = {
};
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1235,6 +1255,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
}
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1244,6 +1265,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1252,11 +1274,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
if (attr->value_size != sizeof(u32))
- return ERR_PTR(-EINVAL);
- return htab_map_alloc(attr);
+ return -EINVAL;
+ return htab_map_alloc_check(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1327,7 +1349,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
if (IS_ERR(inner_map_meta))
return inner_map_meta;
- map = fd_htab_map_alloc(attr);
+ map = htab_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
@@ -1371,6 +1393,7 @@ static void htab_of_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
.map_free = htab_of_map_free,
.map_get_next_key = htab_map_get_next_key,
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 03/16] bpf: hashtab: move attribute validation before allocation
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
Number of attribute checks are currently performed after hashtab
is already allocated. Move them to be able to split them out to
the check function later on. Checks have to now be performed on
the attr union directly instead of the members of bpf_map, since
bpf_map will be allocated later. No functional changes.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
kernel/bpf/hashtab.c | 47 +++++++++++++++++++++++------------------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b80f42adf068 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -269,6 +269,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
return ERR_PTR(-EINVAL);
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ if (attr->max_entries == 0 || attr->key_size == 0 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ return ERR_PTR(-E2BIG);
+
+ if (attr->value_size >= KMALLOC_MAX_SIZE -
+ MAX_BPF_STACK - sizeof(struct htab_elem))
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements via bpf syscall. This check also makes
+ * sure that the elem_size doesn't overflow and it's
+ * kmalloc-able later in htab_map_update_elem()
+ */
+ return ERR_PTR(-E2BIG);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -281,14 +303,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.map_flags = attr->map_flags;
htab->map.numa_node = numa_node;
- /* check sanity of attributes.
- * value_size == 0 may be allowed in the future to use map as a set
- */
- err = -EINVAL;
- if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
- htab->map.value_size == 0)
- goto free_htab;
-
if (percpu_lru) {
/* ensure each CPU's lru list has >=1 elements.
* since we are at it, make each lru list has the same
@@ -304,22 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- err = -E2BIG;
- if (htab->map.key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- goto free_htab;
-
- if (htab->map.value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct htab_elem))
- /* if value_size is bigger, the user space won't be able to
- * access the elements via bpf syscall. This check also makes
- * sure that the elem_size doesn't overflow and it's
- * kmalloc-able later in htab_map_update_elem()
- */
- goto free_htab;
-
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)
@@ -327,6 +325,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
+ err = -E2BIG;
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 02/16] bpf: array: move checks out of alloc function
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
Use the new callback to perform allocation checks for array maps.
The fd maps don't need a special allocation callback, they only
need a special check callback.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
kernel/bpf/arraymap.c | 40 +++++++++++++++++++++++++++-------------
1 file changed, 27 insertions(+), 13 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..7f9a684e4947 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,26 +49,34 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
}
/* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_array *array;
- u64 array_size;
- u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 ||
attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
- return ERR_PTR(-E2BIG);
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_array *array;
+ u64 array_size;
+ u32 elem_size;
elem_size = round_up(attr->value_size, 8);
@@ -296,6 +304,7 @@ static void array_map_free(struct bpf_map *map)
}
const struct bpf_map_ops array_map_ops = {
+ .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -306,6 +315,7 @@ const struct bpf_map_ops array_map_ops = {
};
const struct bpf_map_ops percpu_array_map_ops = {
+ .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -314,12 +324,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
{
/* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
- return ERR_PTR(-EINVAL);
- return array_map_alloc(attr);
+ return -EINVAL;
+ return array_map_alloc_check(attr);
}
static void fd_array_map_free(struct bpf_map *map)
@@ -443,7 +453,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
}
const struct bpf_map_ops prog_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -530,7 +541,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
}
const struct bpf_map_ops perf_event_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +573,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
}
const struct bpf_map_ops cgroup_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -579,7 +592,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
if (IS_ERR(inner_map_meta))
return inner_map_meta;
- map = fd_array_map_alloc(attr);
+ map = array_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
@@ -636,6 +649,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
}
const struct bpf_map_ops array_of_maps_map_ops = {
+ .map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
.map_get_next_key = array_map_get_next_key,
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 01/16] bpf: add map_alloc_check callback
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
In-Reply-To: <20180105060931.30815-1-jakub.kicinski@netronome.com>
.map_alloc callbacks contain a number of checks validating user-
-provided map attributes against constraints of a particular map
type. For offloaded maps we will need to check map attributes
without actually allocating any memory on the host. Add a new
callback for validating attributes before any memory is allocated.
This callback can be selectively implemented by map types for
sharing code with offloads, or simply to separate the logical
steps of validation and allocation.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
include/linux/bpf.h | 1 +
kernel/bpf/syscall.c | 17 +++++++++++++----
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7810ae57b357..ab8988d5e6c5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -25,6 +25,7 @@ struct bpf_map;
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
+ int (*map_alloc_check)(union bpf_attr *attr);
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ebf0fb23e237..297677c27095 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -96,16 +96,25 @@ static int check_uarg_tail_zero(void __user *uaddr,
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
struct bpf_map *map;
+ int err;
- if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
- !bpf_map_types[attr->map_type])
+ if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+ return ERR_PTR(-EINVAL);
+ ops = bpf_map_types[attr->map_type];
+ if (!ops)
return ERR_PTR(-EINVAL);
- map = bpf_map_types[attr->map_type]->map_alloc(attr);
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return ERR_PTR(err);
+ }
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
- map->ops = bpf_map_types[attr->map_type];
+ map->ops = ops;
map->map_type = attr->map_type;
return map;
}
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 00/16] bpf: support creating maps on networking devices
From: Jakub Kicinski @ 2018-01-05 6:09 UTC (permalink / raw)
To: netdev, alexei.starovoitov, daniel, davem
Cc: oss-drivers, tehnerd, Jakub Kicinski
Hi!
This set adds support for creating maps on networking devices. BPF is
programs+maps, the pure program offload has been around for quite some
time, this patchset adds the map part of the equation.
Maps are allocated on the target device from the start. There is no
host copy when map is created on the device. Device maps are represented
by struct bpf_offloaded_map, regardless of type. Host programs can't
access such maps, access is only possible from a program also loaded
to the same device and/or via the BPF syscall.
Two types of maps are supported - global hash maps and array maps.
Offloaded programs are currently only allowed to perform lookups,
control plane is responsible for populating the maps.
For brevity only infrastructure and basic NFP patches are included.
Target device reporting, netdevsim and tests will follow up as well as
some further optimizations to the NFP code.
Jakub Kicinski (16):
bpf: add map_alloc_check callback
bpf: array: move checks out of alloc function
bpf: hashtab: move attribute validation before allocation
bpf: hashtab: move checks out of alloc function
bpf: add helper for copying attrs to struct bpf_map
bpf: rename bpf_dev_offload -> bpf_prog_offload
bpf: offload: factor out netdev checking at allocation time
bpf: offload: add map offload infrastructure
nfp: hand over to BPF offload app at coarser granularity
nfp: bpf: add map data structure
nfp: bpf: add basic control channel communication
nfp: bpf: implement helpers for FW map ops
nfp: bpf: parse function call and map capabilities
nfp: bpf: add verification and codegen for map lookups
nfp: bpf: add support for reading map memory
nfp: bpf: implement bpf map offload
drivers/net/ethernet/netronome/nfp/Makefile | 1 +
drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 446 +++++++++++++++++++++
drivers/net/ethernet/netronome/nfp/bpf/fw.h | 103 +++++
drivers/net/ethernet/netronome/nfp/bpf/jit.c | 128 +++++-
drivers/net/ethernet/netronome/nfp/bpf/main.c | 65 ++-
drivers/net/ethernet/netronome/nfp/bpf/main.h | 100 ++++-
drivers/net/ethernet/netronome/nfp/bpf/offload.c | 133 +++++-
drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 47 +++
drivers/net/ethernet/netronome/nfp/nfp_app.h | 48 +--
drivers/net/ethernet/netronome/nfp/nfp_net.h | 12 +
.../net/ethernet/netronome/nfp/nfp_net_common.c | 17 +-
include/linux/bpf.h | 65 ++-
include/linux/netdevice.h | 6 +
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 47 ++-
kernel/bpf/cpumap.c | 8 +-
kernel/bpf/devmap.c | 8 +-
kernel/bpf/hashtab.c | 103 +++--
kernel/bpf/lpm_trie.c | 7 +-
kernel/bpf/offload.c | 225 ++++++++++-
kernel/bpf/sockmap.c | 8 +-
kernel/bpf/stackmap.c | 6 +-
kernel/bpf/syscall.c | 69 +++-
kernel/bpf/verifier.c | 7 +
tools/include/uapi/linux/bpf.h | 1 +
25 files changed, 1466 insertions(+), 195 deletions(-)
create mode 100644 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
--
2.15.1
^ permalink raw reply
* [PATCH v2 bpf] bpf: prevent out-of-bounds speculation
From: Alexei Starovoitov @ 2018-01-05 5:45 UTC (permalink / raw)
To: David S . Miller; +Cc: Daniel Borkmann, netdev
From: Alexei Starovoitov <ast@kernel.org>
Under speculation, CPUs may mis-predict branches in bounds checks. Thus,
memory accesses under a bounds check may be speculated even if the
bounds check fails, providing a primitive for building a side channel.
To avoid leaking kernel data round up array-based maps and mask the index
after bounds check, so speculated load with out of bounds index will load
either valid value from the array or zero from the padded area.
To avoid duplicating map_lookup functions for root/unpriv always generate
a sequence of bpf instructions equivalent to map_lookup function for
array and array_of_maps map types when map was created by unpriv user.
And unconditionally mask index for percpu_array, since it's fast enough,
even when max_entries are not rounded to power of 2 for root user,
since percpu_array doesn't have map_gen_lookup callback yet.
If prog_array map is created by unpriv user replace
bpf_tail_call(ctx, map, index);
with
if (index >= max_entries) {
index &= map->index_mask;
bpf_tail_call(ctx, map, index);
}
(along with roundup to power 2) to prevent out-of-bounds speculation.
There is secondary redundant 'if (index >= max_entries)' in the interpreter
and in all JITs, but they can be optimized later if necessary.
Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array)
cannot be used by unpriv, so no changes there.
That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on
all architectures with and without JIT.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
v1->v2: no difference.
No idea why vger stopped liking my emails.
Resending for the 3rd time with tiny cc hoping that would do the trick.
I don't think I changed anything on my side.
include/linux/bpf.h | 2 ++
kernel/bpf/arraymap.c | 36 ++++++++++++++++++++++++++++++------
kernel/bpf/verifier.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 77 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..877eaa14a89b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,6 +52,7 @@ struct bpf_map {
u32 pages;
u32 id;
int numa_node;
+ bool unpriv;
struct user_struct *user;
const struct bpf_map_ops *ops;
struct work_struct work;
@@ -221,6 +222,7 @@ struct bpf_prog_aux {
struct bpf_array {
struct bpf_map map;
u32 elem_size;
+ u32 index_mask;
/* 'ownership' of prog_array is claimed by the first program that
* is going to use this map or by the first program which FD is stored
* in the map to make sure that all callers and callees have the same
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..9937fdf0de68 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
u64 array_size;
- u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+ index_mask = roundup_pow_of_two(max_entries) - 1;
+
+ if (unpriv)
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv = unpriv;
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -127,6 +139,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -613,6 +631,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
static u32 array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -621,7 +640,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (map->unpriv) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04b24876cd23..f3a2de94d73b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose(env, "verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -4456,19 +4463,55 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose(env, "tail_call obusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* handlers are currently limited to 64 bit only.
*/
- if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_map_lookup_elem) {
+ if (insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
goto patch_call_imm;
+ /* for unpriv map the verifier should emit bpf insns
+ * instead of map_lookup call regardless of JIT on/off
+ * and 32/64 bit
+ */
+ if (!map_ptr->unpriv &&
+ !(ebpf_jit_enabled() && BITS_PER_LONG == 64))
+ goto patch_call_imm;
+
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose(env, "bpf verifier is misconfigured\n");
--
2.9.5
^ permalink raw reply related
* Re: [bpf-next V4 PATCH 01/14] xdp: base API for new XDP rx-queue info concept
From: John Fastabend @ 2018-01-05 4:54 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov
Cc: netdev, dsahern, gospo, bjorn.topel, michael.chan
In-Reply-To: <151497511370.18176.16970502050847272706.stgit@firesoul>
On 01/03/2018 02:25 AM, Jesper Dangaard Brouer wrote:
> This patch only introduce the core data structures and API functions.
> All XDP enabled drivers must use the API before this info can used.
>
> There is a need for XDP to know more about the RX-queue a given XDP
> frames have arrived on. For both the XDP bpf-prog and kernel side.
>
> Instead of extending xdp_buff each time new info is needed, the patch
> creates a separate read-mostly struct xdp_rxq_info, that contains this
> info. We stress this data/cache-line is for read-only info. This is
> NOT for dynamic per packet info, use the data_meta for such use-cases.
>
> The performance advantage is this info can be setup at RX-ring init
> time, instead of updating N-members in xdp_buff. A possible (driver
> level) micro optimization is that xdp_buff->rxq assignment could be
> done once per XDP/NAPI loop. The extra pointer deref only happens for
> program needing access to this info (thus, no slowdown to existing
> use-cases).
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
> include/linux/filter.h | 2 +
> include/net/xdp.h | 47 ++++++++++++++++++++++++++++++++++
> net/core/Makefile | 2 +
> net/core/xdp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 117 insertions(+), 1 deletion(-)
> create mode 100644 include/net/xdp.h
> create mode 100644 net/core/xdp.c
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 2b0df2703671..425056c7f96c 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -20,6 +20,7 @@
> #include <linux/set_memory.h>
> #include <linux/kallsyms.h>
>
> +#include <net/xdp.h>
> #include <net/sch_generic.h>
Perhaps just 'struct xdp_rxq_info' is need here instead of
the full include. At least that is the pattern used for sk_buff
and sock. (by the way sorry for the late v4 feedback)
>
> #include <uapi/linux/filter.h>
> @@ -503,6 +504,7 @@ struct xdp_buff {
> void *data_end;
> void *data_meta;
> void *data_hard_start;
> + struct xdp_rxq_info *rxq;
> };
>
> /* Compute the linear packet data range [data, data_end) which
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> new file mode 100644
> index 000000000000..86c41631a908
> --- /dev/null
> +++ b/include/net/xdp.h
[...]
> +
> +/* Returns 0 on success, negative on failure */
> +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
> + struct net_device *dev, u32 queue_index)
> +{
> + if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
> + WARN(1, "Driver promised not to register this");
> + return -EINVAL;
> + }
> +
> + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
> + WARN(1, "Missing unregister, handled but fix driver");
> + xdp_rxq_info_unreg(xdp_rxq);
> + }
> +
> + if (!dev) {
Seems a bit paranoid, driver passing a NULL dev would be
badly broken. And probably not important but could make
above tests unlikely().
> + WARN(1, "Missing net_device from driver");
> + return -ENODEV;
> + }
> +
> + /* State either UNREGISTERED or NEW */
> + xdp_rxq_info_init(xdp_rxq);
> + xdp_rxq->dev = dev;
> + xdp_rxq->queue_index = queue_index;
> +
> + xdp_rxq->reg_state = REG_STATE_REGISTERED;
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
> +
> +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
> +{
> + xdp_rxq->reg_state = REG_STATE_UNUSED;
> +}
> +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
>
^ permalink raw reply
* Re: [bpf-next V4 PATCH 07/14] bnxt_en: setup xdp_rxq_info
From: Michael Chan @ 2018-01-05 4:35 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Daniel Borkmann, Alexei Starovoitov, Netdev, David Ahern,
Björn Töpel, Andy Gospodarek, Andy Gospodarek
In-Reply-To: <151497514422.18176.14804839354011251645.stgit@firesoul>
On Wed, Jan 3, 2018 at 2:25 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> Driver hook points for xdp_rxq_info:
> * reg : bnxt_alloc_rx_rings
> * unreg: bnxt_free_rx_rings
>
> This driver should be updated to re-register when changing
> allocation mode of RX rings.
>
> Tested on actual hardware.
>
> Cc: Andy Gospodarek <andy@greyhouse.net>
> Cc: Michael Chan <michael.chan@broadcom.com>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Looks good, thanks. Just one comment below.
> ---
> drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 ++++++++++
> drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++
> drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 +
> 3 files changed, 13 insertions(+)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 9efbdc6f1fcb..89c3c8760a78 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -2247,6 +2247,9 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> if (rxr->xdp_prog)
> bpf_prog_put(rxr->xdp_prog);
>
> + if (xdp_rxq_info_is_reg(&rxr->xdp_rxq))
> + xdp_rxq_info_unreg(&rxr->xdp_rxq);
> +
> kfree(rxr->rx_tpa);
> rxr->rx_tpa = NULL;
>
> @@ -2280,6 +2283,10 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
>
> ring = &rxr->rx_ring_struct;
>
> + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i);
We can optimize it a bit by only making this call if
BNXT_RX_PAGE_MODE() is true. We can only run XDP if
BNXT_RX_PAGE_MODE() is true.
> + if (rc < 0)
> + return rc;
> +
> rc = bnxt_alloc_ring(bp, ring);
> if (rc)
> return rc;
^ permalink raw reply
* Re: [bpf-next V4 PATCH 13/14] bpf: finally expose xdp_rxq_info to XDP bpf-programs
From: John Fastabend @ 2018-01-05 4:34 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov
Cc: netdev, Alexei Starovoitov, dsahern, gospo, bjorn.topel,
michael.chan
In-Reply-To: <151497517476.18176.6299242208330652860.stgit@firesoul>
On 01/03/2018 02:26 AM, Jesper Dangaard Brouer wrote:
> Now all XDP driver have been updated to setup xdp_rxq_info and assign
> this to xdp_buff->rxq. Thus, it is now safe to enable access to some
> of the xdp_rxq_info struct members.
>
> This patch extend xdp_md and expose UAPI to userspace for
> ingress_ifindex and rx_queue_index. Access happens via bpf
> instruction rewrite, that load data directly from struct xdp_rxq_info.
>
> * ingress_ifindex map to xdp_rxq_info->dev->ifindex
> * rx_queue_index map to xdp_rxq_info->queue_index
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> ---
LGTM
Acked-by: John Fastabend <john.fastabend@gmail.com>
^ permalink raw reply
* Re: [Intel-wired-lan] [bpf-next V4 PATCH 03/14] i40e: setup xdp_rxq_info
From: John Fastabend @ 2018-01-05 4:17 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov
Cc: Paul Menzel, netdev, dsahern, intel-wired-lan, gospo, bjorn.topel,
michael.chan
In-Reply-To: <151497512387.18176.16261505821783867005.stgit@firesoul>
On 01/03/2018 02:25 AM, Jesper Dangaard Brouer wrote:
> The i40e driver has a special "FDIR" RX-ring (I40E_VSI_FDIR) which is
> a sideband channel for configuring/updating the flow director tables.
> This (i40e_vsi_)type does not invoke XDP-ebpf code.
>
> As suggested by Björn (V2): Instead of marking this I40E_VSI_FDIR RX-ring
> a special case, reverse the logic and only select RX-rings of type
> I40E_VSI_MAIN to register xdp_rxq_info's for.
>
> Driver hook points for xdp_rxq_info:
> * reg : i40e_setup_rx_descriptors (via i40e_vsi_setup_rx_resources)
> * unreg: i40e_free_rx_resources (via i40e_vsi_free_rx_resources)
>
> Tested on actual hardware with samples/bpf program.
>
> V2: Fixed bug in i40e_set_ringparam (memset zero) + match on I40E_VSI_MAIN.
> V4: Update patch desc that got out-of-sync with code.
>
> Cc: intel-wired-lan@lists.osuosl.org
> Cc: Björn Töpel <bjorn.topel@intel.com>
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Cc: Paul Menzel <pmenzel@molgen.mpg.de>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
> ---
Same here. LGTM.
Acked-by: John Fastabend <john.fastabend@gmail.com>
^ permalink raw reply
* Re: [Intel-wired-lan] [bpf-next V4 PATCH 04/14] ixgbe: setup xdp_rxq_info
From: John Fastabend @ 2018-01-05 4:13 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov
Cc: netdev, dsahern, intel-wired-lan, gospo, bjorn.topel,
michael.chan
In-Reply-To: <151497512896.18176.14287471693450621693.stgit@firesoul>
On 01/03/2018 02:25 AM, Jesper Dangaard Brouer wrote:
> Driver hook points for xdp_rxq_info:
> * reg : ixgbe_setup_rx_resources()
> * unreg: ixgbe_free_rx_resources()
>
> Tested on actual hardware.
>
> V2: Fix ixgbe_set_ringparam, clear xdp_rxq_info in temp_ring
>
> Cc: intel-wired-lan@lists.osuosl.org
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Cc: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
Looked a bit for reset paths that might be missed but didn't
find any. LGTM.
Acked-by: John Fastabend <john.fastabend@gmail.com>
^ permalink raw reply
* [PATCH NET-NEXT 1/1] forcedeth: remove duplicate structure member in rx
From: Zhu Yanjun @ 2018-01-05 4:06 UTC (permalink / raw)
To: keescook, netdev
Since both first_rx and rx_ring are the head of rx ring, it not
necessary to use two structure members to statically indicate
the head of rx ring. So first_rx is removed.
CC: Srinivas Eeda <srinivas.eeda@oracle.com>
CC: Joe Jin <joe.jin@oracle.com>
CC: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index a79b9f8..21e15cb 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -793,7 +793,7 @@ struct fe_priv {
/* rx specific fields.
* Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
*/
- union ring_type get_rx, put_rx, first_rx, last_rx;
+ union ring_type get_rx, put_rx, last_rx;
struct nv_skb_map *get_rx_ctx, *put_rx_ctx;
struct nv_skb_map *first_rx_ctx, *last_rx_ctx;
struct nv_skb_map *rx_skb;
@@ -1812,7 +1812,7 @@ static int nv_alloc_rx(struct net_device *dev)
struct ring_desc *less_rx;
less_rx = np->get_rx.orig;
- if (less_rx-- == np->first_rx.orig)
+ if (less_rx-- == np->rx_ring.orig)
less_rx = np->last_rx.orig;
while (np->put_rx.orig != less_rx) {
@@ -1833,7 +1833,7 @@ static int nv_alloc_rx(struct net_device *dev)
wmb();
np->put_rx.orig->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX_AVAIL);
if (unlikely(np->put_rx.orig++ == np->last_rx.orig))
- np->put_rx.orig = np->first_rx.orig;
+ np->put_rx.orig = np->rx_ring.orig;
if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
np->put_rx_ctx = np->first_rx_ctx;
} else {
@@ -1853,7 +1853,7 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
struct ring_desc_ex *less_rx;
less_rx = np->get_rx.ex;
- if (less_rx-- == np->first_rx.ex)
+ if (less_rx-- == np->rx_ring.ex)
less_rx = np->last_rx.ex;
while (np->put_rx.ex != less_rx) {
@@ -1875,7 +1875,7 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
wmb();
np->put_rx.ex->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX2_AVAIL);
if (unlikely(np->put_rx.ex++ == np->last_rx.ex))
- np->put_rx.ex = np->first_rx.ex;
+ np->put_rx.ex = np->rx_ring.ex;
if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
np->put_rx_ctx = np->first_rx_ctx;
} else {
@@ -1903,7 +1903,8 @@ static void nv_init_rx(struct net_device *dev)
struct fe_priv *np = netdev_priv(dev);
int i;
- np->get_rx = np->put_rx = np->first_rx = np->rx_ring;
+ np->get_rx = np->rx_ring;
+ np->put_rx = np->rx_ring;
if (!nv_optimized(np))
np->last_rx.orig = &np->rx_ring.orig[np->rx_ring_size-1];
@@ -2911,7 +2912,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
u64_stats_update_end(&np->swstats_rx_syncp);
next_pkt:
if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
- np->get_rx.orig = np->first_rx.orig;
+ np->get_rx.orig = np->rx_ring.orig;
if (unlikely(np->get_rx_ctx++ == np->last_rx_ctx))
np->get_rx_ctx = np->first_rx_ctx;
@@ -3000,7 +3001,7 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
}
next_pkt:
if (unlikely(np->get_rx.ex++ == np->last_rx.ex))
- np->get_rx.ex = np->first_rx.ex;
+ np->get_rx.ex = np->rx_ring.ex;
if (unlikely(np->get_rx_ctx++ == np->last_rx_ctx))
np->get_rx_ctx = np->first_rx_ctx;
--
2.7.4
^ permalink raw reply related
* [bpf-net PATCH v2] bpf: sockmap missing NULL psock check
From: John Fastabend @ 2018-01-05 4:02 UTC (permalink / raw)
To: borkmann, alexei.starovoitov; +Cc: netdev
Add psock NULL check to handle a racing sock event that can get the
sk_callback_lock before this case but after xchg happens causing the
refcnt to hit zero and sock user data (psock) to be null and queued
for garbage collection.
Also add a comment in the code because this is a bit subtle and
not obvious in my opinion.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41..1712d31 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
- smap_list_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ }
write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
^ permalink raw reply related
* [PATCH net-next V2 2/2] tun: allow to attach ebpf socket filter
From: Jason Wang @ 2018-01-05 3:54 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1515124488-396-1-git-send-email-jasowang@redhat.com>
This patch allows userspace to attach eBPF filter to tun. This will
allow to implement VM dataplane filtering in a more efficient way
compared to cBPF filter by allowing either qemu or libvirt to
attach eBPF filter to tun.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 39 +++++++++++++++++++++++++++++++++++----
include/uapi/linux/if_tun.h | 1 +
2 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0853829..9fc8b70 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -238,6 +238,12 @@ struct tun_struct {
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
+ struct tun_prog __rcu *filter_prog;
+};
+
+struct veth {
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
};
static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -984,12 +990,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
#endif
}
+static unsigned int run_ebpf_filter(struct tun_struct *tun,
+ struct sk_buff *skb,
+ int len)
+{
+ struct tun_prog *prog = rcu_dereference(tun->filter_prog);
+
+ if (prog)
+ len = bpf_prog_run_clear_cb(prog->prog, skb);
+
+ return len;
+}
+
/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
+ int len = skb->len;
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
@@ -1015,6 +1034,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
sk_filter(tfile->socket.sk, skb))
goto drop;
+ len = run_ebpf_filter(tun, skb, len);
+
+ /* Trim extra bytes since we may inster vlan proto & TCI
+ * in tun_put_user().
+ */
+ if (skb_vlan_tag_present(skb))
+ len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0;
+ if (len <= 0 || pskb_trim(skb, len))
+ goto drop;
+
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
@@ -1904,10 +1933,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
if (vlan_hlen) {
int ret;
- struct {
- __be16 h_vlan_proto;
- __be16 h_vlan_TCI;
- } veth;
+ struct veth veth;
veth.h_vlan_proto = skb->vlan_proto;
veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
@@ -2068,6 +2094,7 @@ static void tun_free_netdev(struct net_device *dev)
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
__tun_set_ebpf(tun, &tun->steering_prog, NULL);
+ __tun_set_ebpf(tun, &tun->filter_prog, NULL);
}
static void tun_setup(struct net_device *dev)
@@ -2849,6 +2876,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
break;
+ case TUNSETFILTEREBPF:
+ ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
+ break;
+
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index fb38c17..ee432cd 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -58,6 +58,7 @@
#define TUNSETVNETBE _IOW('T', 222, int)
#define TUNGETVNETBE _IOR('T', 223, int)
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
+#define TUNSETFILTEREBPF _IOR('T', 225, int)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V2 1/2] tuntap: rename struct tun_steering_prog to struct tun_prog
From: Jason Wang @ 2018-01-05 3:54 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1515124488-396-1-git-send-email-jasowang@redhat.com>
To be reused by other eBPF program other than queue selection.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e367d631..0853829 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -195,7 +195,7 @@ struct tun_flow_entry {
#define TUN_NUM_FLOW_ENTRIES 1024
-struct tun_steering_prog {
+struct tun_prog {
struct rcu_head rcu;
struct bpf_prog *prog;
};
@@ -237,7 +237,7 @@ struct tun_struct {
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
- struct tun_steering_prog __rcu *steering_prog;
+ struct tun_prog __rcu *steering_prog;
};
static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -571,7 +571,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
- struct tun_steering_prog *prog;
+ struct tun_prog *prog;
u16 ret = 0;
prog = rcu_dereference(tun->steering_prog);
@@ -2027,19 +2027,18 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
-static void tun_steering_prog_free(struct rcu_head *rcu)
+static void tun_prog_free(struct rcu_head *rcu)
{
- struct tun_steering_prog *prog = container_of(rcu,
- struct tun_steering_prog, rcu);
+ struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
bpf_prog_destroy(prog->prog);
kfree(prog);
}
-static int __tun_set_steering_ebpf(struct tun_struct *tun,
- struct bpf_prog *prog)
+static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+ struct bpf_prog *prog)
{
- struct tun_steering_prog *old, *new = NULL;
+ struct tun_prog *old, *new = NULL;
if (prog) {
new = kmalloc(sizeof(*new), GFP_KERNEL);
@@ -2049,13 +2048,13 @@ static int __tun_set_steering_ebpf(struct tun_struct *tun,
}
spin_lock_bh(&tun->lock);
- old = rcu_dereference_protected(tun->steering_prog,
+ old = rcu_dereference_protected(*prog_p,
lockdep_is_held(&tun->lock));
- rcu_assign_pointer(tun->steering_prog, new);
+ rcu_assign_pointer(*prog_p, new);
spin_unlock_bh(&tun->lock);
if (old)
- call_rcu(&old->rcu, tun_steering_prog_free);
+ call_rcu(&old->rcu, tun_prog_free);
return 0;
}
@@ -2068,7 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
- __tun_set_steering_ebpf(tun, NULL);
+ __tun_set_ebpf(tun, &tun->steering_prog, NULL);
}
static void tun_setup(struct net_device *dev)
@@ -2550,7 +2549,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
return ret;
}
-static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+ void __user *data)
{
struct bpf_prog *prog;
int fd;
@@ -2566,7 +2566,7 @@ static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
return PTR_ERR(prog);
}
- return __tun_set_steering_ebpf(tun, prog);
+ return __tun_set_ebpf(tun, prog_p, prog);
}
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
@@ -2846,7 +2846,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break;
case TUNSETSTEERINGEBPF:
- ret = tun_set_steering_ebpf(tun, argp);
+ ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
break;
default:
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V2 0/2] tun: allow to attahc eBPF filter
From: Jason Wang @ 2018-01-05 3:54 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
Hi all:
This series tries to implement eBPF socket filter for tun. This could
be used for implementing efficient virtio-net receive filter for
vhost-net.
Thanks
Changes from V1:
- trim more bytes if vlan tag is existed to make sure the packet
length does not exceed what is allowed by the filter.
Jason Wang (2):
tuntap: rename struct tun_steering_prog to struct tun_prog
tun: allow to attach ebpf socket filter
drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++-------------
include/uapi/linux/if_tun.h | 1 +
2 files changed, 52 insertions(+), 20 deletions(-)
--
2.7.4
^ permalink raw reply
* Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing
From: Pravin Shelar @ 2018-01-05 3:36 UTC (permalink / raw)
To: Ed Swierk
Cc: ovs-dev, Linux Kernel Network Developers, Benjamin Warren,
Keith Holleman
In-Reply-To: <CAO_EM_=J-1s4DPTt6WpBPD2eC2Ek13THQ=6jYeuB0mZB41tpvQ@mail.gmail.com>
On Wed, Jan 3, 2018 at 7:49 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>>> included in the L3 length. For example, a short IPv4 packet may have
>>> up to 6 bytes of padding following the IP payload when received on an
>>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>>> conntrack and nat).
>>>
>>> In the IPv6 receive path, ip6_rcv() does the same using
>>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>>> length before invoking NF_INET_PRE_ROUTING hooks.
>>>
>>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>>> the L3 header but does not trim it to the L3 length before calling
>>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>>> encounters a packet with lower-layer padding, nf_checksum() fails and
>>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>>> affect the checksum, the length in the IP pseudoheader does. That
>>> length is based on skb->len, and without trimming, it doesn't match
>>> the length the sender used when computing the checksum.
>>>
>>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>>> reflects the length of the L3 header and payload, so there is no need
>>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>>
>>> This change brings OVS into line with other netfilter users, trimming
>>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>>
>>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>>> ---
>>> v2:
>>> - Trim packet in nat receive path as well as conntrack
>>> - Free skb on error
>>> ---
>>> net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>> 1 file changed, 34 insertions(+)
>>>
>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>> index b27c5c6..1bdc78f 100644
>>> --- a/net/openvswitch/conntrack.c
>>> +++ b/net/openvswitch/conntrack.c
>>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>> return ct_executed;
>>> }
>>>
>>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>>> + * the L3 header. The skb is freed on error.
>>> + */
>>> +static int skb_trim_l3(struct sk_buff *skb)
>>> +{
>>> + unsigned int nh_len;
>>> + int err;
>>> +
>>> + switch (skb->protocol) {
>>> + case htons(ETH_P_IP):
>>> + nh_len = ntohs(ip_hdr(skb)->tot_len);
>>> + break;
>>> + case htons(ETH_P_IPV6):
>>> + nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>>> + + sizeof(struct ipv6hdr);
>>> + break;
>>> + default:
>>> + nh_len = skb->len;
>>> + }
>>> +
>>> + err = pskb_trim_rcsum(skb, nh_len);
>>> + if (err)
>> This should is unlikely.
>>> + kfree_skb(skb);
>>> +
>>> + return err;
>>> +}
>>> +
>> This looks like a generic function, it probably does not belong to OVS
>> code base.
>
> It occurs to me that skb_trim_l3() can't just reach into ip_hdr(skb)
> before calling pskb_may_pull(skb, sizeof(struct iphdr)) to make sure
> the IP header is actually there; and for IPv4 it should validate the
> IP header checksum, including options. Once we add all these steps,
> skb_trim_l3() starts to look an awful lot like br_validate_ipv4() and
> br_validate_ipv6(). And those in turn are eerily similar to ip_rcv()
> and ip6_rcv(). It would be nice to avoid duplicating this logic yet
> again.
>
OVS already pull all required headers in skb linear data, so no need
to redo all of it. only check required is the ip-checksum validation.
I think we could avoid it in most of cases by checking skb length to
ipheader length before verifying the ip header-checksum.
^ permalink raw reply
* Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing
From: Pravin Shelar @ 2018-01-05 3:36 UTC (permalink / raw)
To: Ed Swierk
Cc: ovs-dev, Linux Kernel Network Developers, Benjamin Warren,
Keith Holleman
In-Reply-To: <CAO_EM_=J-1s4DPTt6WpBPD2eC2Ek13THQ=6jYeuB0mZB41tpvQ@mail.gmail.com>
On Wed, Jan 3, 2018 at 7:49 PM, Ed Swierk <eswierk@skyportsystems.com> wrote:
> On Fri, Dec 22, 2017 at 3:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
>> On Thu, Dec 21, 2017 at 7:17 AM, Ed Swierk <eswierk@skyportsystems.com> wrote:
>>> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
>>> included in the L3 length. For example, a short IPv4 packet may have
>>> up to 6 bytes of padding following the IP payload when received on an
>>> Ethernet device. In the normal IPv4 receive path, ip_rcv() trims the
>>> packet to ip_hdr->tot_len before invoking netfilter hooks (including
>>> conntrack and nat).
>>>
>>> In the IPv6 receive path, ip6_rcv() does the same using
>>> ipv6_hdr->payload_len. Similarly in the br_netfilter receive path,
>>> br_validate_ipv4() and br_validate_ipv6() trim the packet to the L3
>>> length before invoking NF_INET_PRE_ROUTING hooks.
>>>
>>> In the OVS conntrack receive path, ovs_ct_execute() pulls the skb to
>>> the L3 header but does not trim it to the L3 length before calling
>>> nf_conntrack_in(NF_INET_PRE_ROUTING). When nf_conntrack_proto_tcp
>>> encounters a packet with lower-layer padding, nf_checksum() fails and
>>> logs "nf_ct_tcp: bad TCP checksum". While extra zero bytes don't
>>> affect the checksum, the length in the IP pseudoheader does. That
>>> length is based on skb->len, and without trimming, it doesn't match
>>> the length the sender used when computing the checksum.
>>>
>>> The assumption throughout nf_conntrack and nf_nat is that skb->len
>>> reflects the length of the L3 header and payload, so there is no need
>>> to refer back to ip_hdr->tot_len or ipv6_hdr->payload_len.
>>>
>>> This change brings OVS into line with other netfilter users, trimming
>>> IPv4 and IPv6 packets prior to L3+ netfilter processing.
>>>
>>> Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
>>> ---
>>> v2:
>>> - Trim packet in nat receive path as well as conntrack
>>> - Free skb on error
>>> ---
>>> net/openvswitch/conntrack.c | 34 ++++++++++++++++++++++++++++++++++
>>> 1 file changed, 34 insertions(+)
>>>
>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>> index b27c5c6..1bdc78f 100644
>>> --- a/net/openvswitch/conntrack.c
>>> +++ b/net/openvswitch/conntrack.c
>>> @@ -703,6 +703,33 @@ static bool skb_nfct_cached(struct net *net,
>>> return ct_executed;
>>> }
>>>
>>> +/* Trim the skb to the L3 length. Assumes the skb is already pulled to
>>> + * the L3 header. The skb is freed on error.
>>> + */
>>> +static int skb_trim_l3(struct sk_buff *skb)
>>> +{
>>> + unsigned int nh_len;
>>> + int err;
>>> +
>>> + switch (skb->protocol) {
>>> + case htons(ETH_P_IP):
>>> + nh_len = ntohs(ip_hdr(skb)->tot_len);
>>> + break;
>>> + case htons(ETH_P_IPV6):
>>> + nh_len = ntohs(ipv6_hdr(skb)->payload_len)
>>> + + sizeof(struct ipv6hdr);
>>> + break;
>>> + default:
>>> + nh_len = skb->len;
>>> + }
>>> +
>>> + err = pskb_trim_rcsum(skb, nh_len);
>>> + if (err)
>> This should is unlikely.
>>> + kfree_skb(skb);
>>> +
>>> + return err;
>>> +}
>>> +
>> This looks like a generic function, it probably does not belong to OVS
>> code base.
>
> It occurs to me that skb_trim_l3() can't just reach into ip_hdr(skb)
> before calling pskb_may_pull(skb, sizeof(struct iphdr)) to make sure
> the IP header is actually there; and for IPv4 it should validate the
> IP header checksum, including options. Once we add all these steps,
> skb_trim_l3() starts to look an awful lot like br_validate_ipv4() and
> br_validate_ipv6(). And those in turn are eerily similar to ip_rcv()
> and ip6_rcv(). It would be nice to avoid duplicating this logic yet
> again.
>
OVS already pull all required headers in skb linear data, so no need
to redo all of it. only check required is the ip-checksum validation.
I think we could avoid it in most of cases by checking skb length to
ipheader length before verifying the ip header-checksum.
^ permalink raw reply
* Re: [PATCH iproute2-next v1 3/9] rdma: Add filtering infrastructure
From: David Ahern @ 2018-01-05 3:29 UTC (permalink / raw)
To: Leon Romanovsky, Doug Ledford, Jason Gunthorpe
Cc: RDMA mailing list, netdev, Stephen Hemminger, Leon Romanovsky
In-Reply-To: <20180104070150.15625-4-leon@kernel.org>
On 1/4/18 12:01 AM, Leon Romanovsky wrote:
> diff --git a/rdma/utils.c b/rdma/utils.c
> index af2b374d..446c23da 100644
> --- a/rdma/utils.c
> +++ b/rdma/utils.c
> @@ -114,6 +114,225 @@ static void dev_map_cleanup(struct rd *rd)
> }
> }
>
> +static int add_filter(struct rd *rd, char *key, char *value,
> + const char * const valid_filters[])
> +{
> + char cset[] = "1234567890,-";
> + struct filter_entry *fe;
> + bool key_found = false;
> + int idx = 0;
> + int ret;
> +
> + fe = calloc(1, sizeof(*fe));
> + if (!fe)
> + return -ENOMEM;
> +
> + while (valid_filters[idx]) {
> + if (!strcmpx(key, valid_filters[idx])) {
> + key_found = true;
> + break;
> + }
> + idx++;
> + }
> + if (!key_found) {
> + pr_err("Unsupported filter option: %s\n", key);
> + ret = -EINVAL;
> + goto err;
> + }
> +
> + /*
> + * Check the filter validity, not optimal, but works
> + *
> + * Actually, there are three types of filters
> + * numeric - for example PID or QPN
> + * string - for example states, currently only "display"
> + * is from this type
> + * link - user requested to filter on specific link
> + * e.g. mlx5_1/1, mlx5_1/-, mlx5_1 ...
> + */
> + if (strcmpx(key, "link") && strcmpx(key, "display") &&
> + strspn(value, cset) != strlen(value)) {
> + pr_err("%s filter accepts \"%s\" characters only\n", key, cset);
> + ret = -EINVAL;
> + goto err;
> + }
> +
> + fe->key = strdup(key);
> + fe->value = strdup(value);
> +
Missed this the other day. 2 more strdup's that should be checked.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox