* [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-06 20:10 ` Stanislav Fomichev
2023-03-06 23:16 ` Martin KaFai Lau
2023-03-03 1:21 ` [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
` (6 subsequent siblings)
7 siblings, 2 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
The refcount of the kvalue for struct_ops was quite intricate to keep
track of. By no longer utilizing it and replacing it with the refcount
from the struct_ops map, this process became more transparent and
uncomplicated.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 3 ++
kernel/bpf/bpf_struct_ops.c | 84 ++++++++++++++++++++++---------------
kernel/bpf/syscall.c | 22 ++++++----
3 files changed, 68 insertions(+), 41 deletions(-)
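A note on where the user-visible refcnt now comes from (a sketch of the
reasoning, not new code; map->refcnt and map->usercnt are the existing
bpf_map fields):

/* Every user-space reference (an fd, a pinned path) bumps both
 * map->refcnt and map->usercnt, while kernel-side users such as a
 * registered tcp_cc bump only map->refcnt. The kernel-side use count
 * reported by bpf_struct_ops_map_sys_lookup_elem() is therefore:
 *
 *	refcnt = atomic64_read(&map->refcnt) -
 *		 atomic64_read(&map->usercnt);
 *
 * clamped at 0, since the two counters are read without a lock and
 * may be momentarily inconsistent.
 */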
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8b5d0b4c4ada..cb837f42b99d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -78,6 +78,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1869,8 +1870,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
+void bpf_map_free_deferred(struct work_struct *work);
void bpf_map_put(struct bpf_map *map);
void *bpf_map_area_alloc(u64 size, int numa_node);
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ece9870cab68..bba03b6b010b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -261,13 +264,13 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
return 0;
}
- /* No lock is needed. state and refcnt do not need
- * to be updated together under atomic context.
- */
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt,
+ refcnt > 0 ? refcnt : 0);
return 0;
}
@@ -491,7 +494,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
set_memory_rox((long)st_map->image, 1);
@@ -536,8 +538,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -582,6 +583,38 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free_wq(struct rcu_head *head)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+
+ /* bpf_map_free_deferred should not be called in a RCU callback. */
+ INIT_WORK(&st_map->map.work, bpf_map_free_deferred);
+ queue_work(system_unbound_wq, &st_map->map.work);
+}
+
+static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ /* Wait for a grace period of RCU. Then, post the map_free
+ * work to the system_unbound_wq workqueue to free resources.
+ *
+ * The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * Thus, a rcu grace period is needed here.
+ */
+ call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
@@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
+ .map_free_rcu = bpf_struct_ops_map_free_rcu,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
@@ -660,41 +694,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
- struct bpf_struct_ops_map *st_map;
-
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
- bpf_map_put(&st_map->map);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
-
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
- */
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
- }
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ bpf_map_put(&st_map->map);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cda8d00f3762..358a0e40555e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -684,7 +684,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
}
/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_field_offs *foffs = map->field_offs;
@@ -715,6 +715,15 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
}
+static void bpf_map_put_wq(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
+}
+
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -724,11 +733,10 @@ void bpf_map_put(struct bpf_map *map)
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
btf_put(map->btf);
- INIT_WORK(&map->work, bpf_map_free_deferred);
- /* Avoid spawning kworkers, since they all might contend
- * for the same mutex like slab_mutex.
- */
- queue_work(system_unbound_wq, &map->work);
+ if (map->ops->map_free_rcu)
+ map->ops->map_free_rcu(map);
+ else
+ bpf_map_put_wq(map);
}
}
EXPORT_SYMBOL_GPL(bpf_map_put);
@@ -1276,7 +1284,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
--
2.30.2
* Re: [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-03 1:21 ` [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly Kui-Feng Lee
@ 2023-03-06 20:10 ` Stanislav Fomichev
2023-03-06 21:45 ` Kui-Feng Lee
2023-03-06 23:16 ` Martin KaFai Lau
1 sibling, 1 reply; 20+ messages in thread
From: Stanislav Fomichev @ 2023-03-06 20:10 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii
On 03/02, Kui-Feng Lee wrote:
> The refcount of the kvalue for struct_ops was quite intricate to keep
> track of. By no longer utilizing it and replacing it with the refcount
> from the struct_ops map, this process became more transparent and
> uncomplicated.
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> include/linux/bpf.h | 3 ++
> kernel/bpf/bpf_struct_ops.c | 84 ++++++++++++++++++++++---------------
> kernel/bpf/syscall.c | 22 ++++++----
> 3 files changed, 68 insertions(+), 41 deletions(-)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8b5d0b4c4ada..cb837f42b99d 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -78,6 +78,7 @@ struct bpf_map_ops {
> struct bpf_map *(*map_alloc)(union bpf_attr *attr);
> void (*map_release)(struct bpf_map *map, struct file *map_file);
> void (*map_free)(struct bpf_map *map);
> + void (*map_free_rcu)(struct bpf_map *map);
> int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
> void (*map_release_uref)(struct bpf_map *map);
> void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
> @@ -1869,8 +1870,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
> struct bpf_map *__bpf_map_get(struct fd f);
> void bpf_map_inc(struct bpf_map *map);
> void bpf_map_inc_with_uref(struct bpf_map *map);
> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
> struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
> void bpf_map_put_with_uref(struct bpf_map *map);
> +void bpf_map_free_deferred(struct work_struct *work);
> void bpf_map_put(struct bpf_map *map);
> void *bpf_map_area_alloc(u64 size, int numa_node);
> void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index ece9870cab68..bba03b6b010b 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
> struct bpf_struct_ops_value kvalue;
> };
> +static DEFINE_MUTEX(update_mutex);
Defined but unused?
> +
> #define VALUE_PREFIX "bpf_struct_ops_"
> #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
> @@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map
> *map, void *key,
> struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
> struct bpf_struct_ops_value *uvalue, *kvalue;
> enum bpf_struct_ops_state state;
> + s64 refcnt;
> if (unlikely(*(u32 *)key != 0))
> return -ENOENT;
> @@ -261,13 +264,13 @@ int bpf_struct_ops_map_sys_lookup_elem(struct
> bpf_map *map, void *key,
> return 0;
> }
> - /* No lock is needed. state and refcnt do not need
> - * to be updated together under atomic context.
> - */
> uvalue = value;
> memcpy(uvalue, st_map->uvalue, map->value_size);
> uvalue->state = state;
> - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
> +
> + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
> + refcount_set(&uvalue->refcnt,
> + refcnt > 0 ? refcnt : 0);
> return 0;
> }
> @@ -491,7 +494,6 @@ static int bpf_struct_ops_map_update_elem(struct
> bpf_map *map, void *key,
> *(unsigned long *)(udata + moff) = prog->aux->id;
> }
> - refcount_set(&kvalue->refcnt, 1);
> bpf_map_inc(map);
> set_memory_rox((long)st_map->image, 1);
> @@ -536,8 +538,7 @@ static int bpf_struct_ops_map_delete_elem(struct
> bpf_map *map, void *key)
> switch (prev_state) {
> case BPF_STRUCT_OPS_STATE_INUSE:
> st_map->st_ops->unreg(&st_map->kvalue.data);
> - if (refcount_dec_and_test(&st_map->kvalue.refcnt))
> - bpf_map_put(map);
> + bpf_map_put(map);
> return 0;
> case BPF_STRUCT_OPS_STATE_TOBEFREE:
> return -EINPROGRESS;
> @@ -582,6 +583,38 @@ static void bpf_struct_ops_map_free(struct bpf_map
> *map)
> bpf_map_area_free(st_map);
> }
> +static void bpf_struct_ops_map_free_wq(struct rcu_head *head)
> +{
> + struct bpf_struct_ops_map *st_map;
> +
> + st_map = container_of(head, struct bpf_struct_ops_map, rcu);
> +
> + /* bpf_map_free_deferred should not be called in a RCU callback. */
> + INIT_WORK(&st_map->map.work, bpf_map_free_deferred);
> + queue_work(system_unbound_wq, &st_map->map.work);
> +}
> +
> +static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
> +{
> + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
> +
> + /* Wait for a grace period of RCU. Then, post the map_free
> + * work to the system_unbound_wq workqueue to free resources.
> + *
> + * The struct_ops's function may switch to another struct_ops.
> + *
> + * For example, bpf_tcp_cc_x->init() may switch to
> + * another tcp_cc_y by calling
> + * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> + * and its refcount may reach 0 which then free its
> + * trampoline image while tcp_cc_x is still running.
"is still running" where? Why existing deferred work doesn't protect
against this condition?
> + *
> + * Thus, a rcu grace period is needed here.
> + */
> + call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);
> +}
> +
> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
> {
> if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> @@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
> .map_alloc_check = bpf_struct_ops_map_alloc_check,
> .map_alloc = bpf_struct_ops_map_alloc,
[..]
> .map_free = bpf_struct_ops_map_free,
Since we have map_free_rcu check in bpf_map_put, does it mean the above
is not needed?
> + .map_free_rcu = bpf_struct_ops_map_free_rcu,
> .map_get_next_key = bpf_struct_ops_map_get_next_key,
> .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
> .map_delete_elem = bpf_struct_ops_map_delete_elem,
> @@ -660,41 +694,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
> bool bpf_struct_ops_get(const void *kdata)
> {
> struct bpf_struct_ops_value *kvalue;
> + struct bpf_struct_ops_map *st_map;
> + struct bpf_map *map;
> kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
> + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
> - return refcount_inc_not_zero(&kvalue->refcnt);
> -}
> -
> -static void bpf_struct_ops_put_rcu(struct rcu_head *head)
> -{
> - struct bpf_struct_ops_map *st_map;
> -
> - st_map = container_of(head, struct bpf_struct_ops_map, rcu);
> - bpf_map_put(&st_map->map);
> + map = __bpf_map_inc_not_zero(&st_map->map, false);
> + return !IS_ERR(map);
> }
> void bpf_struct_ops_put(const void *kdata)
> {
> struct bpf_struct_ops_value *kvalue;
> + struct bpf_struct_ops_map *st_map;
> kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
> - if (refcount_dec_and_test(&kvalue->refcnt)) {
> - struct bpf_struct_ops_map *st_map;
> -
> - st_map = container_of(kvalue, struct bpf_struct_ops_map,
> - kvalue);
> - /* The struct_ops's function may switch to another struct_ops.
> - *
> - * For example, bpf_tcp_cc_x->init() may switch to
> - * another tcp_cc_y by calling
> - * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> - * and its map->refcnt may reach 0 which then free its
> - * trampoline image while tcp_cc_x is still running.
> - *
> - * Thus, a rcu grace period is needed here.
> - */
> - call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
> - }
> + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
> +
> + bpf_map_put(&st_map->map);
> }
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cda8d00f3762..358a0e40555e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -684,7 +684,7 @@ void bpf_obj_free_fields(const struct btf_record
> *rec, void *obj)
> }
> /* called from workqueue */
> -static void bpf_map_free_deferred(struct work_struct *work)
> +void bpf_map_free_deferred(struct work_struct *work)
> {
> struct bpf_map *map = container_of(work, struct bpf_map, work);
> struct btf_field_offs *foffs = map->field_offs;
> @@ -715,6 +715,15 @@ static void bpf_map_put_uref(struct bpf_map *map)
> }
> }
> +static void bpf_map_put_wq(struct bpf_map *map)
> +{
> + INIT_WORK(&map->work, bpf_map_free_deferred);
> + /* Avoid spawning kworkers, since they all might contend
> + * for the same mutex like slab_mutex.
> + */
> + queue_work(system_unbound_wq, &map->work);
> +}
> +
> /* decrement map refcnt and schedule it for freeing via workqueue
> * (underlying map implementation ops->map_free() might sleep)
> */
> @@ -724,11 +733,10 @@ void bpf_map_put(struct bpf_map *map)
> /* bpf_map_free_id() must be called first */
> bpf_map_free_id(map);
> btf_put(map->btf);
> - INIT_WORK(&map->work, bpf_map_free_deferred);
> - /* Avoid spawning kworkers, since they all might contend
> - * for the same mutex like slab_mutex.
> - */
> - queue_work(system_unbound_wq, &map->work);
> + if (map->ops->map_free_rcu)
> + map->ops->map_free_rcu(map);
> + else
> + bpf_map_put_wq(map);
> }
> }
> EXPORT_SYMBOL_GPL(bpf_map_put);
> @@ -1276,7 +1284,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
> }
> /* map_idr_lock should have been held */
> -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool
> uref)
> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
> {
> int refold;
> --
> 2.30.2
* Re: [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-06 20:10 ` Stanislav Fomichev
@ 2023-03-06 21:45 ` Kui-Feng Lee
0 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-06 21:45 UTC (permalink / raw)
To: Stanislav Fomichev, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii
On 3/6/23 12:10, Stanislav Fomichev wrote:
> On 03/02, Kui-Feng Lee wrote:
>> The refcount of the kvalue for struct_ops was quite intricate to keep
>> track of. By no longer utilizing it and replacing it with the refcount
>> from the struct_ops map, this process became more transparent and
>> uncomplicated.
>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> include/linux/bpf.h | 3 ++
>> kernel/bpf/bpf_struct_ops.c | 84 ++++++++++++++++++++++---------------
>> kernel/bpf/syscall.c | 22 ++++++----
>> 3 files changed, 68 insertions(+), 41 deletions(-)
>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 8b5d0b4c4ada..cb837f42b99d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -78,6 +78,7 @@ struct bpf_map_ops {
>> struct bpf_map *(*map_alloc)(union bpf_attr *attr);
>> void (*map_release)(struct bpf_map *map, struct file *map_file);
>> void (*map_free)(struct bpf_map *map);
>> + void (*map_free_rcu)(struct bpf_map *map);
>> int (*map_get_next_key)(struct bpf_map *map, void *key, void
>> *next_key);
>> void (*map_release_uref)(struct bpf_map *map);
>> void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
>> @@ -1869,8 +1870,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
>> struct bpf_map *__bpf_map_get(struct fd f);
>> void bpf_map_inc(struct bpf_map *map);
>> void bpf_map_inc_with_uref(struct bpf_map *map);
>> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
>> struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map
>> *map);
>> void bpf_map_put_with_uref(struct bpf_map *map);
>> +void bpf_map_free_deferred(struct work_struct *work);
>> void bpf_map_put(struct bpf_map *map);
>> void *bpf_map_area_alloc(u64 size, int numa_node);
>> void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index ece9870cab68..bba03b6b010b 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
>> struct bpf_struct_ops_value kvalue;
>> };
>
>> +static DEFINE_MUTEX(update_mutex);
>
> Defined but unused?
>
>> +
>> #define VALUE_PREFIX "bpf_struct_ops_"
>> #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
>
>> @@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct
>> bpf_map *map, void *key,
>> struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map
>> *)map;
>> struct bpf_struct_ops_value *uvalue, *kvalue;
>> enum bpf_struct_ops_state state;
>> + s64 refcnt;
>
>> if (unlikely(*(u32 *)key != 0))
>> return -ENOENT;
>> @@ -261,13 +264,13 @@ int bpf_struct_ops_map_sys_lookup_elem(struct
>> bpf_map *map, void *key,
>> return 0;
>> }
>
>> - /* No lock is needed. state and refcnt do not need
>> - * to be updated together under atomic context.
>> - */
>> uvalue = value;
>> memcpy(uvalue, st_map->uvalue, map->value_size);
>> uvalue->state = state;
>> - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
>> +
>> + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
>> + refcount_set(&uvalue->refcnt,
>> + refcnt > 0 ? refcnt : 0);
>
>> return 0;
>> }
>> @@ -491,7 +494,6 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> *(unsigned long *)(udata + moff) = prog->aux->id;
>> }
>
>> - refcount_set(&kvalue->refcnt, 1);
>> bpf_map_inc(map);
>
>> set_memory_rox((long)st_map->image, 1);
>> @@ -536,8 +538,7 @@ static int bpf_struct_ops_map_delete_elem(struct
>> bpf_map *map, void *key)
>> switch (prev_state) {
>> case BPF_STRUCT_OPS_STATE_INUSE:
>> st_map->st_ops->unreg(&st_map->kvalue.data);
>> - if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>> - bpf_map_put(map);
>> + bpf_map_put(map);
>> return 0;
>> case BPF_STRUCT_OPS_STATE_TOBEFREE:
>> return -EINPROGRESS;
>> @@ -582,6 +583,38 @@ static void bpf_struct_ops_map_free(struct
>> bpf_map *map)
>> bpf_map_area_free(st_map);
>> }
>
>> +static void bpf_struct_ops_map_free_wq(struct rcu_head *head)
>> +{
>> + struct bpf_struct_ops_map *st_map;
>> +
>> + st_map = container_of(head, struct bpf_struct_ops_map, rcu);
>> +
>> + /* bpf_map_free_deferred should not be called in a RCU callback. */
>> + INIT_WORK(&st_map->map.work, bpf_map_free_deferred);
>> + queue_work(system_unbound_wq, &st_map->map.work);
>> +}
>> +
>> +static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
>> +{
>> + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map
>> *)map;
>> +
>> + /* Wait for a grace period of RCU. Then, post the map_free
>> + * work to the system_unbound_wq workqueue to free resources.
>> + *
>> + * The struct_ops's function may switch to another struct_ops.
>> + *
>> + * For example, bpf_tcp_cc_x->init() may switch to
>> + * another tcp_cc_y by calling
>> + * setsockopt(TCP_CONGESTION, "tcp_cc_y").
>> + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
>> + * and its refcount may reach 0 which then free its
>> + * trampoline image while tcp_cc_x is still running.
>
> "is still running" where? Why existing deferred work doesn't protect
> against this condition?
bpf_module_put() is protected by rcu_read_lock() alone. As the comment
mentions, setsockopt() can lead to a switch to another algorithm. It is
exposed to the BPF world as bpf_setsockopt(), so it could potentially
call bpf_struct_ops_put() while a BPF program is running. In order to
guarantee we don't free the trampoline while it is still running, we
have to wait for a grace period.
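Roughly, the two sides look like this (a condensed sketch; everything
except bpf_struct_ops_put(), bpf_map_put() and call_rcu() is
illustrative):

/* Reader: a BPF program still executing tcp_cc_x's trampoline. */
rcu_read_lock();
	/* bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, "tcp_cc_y", ...)
	 * -> bpf_struct_ops_put(tcp_cc_x), possibly the last ref,
	 * while tcp_cc_x's image is still on this CPU's stack.
	 */
rcu_read_unlock();

/* Freeing side: the last put must therefore wait for a grace period
 * before the image can go away.
 */
bpf_map_put(&st_map->map)
  -> ops->map_free_rcu(map)
       -> call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);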
>
>> + *
>> + * Thus, a rcu grace period is needed here.
>> + */
>> + call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);
>> +}
>> +
>> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>> {
>> if (attr->key_size != sizeof(unsigned int) || attr->max_entries
>> != 1 ||
>> @@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
>> .map_alloc_check = bpf_struct_ops_map_alloc_check,
>> .map_alloc = bpf_struct_ops_map_alloc,
>
> [..]
>
>> .map_free = bpf_struct_ops_map_free,
>
> Since we have map_free_rcu check in bpf_map_put, does it mean the above
> is not needed?
It will be called by bpf_map_free_deferred().
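To spell out the whole free path after this patch:

bpf_map_put(map)			/* refcnt drops to zero */
  -> map->ops->map_free_rcu(map)	/* struct_ops only */
       -> call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq)
	    -> INIT_WORK() + queue_work(system_unbound_wq, ...)
		 -> bpf_map_free_deferred(work)
		      -> map->ops->map_free(map)	/* still needed */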
>
>> + .map_free_rcu = bpf_struct_ops_map_free_rcu,
>> .map_get_next_key = bpf_struct_ops_map_get_next_key,
>> .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
>> .map_delete_elem = bpf_struct_ops_map_delete_elem,
>> @@ -660,41 +694,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
>> bool bpf_struct_ops_get(const void *kdata)
>> {
>> struct bpf_struct_ops_value *kvalue;
>> + struct bpf_struct_ops_map *st_map;
>> + struct bpf_map *map;
>
>> kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
>> + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
>
>> - return refcount_inc_not_zero(&kvalue->refcnt);
>> -}
>> -
>> -static void bpf_struct_ops_put_rcu(struct rcu_head *head)
>> -{
>> - struct bpf_struct_ops_map *st_map;
>> -
>> - st_map = container_of(head, struct bpf_struct_ops_map, rcu);
>> - bpf_map_put(&st_map->map);
>> + map = __bpf_map_inc_not_zero(&st_map->map, false);
>> + return !IS_ERR(map);
>> }
>
>> void bpf_struct_ops_put(const void *kdata)
>> {
>> struct bpf_struct_ops_value *kvalue;
>> + struct bpf_struct_ops_map *st_map;
>
>> kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
>> - if (refcount_dec_and_test(&kvalue->refcnt)) {
>> - struct bpf_struct_ops_map *st_map;
>> -
>> - st_map = container_of(kvalue, struct bpf_struct_ops_map,
>> - kvalue);
>> - /* The struct_ops's function may switch to another struct_ops.
>> - *
>> - * For example, bpf_tcp_cc_x->init() may switch to
>> - * another tcp_cc_y by calling
>> - * setsockopt(TCP_CONGESTION, "tcp_cc_y").
>> - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
>> - * and its map->refcnt may reach 0 which then free its
>> - * trampoline image while tcp_cc_x is still running.
>> - *
>> - * Thus, a rcu grace period is needed here.
>> - */
>> - call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
>> - }
>> + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
>> +
>> + bpf_map_put(&st_map->map);
>> }
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index cda8d00f3762..358a0e40555e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -684,7 +684,7 @@ void bpf_obj_free_fields(const struct btf_record
>> *rec, void *obj)
>> }
>
>> /* called from workqueue */
>> -static void bpf_map_free_deferred(struct work_struct *work)
>> +void bpf_map_free_deferred(struct work_struct *work)
>> {
>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>> struct btf_field_offs *foffs = map->field_offs;
>> @@ -715,6 +715,15 @@ static void bpf_map_put_uref(struct bpf_map *map)
>> }
>> }
>
>> +static void bpf_map_put_wq(struct bpf_map *map)
>> +{
>> + INIT_WORK(&map->work, bpf_map_free_deferred);
>> + /* Avoid spawning kworkers, since they all might contend
>> + * for the same mutex like slab_mutex.
>> + */
>> + queue_work(system_unbound_wq, &map->work);
>> +}
>> +
>> /* decrement map refcnt and schedule it for freeing via workqueue
>> * (underlying map implementation ops->map_free() might sleep)
>> */
>> @@ -724,11 +733,10 @@ void bpf_map_put(struct bpf_map *map)
>> /* bpf_map_free_id() must be called first */
>> bpf_map_free_id(map);
>> btf_put(map->btf);
>> - INIT_WORK(&map->work, bpf_map_free_deferred);
>> - /* Avoid spawning kworkers, since they all might contend
>> - * for the same mutex like slab_mutex.
>> - */
>> - queue_work(system_unbound_wq, &map->work);
>> + if (map->ops->map_free_rcu)
>> + map->ops->map_free_rcu(map);
>> + else
>> + bpf_map_put_wq(map);
>> }
>> }
>> EXPORT_SYMBOL_GPL(bpf_map_put);
>> @@ -1276,7 +1284,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
>> }
>
>> /* map_idr_lock should have been held */
>> -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
>> bool uref)
>> +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
>> {
>> int refold;
>
>> --
>> 2.30.2
>
* Re: [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-03 1:21 ` [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly Kui-Feng Lee
2023-03-06 20:10 ` Stanislav Fomichev
@ 2023-03-06 23:16 ` Martin KaFai Lau
2023-03-06 23:54 ` Kui-Feng Lee
1 sibling, 1 reply; 20+ messages in thread
From: Martin KaFai Lau @ 2023-03-06 23:16 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
> The refcount of the kvalue for struct_ops was quite intricate to keep
> track of. By no longer utilizing it and replacing it with the refcount
> from the struct_ops map, this process became more transparent and
> uncomplicated.
The patch's subject is not very clear. Maybe 'Retire the struct_ops map
kvalue->refcnt' better reflects what the patch is doing?
The commit message also needs details on the major change and the reason for the
change, e.g. why freeing the struct_ops map needs to go through an RCU grace
period, which is the reason for the RCU-related changes in this patch, and
why retiring kvalue->refcnt is needed for (or can simplify?) the later patches.
> @@ -261,13 +264,13 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
> return 0;
> }
>
> - /* No lock is needed. state and refcnt do not need
> - * to be updated together under atomic context.
> - */
This comment is still valid in this patch?
> uvalue = value;
> memcpy(uvalue, st_map->uvalue, map->value_size);
> uvalue->state = state;
> - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
> +
> + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
> + refcount_set(&uvalue->refcnt,
> + refcnt > 0 ? refcnt : 0);
nit. max_t().
It also needs comment on why it will work or at least good enough.
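i.e. something like:

	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));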
>
> return 0;
> }
> @@ -491,7 +494,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
> *(unsigned long *)(udata + moff) = prog->aux->id;
> }
>
> - refcount_set(&kvalue->refcnt, 1);
> bpf_map_inc(map);
>
> set_memory_rox((long)st_map->image, 1);
> @@ -536,8 +538,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
> switch (prev_state) {
> case BPF_STRUCT_OPS_STATE_INUSE:
> st_map->st_ops->unreg(&st_map->kvalue.data);
> - if (refcount_dec_and_test(&st_map->kvalue.refcnt))
> - bpf_map_put(map);
> + bpf_map_put(map);
> return 0;
> case BPF_STRUCT_OPS_STATE_TOBEFREE:
> return -EINPROGRESS;
> @@ -582,6 +583,38 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
> bpf_map_area_free(st_map);
> }
>
> +static void bpf_struct_ops_map_free_wq(struct rcu_head *head)
> +{
> + struct bpf_struct_ops_map *st_map;
> +
> + st_map = container_of(head, struct bpf_struct_ops_map, rcu);
> +
> + /* bpf_map_free_deferred should not be called in a RCU callback. */
> + INIT_WORK(&st_map->map.work, bpf_map_free_deferred);
> + queue_work(system_unbound_wq, &st_map->map.work);
> +}
> +
> +static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
> +{
> + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
> +
> + /* Wait for a grace period of RCU. Then, post the map_free
> + * work to the system_unbound_wq workqueue to free resources.
> + *
> + * The struct_ops's function may switch to another struct_ops.
> + *
> + * For example, bpf_tcp_cc_x->init() may switch to
> + * another tcp_cc_y by calling
> + * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> + * and its refcount may reach 0 which then free its
> + * trampoline image while tcp_cc_x is still running.
> + *
> + * Thus, a rcu grace period is needed here.
> + */
> + call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);
> +}
> +
> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
> {
> if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> @@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
> .map_alloc_check = bpf_struct_ops_map_alloc_check,
> .map_alloc = bpf_struct_ops_map_alloc,
> .map_free = bpf_struct_ops_map_free,
> + .map_free_rcu = bpf_struct_ops_map_free_rcu,
just came to my mind. Instead of having a rcu callback, synchronize_rcu() can be
called in bpf_struct_ops_map_free(). Then the '.map_free_rcu' addition and its
related change is not needed.
* Re: [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-06 23:16 ` Martin KaFai Lau
@ 2023-03-06 23:54 ` Kui-Feng Lee
2023-03-07 0:36 ` Martin KaFai Lau
0 siblings, 1 reply; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-06 23:54 UTC (permalink / raw)
To: Martin KaFai Lau, Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/6/23 15:16, Martin KaFai Lau wrote:
> On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
>> The refcount of the kvalue for struct_ops was quite intricate to keep
>> track of. By no longer utilizing it and replacing it with the refcount
>> from the struct_ops map, this process became more transparent and
>> uncomplicated.
>
> The patch's subject is not very clear. Maybe 'Retire the struct_ops map
> kvalue->refcnt' better reflects what the patch is doing?
>
> The commit message also needs details on the major change and the reason
> for the change, e.g. why freeing the struct_ops map needs to go through
> an RCU grace period, which is the reason for the RCU-related changes in
> this patch, and why retiring kvalue->refcnt is needed for (or can
> simplify?) the later patches.
Sure!
>
>> @@ -261,13 +264,13 @@ int bpf_struct_ops_map_sys_lookup_elem(struct
>> bpf_map *map, void *key,
>> return 0;
>> }
>> - /* No lock is needed. state and refcnt do not need
>> - * to be updated together under atomic context.
>> - */
>
> This comment is still valid in this patch?
>
>> uvalue = value;
>> memcpy(uvalue, st_map->uvalue, map->value_size);
>> uvalue->state = state;
>> - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
>> +
>> + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
>> + refcount_set(&uvalue->refcnt,
>> + refcnt > 0 ? refcnt : 0);
>
> nit. max_t().
>
> It also needs comment on why it will work or at least good enough.
Got it.
>
>> return 0;
>> }
>> @@ -491,7 +494,6 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> *(unsigned long *)(udata + moff) = prog->aux->id;
>> }
>> - refcount_set(&kvalue->refcnt, 1);
>> bpf_map_inc(map);
>> set_memory_rox((long)st_map->image, 1);
>> @@ -536,8 +538,7 @@ static int bpf_struct_ops_map_delete_elem(struct
>> bpf_map *map, void *key)
>> switch (prev_state) {
>> case BPF_STRUCT_OPS_STATE_INUSE:
>> st_map->st_ops->unreg(&st_map->kvalue.data);
>> - if (refcount_dec_and_test(&st_map->kvalue.refcnt))
>> - bpf_map_put(map);
>> + bpf_map_put(map);
>> return 0;
>> case BPF_STRUCT_OPS_STATE_TOBEFREE:
>> return -EINPROGRESS;
>> @@ -582,6 +583,38 @@ static void bpf_struct_ops_map_free(struct
>> bpf_map *map)
>> bpf_map_area_free(st_map);
>> }
>> +static void bpf_struct_ops_map_free_wq(struct rcu_head *head)
>> +{
>> + struct bpf_struct_ops_map *st_map;
>> +
>> + st_map = container_of(head, struct bpf_struct_ops_map, rcu);
>> +
>> + /* bpf_map_free_deferred should not be called in a RCU callback. */
>> + INIT_WORK(&st_map->map.work, bpf_map_free_deferred);
>> + queue_work(system_unbound_wq, &st_map->map.work);
>> +}
>> +
>> +static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
>> +{
>> + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map
>> *)map;
>> +
>> + /* Wait for a grace period of RCU. Then, post the map_free
>> + * work to the system_unbound_wq workqueue to free resources.
>> + *
>> + * The struct_ops's function may switch to another struct_ops.
>> + *
>> + * For example, bpf_tcp_cc_x->init() may switch to
>> + * another tcp_cc_y by calling
>> + * setsockopt(TCP_CONGESTION, "tcp_cc_y").
>> + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
>> + * and its refcount may reach 0 which then free its
>> + * trampoline image while tcp_cc_x is still running.
>> + *
>> + * Thus, a rcu grace period is needed here.
>> + */
>> + call_rcu(&st_map->rcu, bpf_struct_ops_map_free_wq);
>> +}
>> +
>> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>> {
>> if (attr->key_size != sizeof(unsigned int) || attr->max_entries
>> != 1 ||
>> @@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
>> .map_alloc_check = bpf_struct_ops_map_alloc_check,
>> .map_alloc = bpf_struct_ops_map_alloc,
>> .map_free = bpf_struct_ops_map_free,
>> + .map_free_rcu = bpf_struct_ops_map_free_rcu,
>
> just came to my mind. Instead of having a rcu callback,
> synchronize_rcu() can be called in bpf_struct_ops_map_free(). Then the
> '.map_free_rcu' addition and its related change is not needed.
>
synchronize_rcu() probably blocks other subsystems, right?
* Re: [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly.
2023-03-06 23:54 ` Kui-Feng Lee
@ 2023-03-07 0:36 ` Martin KaFai Lau
0 siblings, 0 replies; 20+ messages in thread
From: Martin KaFai Lau @ 2023-03-07 0:36 UTC (permalink / raw)
To: Kui-Feng Lee, Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/6/23 3:54 PM, Kui-Feng Lee wrote:
>>> @@ -646,6 +679,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
>>> .map_alloc_check = bpf_struct_ops_map_alloc_check,
>>> .map_alloc = bpf_struct_ops_map_alloc,
>>> .map_free = bpf_struct_ops_map_free,
>>> + .map_free_rcu = bpf_struct_ops_map_free_rcu,
>>
>> just came to my mind. Instead of having a rcu callback, synchronize_rcu() can
>> be called in bpf_struct_ops_map_free(). Then the '.map_free_rcu' addition and
>> its related change is not needed.
>>
>
> synchronize_rcu() probably blocks other subsystems, right?
.map_free is called from system_unbound_wq, so it can block.
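So the alternative would be roughly (an untested sketch; the existing
teardown body stays as-is):

static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	/* Safe to wait here: .map_free runs from system_unbound_wq,
	 * not from an RCU callback, so blocking is fine.
	 */
	synchronize_rcu();

	/* ... existing teardown of progs, image and uvalue ... */
	bpf_map_area_free(st_map);
}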
* [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-06 20:23 ` Stanislav Fomichev
2023-03-07 2:11 ` Martin KaFai Lau
2023-03-03 1:21 ` [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
` (5 subsequent siblings)
7 siblings, 2 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
BPF struct_ops maps are employed directly to register TCP Congestion
Control algorithms, unlike other BPF programs, which terminate when
their links are gone. The link of a BPF struct_ops map provides a
uniform experience akin to other types of BPF programs.
bpf_links are responsible for registering their associated
struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
set to create a bpf_link, while a struct_ops without this flag behaves in
the same manner as before and is registered upon updating its value.
The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops themselves. Since the links of BPF
struct_ops programs are only used to create trampolines internally,
they are never seen in other contexts. Thus, they can be reused for
struct_ops themselves.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 11 +++
include/uapi/linux/bpf.h | 12 +++-
kernel/bpf/bpf_struct_ops.c | 118 +++++++++++++++++++++++++++++++--
kernel/bpf/syscall.c | 26 +++++---
tools/include/uapi/linux/bpf.h | 12 +++-
5 files changed, 164 insertions(+), 15 deletions(-)
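For reference, the intended user-space flow (a minimal sketch against
the raw syscall; it assumes uapi headers regenerated from this patch,
and a map_fd created with BPF_F_LINK whose value has already been
updated):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int struct_ops_link_create(int map_fd)
{
	union bpf_attr attr = {};

	attr.link_create.map_fd = map_fd;	/* new union member */
	attr.link_create.attach_type = BPF_STRUCT_OPS;

	/* Fails with -EINVAL unless the map was created with
	 * BPF_F_LINK and has reached BPF_STRUCT_OPS_STATE_READY.
	 */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}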
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cb837f42b99d..b845be719422 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1396,6 +1396,11 @@ struct bpf_link {
struct work_struct work;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
struct bpf_link_ops {
void (*release)(struct bpf_link *link);
void (*dealloc)(struct bpf_link *link);
@@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
+int bpf_struct_ops_link_create(union bpf_attr *attr);
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
int bpf_obj_get_user(const char __user *pathname, int flags);
@@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link *link)
{
}
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
static inline int bpf_obj_get_user(const char __user *pathname, int flags)
{
return -EOPNOTSUPP;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 17afd2b35ee5..cd0ff39981e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* eBPF struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index bba03b6b010b..9ec675576d97 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -14,6 +14,7 @@
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
+ BPF_STRUCT_OPS_STATE_READY,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
};
@@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- bpf_map_inc(map);
-
set_memory_rox((long)st_map->image, 1);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
+
err = st_ops->reg(kdata);
if (likely(!err)) {
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
+ case BPF_STRUCT_OPS_STATE_READY:
+ return -EOPNOTSUPP;
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
bpf_map_put(map);
@@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ if (st_link->map) {
+ st_map = (struct bpf_struct_ops_map *)st_link->map;
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(st_link->map);
+ }
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock_trace();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock_trace();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock_trace();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock_trace();
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (!map)
+ return -EINVAL;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+ link->map = map;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto err_out;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 358a0e40555e..3db4938212d6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
static void bpf_link_free(struct bpf_link *link)
{
bpf_link_free_id(link->id);
+ /* detach BPF program, clean up used resources */
if (link->prog) {
- /* detach BPF program, clean up used resources */
link->ops->release(link);
bpf_prog_put(link->prog);
+ /* The struct_ops links clean up the map by themselves. */
}
/* free bpf_link and its containing memory */
link->ops->dealloc(link);
@@ -2802,16 +2803,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -4286,7 +4290,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4549,6 +4554,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 17afd2b35ee5..cd0ff39981e8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregistered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* eBPF struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
--
2.30.2
* Re: [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps.
2023-03-03 1:21 ` [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
@ 2023-03-06 20:23 ` Stanislav Fomichev
2023-03-06 22:02 ` Kui-Feng Lee
2023-03-07 2:11 ` Martin KaFai Lau
1 sibling, 1 reply; 20+ messages in thread
From: Stanislav Fomichev @ 2023-03-06 20:23 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii
On 03/02, Kui-Feng Lee wrote:
> BPF struct_ops maps are employed directly to register TCP Congestion
> Control algorithms, unlike other BPF programs, which terminate when
> their links are gone. The link of a BPF struct_ops map provides a
> uniform experience akin to other types of BPF programs.
> bpf_links are responsible for registering their associated
> struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
> set to create a bpf_link, while a struct_ops without this flag behaves in
> the same manner as before and is registered upon updating its value.
> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
> used to craft the links for BPF struct_ops programs, but also to
> create links for BPF struct_ops themselves. Since the links of BPF
> struct_ops programs are only used to create trampolines internally,
> they are never seen in other contexts. Thus, they can be reused for
> struct_ops themselves.
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> include/linux/bpf.h | 11 +++
> include/uapi/linux/bpf.h | 12 +++-
> kernel/bpf/bpf_struct_ops.c | 118 +++++++++++++++++++++++++++++++--
> kernel/bpf/syscall.c | 26 +++++---
> tools/include/uapi/linux/bpf.h | 12 +++-
> 5 files changed, 164 insertions(+), 15 deletions(-)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cb837f42b99d..b845be719422 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1396,6 +1396,11 @@ struct bpf_link {
> struct work_struct work;
> };
> +struct bpf_struct_ops_link {
> + struct bpf_link link;
> + struct bpf_map __rcu *map;
> +};
> +
> struct bpf_link_ops {
> void (*release)(struct bpf_link *link);
> void (*dealloc)(struct bpf_link *link);
> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
> struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
> struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
> +int bpf_struct_ops_link_create(union bpf_attr *attr);
> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> int bpf_obj_get_user(const char __user *pathname, int flags);
> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link
> *link)
> {
> }
> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> static inline int bpf_obj_get_user(const char __user *pathname, int
> flags)
> {
> return -EOPNOTSUPP;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
> BPF_PERF_EVENT,
> BPF_TRACE_KPROBE_MULTI,
> BPF_LSM_CGROUP,
> + BPF_STRUCT_OPS,
> __MAX_BPF_ATTACH_TYPE
> };
> @@ -1266,6 +1267,9 @@ enum {
> /* Create a map that is suitable to be an inner map with dynamic max
> entries */
> BPF_F_INNER_MAP = (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backed
> bpf_link */
> + BPF_F_LINK = (1U << 13),
> };
> /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
> } task_fd_query;
> struct { /* struct used by BPF_LINK_CREATE command */
> - __u32 prog_fd; /* eBPF program to attach */
> + union {
> + __u32 prog_fd; /* eBPF program to attach */
> + __u32 map_fd; /* eBPF struct_ops to attach */
> + };
> union {
> __u32 target_fd; /* object to attach to */
> __u32 target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
> struct {
> __u32 ifindex;
> } xdp;
> + struct {
> + __u32 map_id;
> + } struct_ops;
> };
> } __attribute__((aligned(8)));
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index bba03b6b010b..9ec675576d97 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -14,6 +14,7 @@
> enum bpf_struct_ops_state {
> BPF_STRUCT_OPS_STATE_INIT,
> + BPF_STRUCT_OPS_STATE_READY,
> BPF_STRUCT_OPS_STATE_INUSE,
> BPF_STRUCT_OPS_STATE_TOBEFREE,
> };
> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct
> bpf_map *map, void *key,
> *(unsigned long *)(udata + moff) = prog->aux->id;
> }
> - bpf_map_inc(map);
> -
> set_memory_rox((long)st_map->image, 1);
> + if (st_map->map.map_flags & BPF_F_LINK) {
> + /* Let bpf_link handle registration & unregistration.
> + *
> + * Pair with smp_load_acquire() during lookup_elem().
> + */
> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
Any reason not to create a link here and return to the user? (without
extra link_create API)
> + goto unlock;
> + }
> +
> err = st_ops->reg(kdata);
> if (likely(!err)) {
> + bpf_map_inc(map);
> /* Pair with smp_load_acquire() during lookup_elem().
> * It ensures the above udata updates (e.g. prog->aux->id)
> * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct
> bpf_map *map, void *key,
> */
> set_memory_nx((long)st_map->image, 1);
> set_memory_rw((long)st_map->image, 1);
> - bpf_map_put(map);
> reset_unlock:
> bpf_struct_ops_map_put_progs(st_map);
> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct
> bpf_map *map, void *key)
> struct bpf_struct_ops_map *st_map;
> st_map = (struct bpf_struct_ops_map *)map;
> + if (st_map->map.map_flags & BPF_F_LINK)
> + return -EOPNOTSUPP;
> +
> prev_state = cmpxchg(&st_map->kvalue.state,
> BPF_STRUCT_OPS_STATE_INUSE,
> BPF_STRUCT_OPS_STATE_TOBEFREE);
> switch (prev_state) {
> + case BPF_STRUCT_OPS_STATE_READY:
> + return -EOPNOTSUPP;
> case BPF_STRUCT_OPS_STATE_INUSE:
> st_map->st_ops->unreg(&st_map->kvalue.data);
> bpf_map_put(map);
> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
> {
> if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> - attr->map_flags || !attr->btf_vmlinux_value_type_id)
> + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
> return -EINVAL;
> return 0;
> }
> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
> bpf_map_put(&st_map->map);
> }
> +
> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_struct_ops_map *st_map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + if (st_link->map) {
> + st_map = (struct bpf_struct_ops_map *)st_link->map;
> + st_map->st_ops->unreg(&st_map->kvalue.data);
> + bpf_map_put(st_link->map);
> + }
> + kfree(st_link);
> +}
> +
> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
> + struct seq_file *seq)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_map *map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + rcu_read_lock_trace();
> + map = rcu_dereference(st_link->map);
> + if (map)
> + seq_printf(seq, "map_id:\t%d\n", map->id);
> + rcu_read_unlock_trace();
> +}
> +
> +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
> + struct bpf_link_info *info)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_map *map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + rcu_read_lock_trace();
> + map = rcu_dereference(st_link->map);
> + if (map)
> + info->struct_ops.map_id = map->id;
> + rcu_read_unlock_trace();
> + return 0;
> +}
> +
> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
> + .dealloc = bpf_struct_ops_map_link_dealloc,
> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
> +};
> +
> +int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> + struct bpf_struct_ops_link *link = NULL;
> + struct bpf_link_primer link_primer;
> + struct bpf_struct_ops_map *st_map;
> + struct bpf_map *map;
> + int err;
> +
> + map = bpf_map_get(attr->link_create.map_fd);
> + if (!map)
> + return -EINVAL;
> +
> + st_map = (struct bpf_struct_ops_map *)map;
> +
> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
> + /* Pair with smp_store_release() during map_update */
> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
> + err = -EINVAL;
> + goto err_out;
> + }
> +
> + link = kzalloc(sizeof(*link), GFP_USER);
> + if (!link) {
> + err = -ENOMEM;
> + goto err_out;
> + }
> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
> + link->map = map;
> +
> + err = bpf_link_prime(&link->link, &link_primer);
> + if (err)
> + goto err_out;
> +
> + err = st_map->st_ops->reg(st_map->kvalue.data);
> + if (err) {
> + bpf_link_cleanup(&link_primer);
> + goto err_out;
> + }
> +
> + return bpf_link_settle(&link_primer);
> +
> +err_out:
> + bpf_map_put(map);
> + kfree(link);
> + return err;
> +}
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 358a0e40555e..3db4938212d6 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
> static void bpf_link_free(struct bpf_link *link)
> {
> bpf_link_free_id(link->id);
> + /* detach BPF program, clean up used resources */
> if (link->prog) {
> - /* detach BPF program, clean up used resources */
> link->ops->release(link);
> bpf_prog_put(link->prog);
> + /* The struct_ops links clean up the map by themselves. */
> }
> /* free bpf_link and its containing memory */
> link->ops->dealloc(link);
> @@ -2802,16 +2803,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
> const struct bpf_prog *prog = link->prog;
> char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
> - bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
> seq_printf(m,
> "link_type:\t%s\n"
> - "link_id:\t%u\n"
> - "prog_tag:\t%s\n"
> - "prog_id:\t%u\n",
> + "link_id:\t%u\n",
> bpf_link_type_strs[link->type],
> - link->id,
> - prog_tag,
> - prog->aux->id);
> + link->id);
> + if (prog) {
> + bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
> + seq_printf(m,
> + "prog_tag:\t%s\n"
> + "prog_id:\t%u\n",
> + prog_tag,
> + prog->aux->id);
> + }
> if (link->ops->show_fdinfo)
> link->ops->show_fdinfo(link, m);
> }
> @@ -4286,7 +4290,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
> info.type = link->type;
> info.id = link->id;
> - info.prog_id = link->prog->aux->id;
> + if (link->prog)
> + info.prog_id = link->prog->aux->id;
> if (link->ops->fill_link_info) {
> err = link->ops->fill_link_info(link, &info);
> @@ -4549,6 +4554,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
> if (CHECK_ATTR(BPF_LINK_CREATE))
> return -EINVAL;
> + if (attr->link_create.attach_type == BPF_STRUCT_OPS)
> + return bpf_struct_ops_link_create(attr);
> +
> prog = bpf_prog_get(attr->link_create.prog_fd);
> if (IS_ERR(prog))
> return PTR_ERR(prog);
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
> BPF_PERF_EVENT,
> BPF_TRACE_KPROBE_MULTI,
> BPF_LSM_CGROUP,
> + BPF_STRUCT_OPS,
> __MAX_BPF_ATTACH_TYPE
> };
> @@ -1266,6 +1267,9 @@ enum {
> /* Create a map that is suitable to be an inner map with dynamic max entries */
> BPF_F_INNER_MAP = (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backing bpf_link */
> + BPF_F_LINK = (1U << 13),
> };
> /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
> } task_fd_query;
> struct { /* struct used by BPF_LINK_CREATE command */
> - __u32 prog_fd; /* eBPF program to attach */
> + union {
> + __u32 prog_fd; /* eBPF program to attach */
> + __u32 map_fd; /* eBPF struct_ops to attach */
> + };
> union {
> __u32 target_fd; /* object to attach to */
> __u32 target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
> struct {
> __u32 ifindex;
> } xdp;
> + struct {
> + __u32 map_id;
> + } struct_ops;
> };
> } __attribute__((aligned(8)));
> --
> 2.30.2
* Re: [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps.
2023-03-06 20:23 ` Stanislav Fomichev
@ 2023-03-06 22:02 ` Kui-Feng Lee
0 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-06 22:02 UTC (permalink / raw)
To: Stanislav Fomichev, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii
On 3/6/23 12:23, Stanislav Fomichev wrote:
> On 03/02, Kui-Feng Lee wrote:
>> BPF struct_ops maps are employed directly to register TCP Congestion
>> Control algorithms. Unlike other BPF programs, which terminate when
>> their links are gone, the link of a BPF struct_ops map provides a
>> uniform experience akin to other types of BPF programs.
>
>> bpf_links are responsible for registering their associated
>> struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
>> set to create a bpf_link, while a struct_ops without this flag behaves in
>> the same manner as before and is registered upon updating its value.
>
>> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
>> used to craft the links for BPF struct_ops programs, but also to
>> create links for BPF struct_ops themselves. Since the links of BPF
>> struct_ops programs are only used to create trampolines internally,
>> they are never seen in other contexts. Thus, they can be reused for
>> struct_ops themselves.
>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> include/linux/bpf.h | 11 +++
>> include/uapi/linux/bpf.h | 12 +++-
>> kernel/bpf/bpf_struct_ops.c | 118 +++++++++++++++++++++++++++++++--
>> kernel/bpf/syscall.c | 26 +++++---
>> tools/include/uapi/linux/bpf.h | 12 +++-
>> 5 files changed, 164 insertions(+), 15 deletions(-)
>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index cb837f42b99d..b845be719422 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1396,6 +1396,11 @@ struct bpf_link {
>> struct work_struct work;
>> };
>
>> +struct bpf_struct_ops_link {
>> + struct bpf_link link;
>> + struct bpf_map __rcu *map;
>> +};
>> +
>> struct bpf_link_ops {
>> void (*release)(struct bpf_link *link);
>> void (*dealloc)(struct bpf_link *link);
>> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>> struct file *bpf_link_new_file(struct bpf_link *link, int
>> *reserved_fd);
>> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>> struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
>> +int bpf_struct_ops_link_create(union bpf_attr *attr);
>
>> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>> int bpf_obj_get_user(const char __user *pathname, int flags);
>> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link
>> *link)
>> {
>> }
>
>> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> + return -EOPNOTSUPP;
>> +}
>> +
>> static inline int bpf_obj_get_user(const char __user *pathname, int
>> flags)
>> {
>> return -EOPNOTSUPP;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>> BPF_PERF_EVENT,
>> BPF_TRACE_KPROBE_MULTI,
>> BPF_LSM_CGROUP,
>> + BPF_STRUCT_OPS,
>> __MAX_BPF_ATTACH_TYPE
>> };
>
>> @@ -1266,6 +1267,9 @@ enum {
>
>> /* Create a map that is suitable to be an inner map with dynamic max
>> entries */
>> BPF_F_INNER_MAP = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> + BPF_F_LINK = (1U << 13),
>> };
>
>> /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>> } task_fd_query;
>
>> struct { /* struct used by BPF_LINK_CREATE command */
>> - __u32 prog_fd; /* eBPF program to attach */
>> + union {
>> + __u32 prog_fd; /* eBPF program to attach */
>> + __u32 map_fd; /* eBPF struct_ops to attach */
>> + };
>> union {
>> __u32 target_fd; /* object to attach to */
>> __u32 target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>> struct {
>> __u32 ifindex;
>> } xdp;
>> + struct {
>> + __u32 map_id;
>> + } struct_ops;
>> };
>> } __attribute__((aligned(8)));
>
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index bba03b6b010b..9ec675576d97 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -14,6 +14,7 @@
>
>> enum bpf_struct_ops_state {
>> BPF_STRUCT_OPS_STATE_INIT,
>> + BPF_STRUCT_OPS_STATE_READY,
>> BPF_STRUCT_OPS_STATE_INUSE,
>> BPF_STRUCT_OPS_STATE_TOBEFREE,
>> };
>> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> *(unsigned long *)(udata + moff) = prog->aux->id;
>> }
>
>> - bpf_map_inc(map);
>> -
>> set_memory_rox((long)st_map->image, 1);
>> + if (st_map->map.map_flags & BPF_F_LINK) {
>> + /* Let bpf_link handle registration & unregistration.
>> + *
>> + * Pair with smp_load_acquire() during lookup_elem().
>> + */
>> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
>
> Any reason not to create a link here and return to the user? (without
> extra link_create API)
>
We want the ability to substitute an existing struct_ops with a
different one. If we establish a link here, that implies we can't load
multiple struct_ops and switch between them since the struct_ops will be
registered instantly. Likewise, bpf_map_update_elem() is not supposed to
generate a new FD.
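In other words, the intended flow is two separate steps. A hedged
userspace sketch, using only calls this series itself makes
(map_fd_a/map_fd_b are two loaded struct_ops map fds, kern_vdata_* the
prepared struct_ops values; error handling elided):

#include <bpf/bpf.h>

static int load_two_pick_one(int map_fd_a, int map_fd_b,
			     void *kern_vdata_a, void *kern_vdata_b)
{
	__u32 zero = 0;

	/* With BPF_F_LINK, update_elem() only moves the kvalue to
	 * BPF_STRUCT_OPS_STATE_READY; nothing is registered yet, so
	 * several struct_ops can be loaded up front.
	 */
	bpf_map_update_elem(map_fd_a, &zero, kern_vdata_a, 0);
	bpf_map_update_elem(map_fd_b, &zero, kern_vdata_b, 0);

	/* Registration happens only here, for the chosen map; the
	 * returned link fd, not the map fd, owns the registration.
	 */
	return bpf_link_create(map_fd_a, -1, BPF_STRUCT_OPS, NULL);
}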
>> + goto unlock;
>> + }
>> +
>> err = st_ops->reg(kdata);
>> if (likely(!err)) {
>> + bpf_map_inc(map);
>> /* Pair with smp_load_acquire() during lookup_elem().
>> * It ensures the above udata updates (e.g. prog->aux->id)
>> * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
>> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> */
>> set_memory_nx((long)st_map->image, 1);
>> set_memory_rw((long)st_map->image, 1);
>> - bpf_map_put(map);
>
>> reset_unlock:
>> bpf_struct_ops_map_put_progs(st_map);
>> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct
>> bpf_map *map, void *key)
>> struct bpf_struct_ops_map *st_map;
>
>> st_map = (struct bpf_struct_ops_map *)map;
>> + if (st_map->map.map_flags & BPF_F_LINK)
>> + return -EOPNOTSUPP;
>> +
>> prev_state = cmpxchg(&st_map->kvalue.state,
>> BPF_STRUCT_OPS_STATE_INUSE,
>> BPF_STRUCT_OPS_STATE_TOBEFREE);
>> switch (prev_state) {
>> + case BPF_STRUCT_OPS_STATE_READY:
>> + return -EOPNOTSUPP;
>> case BPF_STRUCT_OPS_STATE_INUSE:
>> st_map->st_ops->unreg(&st_map->kvalue.data);
>> bpf_map_put(map);
>> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct
>> bpf_map *map)
>> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>> {
>> if (attr->key_size != sizeof(unsigned int) || attr->max_entries
>> != 1 ||
>> - attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> + (attr->map_flags & ~BPF_F_LINK) ||
>> !attr->btf_vmlinux_value_type_id)
>> return -EINVAL;
>> return 0;
>> }
>> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
>
>> bpf_map_put(&st_map->map);
>> }
>> +
>> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_struct_ops_map *st_map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + if (st_link->map) {
>> + st_map = (struct bpf_struct_ops_map *)st_link->map;
>> + st_map->st_ops->unreg(&st_map->kvalue.data);
>> + bpf_map_put(st_link->map);
>> + }
>> + kfree(st_link);
>> +}
>> +
>> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link
>> *link,
>> + struct seq_file *seq)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_map *map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + rcu_read_lock_trace();
>> + map = rcu_dereference(st_link->map);
>> + if (map)
>> + seq_printf(seq, "map_id:\t%d\n", map->id);
>> + rcu_read_unlock_trace();
>> +}
>> +
>> +static int bpf_struct_ops_map_link_fill_link_info(const struct
>> bpf_link *link,
>> + struct bpf_link_info *info)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_map *map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + rcu_read_lock_trace();
>> + map = rcu_dereference(st_link->map);
>> + if (map)
>> + info->struct_ops.map_id = map->id;
>> + rcu_read_unlock_trace();
>> + return 0;
>> +}
>> +
>> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>> + .dealloc = bpf_struct_ops_map_link_dealloc,
>> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
>> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
>> +};
>> +
>> +int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> + struct bpf_struct_ops_link *link = NULL;
>> + struct bpf_link_primer link_primer;
>> + struct bpf_struct_ops_map *st_map;
>> + struct bpf_map *map;
>> + int err;
>> +
>> + map = bpf_map_get(attr->link_create.map_fd);
>> + if (!map)
>> + return -EINVAL;
>> +
>> + st_map = (struct bpf_struct_ops_map *)map;
>> +
>> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags
>> & BPF_F_LINK) ||
>> + /* Pair with smp_store_release() during map_update */
>> + smp_load_acquire(&st_map->kvalue.state) !=
>> BPF_STRUCT_OPS_STATE_READY) {
>> + err = -EINVAL;
>> + goto err_out;
>> + }
>> +
>> + link = kzalloc(sizeof(*link), GFP_USER);
>> + if (!link) {
>> + err = -ENOMEM;
>> + goto err_out;
>> + }
>> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
>> &bpf_struct_ops_map_lops, NULL);
>> + link->map = map;
>> +
>> + err = bpf_link_prime(&link->link, &link_primer);
>> + if (err)
>> + goto err_out;
>> +
>> + err = st_map->st_ops->reg(st_map->kvalue.data);
>> + if (err) {
>> + bpf_link_cleanup(&link_primer);
>> + goto err_out;
>> + }
>> +
>> + return bpf_link_settle(&link_primer);
>> +
>> +err_out:
>> + bpf_map_put(map);
>> + kfree(link);
>> + return err;
>> +}
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 358a0e40555e..3db4938212d6 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>> static void bpf_link_free(struct bpf_link *link)
>> {
>> bpf_link_free_id(link->id);
>> + /* detach BPF program, clean up used resources */
>> if (link->prog) {
>> - /* detach BPF program, clean up used resources */
>> link->ops->release(link);
>> bpf_prog_put(link->prog);
>> + /* The struct_ops links clean up the map by themselves. */
>> }
>> /* free bpf_link and its containing memory */
>> link->ops->dealloc(link);
>> @@ -2802,16 +2803,19 @@ static void bpf_link_show_fdinfo(struct
>> seq_file *m, struct file *filp)
>> const struct bpf_prog *prog = link->prog;
>> char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
>
>> - bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
>> seq_printf(m,
>> "link_type:\t%s\n"
>> - "link_id:\t%u\n"
>> - "prog_tag:\t%s\n"
>> - "prog_id:\t%u\n",
>> + "link_id:\t%u\n",
>> bpf_link_type_strs[link->type],
>> - link->id,
>> - prog_tag,
>> - prog->aux->id);
>> + link->id);
>> + if (prog) {
>> + bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
>> + seq_printf(m,
>> + "prog_tag:\t%s\n"
>> + "prog_id:\t%u\n",
>> + prog_tag,
>> + prog->aux->id);
>> + }
>> if (link->ops->show_fdinfo)
>> link->ops->show_fdinfo(link, m);
>> }
>> @@ -4286,7 +4290,8 @@ static int bpf_link_get_info_by_fd(struct file
>> *file,
>
>> info.type = link->type;
>> info.id = link->id;
>> - info.prog_id = link->prog->aux->id;
>> + if (link->prog)
>> + info.prog_id = link->prog->aux->id;
>
>> if (link->ops->fill_link_info) {
>> err = link->ops->fill_link_info(link, &info);
>> @@ -4549,6 +4554,9 @@ static int link_create(union bpf_attr *attr,
>> bpfptr_t uattr)
>> if (CHECK_ATTR(BPF_LINK_CREATE))
>> return -EINVAL;
>
>> + if (attr->link_create.attach_type == BPF_STRUCT_OPS)
>> + return bpf_struct_ops_link_create(attr);
>> +
>> prog = bpf_prog_get(attr->link_create.prog_fd);
>> if (IS_ERR(prog))
>> return PTR_ERR(prog);
>> diff --git a/tools/include/uapi/linux/bpf.h
>> b/tools/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>> BPF_PERF_EVENT,
>> BPF_TRACE_KPROBE_MULTI,
>> BPF_LSM_CGROUP,
>> + BPF_STRUCT_OPS,
>> __MAX_BPF_ATTACH_TYPE
>> };
>
>> @@ -1266,6 +1267,9 @@ enum {
>
>> /* Create a map that is suitable to be an inner map with dynamic max
>> entries */
>> BPF_F_INNER_MAP = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> + BPF_F_LINK = (1U << 13),
>> };
>
>> /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>> } task_fd_query;
>
>> struct { /* struct used by BPF_LINK_CREATE command */
>> - __u32 prog_fd; /* eBPF program to attach */
>> + union {
>> + __u32 prog_fd; /* eBPF program to attach */
>> + __u32 map_fd; /* eBPF struct_ops to attach */
>> + };
>> union {
>> __u32 target_fd; /* object to attach to */
>> __u32 target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>> struct {
>> __u32 ifindex;
>> } xdp;
>> + struct {
>> + __u32 map_id;
>> + } struct_ops;
>> };
>> } __attribute__((aligned(8)));
>
>> --
>> 2.30.2
>
* Re: [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps.
2023-03-03 1:21 ` [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
2023-03-06 20:23 ` Stanislav Fomichev
@ 2023-03-07 2:11 ` Martin KaFai Lau
2023-03-07 18:04 ` Kui-Feng Lee
1 sibling, 1 reply; 20+ messages in thread
From: Martin KaFai Lau @ 2023-03-07 2:11 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cb837f42b99d..b845be719422 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1396,6 +1396,11 @@ struct bpf_link {
> struct work_struct work;
> };
>
> +struct bpf_struct_ops_link {
> + struct bpf_link link;
> + struct bpf_map __rcu *map;
__rcu is only needed after the link_update change in patch 5?
It is fine to keep it in this patch but please leave a comment in the commit
message.
Does 'struct bpf_struct_ops_link' have to be in bpf.h?
> +};
> +
> struct bpf_link_ops {
> void (*release)(struct bpf_link *link);
> void (*dealloc)(struct bpf_link *link);
> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
> struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
> struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
> +int bpf_struct_ops_link_create(union bpf_attr *attr);
>
> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> int bpf_obj_get_user(const char __user *pathname, int flags);
> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link *link)
> {
> }
>
> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> + return -EOPNOTSUPP;
> +}
Is this currently under '#ifdef CONFIG_BPF_SYSCALL' alone?
Not sure if it is correct. Please double check.
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_SYSCALL) += syscall.o ...
> +
> static inline int bpf_obj_get_user(const char __user *pathname, int flags)
> {
> return -EOPNOTSUPP;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 17afd2b35ee5..cd0ff39981e8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
> BPF_PERF_EVENT,
> BPF_TRACE_KPROBE_MULTI,
> BPF_LSM_CGROUP,
> + BPF_STRUCT_OPS,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -1266,6 +1267,9 @@ enum {
>
> /* Create a map that is suitable to be an inner map with dynamic max entries */
> BPF_F_INNER_MAP = (1U << 12),
> +
> +/* Create a map that will be registered/unregistered by the backing bpf_link */
> + BPF_F_LINK = (1U << 13),
> };
>
> /* Flags for BPF_PROG_QUERY. */
> @@ -1507,7 +1511,10 @@ union bpf_attr {
> } task_fd_query;
>
> struct { /* struct used by BPF_LINK_CREATE command */
> - __u32 prog_fd; /* eBPF program to attach */
> + union {
> + __u32 prog_fd; /* eBPF program to attach */
> + __u32 map_fd; /* eBPF struct_ops to attach */
nit. Remove eBPF. "struct_ops to attach"
> + };
> union {
> __u32 target_fd; /* object to attach to */
> __u32 target_ifindex; /* target ifindex */
> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
> struct {
> __u32 ifindex;
> } xdp;
> + struct {
> + __u32 map_id;
> + } struct_ops;
> };
> } __attribute__((aligned(8)));
>
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index bba03b6b010b..9ec675576d97 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -14,6 +14,7 @@
>
> enum bpf_struct_ops_state {
> BPF_STRUCT_OPS_STATE_INIT,
> + BPF_STRUCT_OPS_STATE_READY,
Please add it to the end. Although it is not in uapi, try not to disrupt the
userspace introspection tool if it does not have to.
> BPF_STRUCT_OPS_STATE_INUSE,
> BPF_STRUCT_OPS_STATE_TOBEFREE,
> };
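That is, presumably something like this (a sketch; only the ordering
changes, so the existing states keep their numeric encoding):

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
	BPF_STRUCT_OPS_STATE_READY,	/* new state appended last */
};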
> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
> *(unsigned long *)(udata + moff) = prog->aux->id;
> }
>
> - bpf_map_inc(map);
> -
> set_memory_rox((long)st_map->image, 1);
> + if (st_map->map.map_flags & BPF_F_LINK) {
> + /* Let bpf_link handle registration & unregistration.
> + *
> + * Pair with smp_load_acquire() during lookup_elem().
> + */
> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
> + goto unlock;
> + }
> +
> err = st_ops->reg(kdata);
> if (likely(!err)) {
> + bpf_map_inc(map);
> /* Pair with smp_load_acquire() during lookup_elem().
> * It ensures the above udata updates (e.g. prog->aux->id)
> * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
> */
> set_memory_nx((long)st_map->image, 1);
> set_memory_rw((long)st_map->image, 1);
> - bpf_map_put(map);
>
> reset_unlock:
> bpf_struct_ops_map_put_progs(st_map);
> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
> struct bpf_struct_ops_map *st_map;
>
> st_map = (struct bpf_struct_ops_map *)map;
> + if (st_map->map.map_flags & BPF_F_LINK)
> + return -EOPNOTSUPP;
> +
> prev_state = cmpxchg(&st_map->kvalue.state,
> BPF_STRUCT_OPS_STATE_INUSE,
> BPF_STRUCT_OPS_STATE_TOBEFREE);
> switch (prev_state) {
> + case BPF_STRUCT_OPS_STATE_READY:
> + return -EOPNOTSUPP;
If this case never happens, this case should be removed. The WARN in the
default case at the end is better handling.
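A hedged sketch of that shape, assuming the returns for the other
states stay as they are today:

	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* READY is unreachable here because of the BPF_F_LINK
		 * check above; treat anything unexpected as not found.
		 */
		return -ENOENT;
	}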
> case BPF_STRUCT_OPS_STATE_INUSE:
> st_map->st_ops->unreg(&st_map->kvalue.data);
> bpf_map_put(map);
> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct bpf_map *map)
> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
> {
> if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
> - attr->map_flags || !attr->btf_vmlinux_value_type_id)
> + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
> return -EINVAL;
> return 0;
> }
> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
>
> bpf_map_put(&st_map->map);
> }
> +
> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_struct_ops_map *st_map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + if (st_link->map) {
Will map ever be NULL?
> + st_map = (struct bpf_struct_ops_map *)st_link->map;
> + st_map->st_ops->unreg(&st_map->kvalue.data);
> + bpf_map_put(st_link->map);
> + }
> + kfree(st_link);
> +}
> +
> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
> + struct seq_file *seq)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_map *map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + rcu_read_lock_trace();
Should it be rcu_read_lock()?
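i.e., presumably the plain flavor (a sketch; assumes nothing in this
read-side section sleeps):

	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	if (map)
		seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();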
> + map = rcu_dereference(st_link->map);
> + if (map)
> + seq_printf(seq, "map_id:\t%d\n", map->id);
> + rcu_read_unlock_trace();
> +}
> +
> +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
> + struct bpf_link_info *info)
> +{
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_map *map;
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> + rcu_read_lock_trace();
> + map = rcu_dereference(st_link->map);
> + if (map)
> + info->struct_ops.map_id = map->id;
> + rcu_read_unlock_trace();
> + return 0;
> +}
> +
> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
> + .dealloc = bpf_struct_ops_map_link_dealloc,
> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
> +};
> +
> +int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> + struct bpf_struct_ops_link *link = NULL;
> + struct bpf_link_primer link_primer;
> + struct bpf_struct_ops_map *st_map;
> + struct bpf_map *map;
> + int err;
> +
> + map = bpf_map_get(attr->link_create.map_fd);
> + if (!map)
> + return -EINVAL;
> +
> + st_map = (struct bpf_struct_ops_map *)map;
> +
> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
> + /* Pair with smp_store_release() during map_update */
> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
> + err = -EINVAL;
> + goto err_out;
> + }
> +
> + link = kzalloc(sizeof(*link), GFP_USER);
> + if (!link) {
> + err = -ENOMEM;
> + goto err_out;
> + }
> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
> + link->map = map;
RCU_INIT_POINTER().
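i.e., presumably:

	/* The link is not visible to any reader yet, so the pointer
	 * can be published without a barrier.
	 */
	RCU_INIT_POINTER(link->map, map);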
> +
> + err = bpf_link_prime(&link->link, &link_primer);
> + if (err)
> + goto err_out;
> +
> + err = st_map->st_ops->reg(st_map->kvalue.data);
> + if (err) {
> + bpf_link_cleanup(&link_primer);
> + goto err_out;
> + }
> +
> + return bpf_link_settle(&link_primer);
> +
> +err_out:
> + bpf_map_put(map);
> + kfree(link);
> + return err;
> +}
> +
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 358a0e40555e..3db4938212d6 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
> static void bpf_link_free(struct bpf_link *link)
> {
> bpf_link_free_id(link->id);
> + /* detach BPF program, clean up used resources */
> if (link->prog) {
> - /* detach BPF program, clean up used resources */
This comment move seems unnecessary.
> link->ops->release(link);
> bpf_prog_put(link->prog);
> + /* The struct_ops links clean up the map by themselves. */
This also seems unnecessary to only spell out for struct_ops link. Each specific
link type does its cleanup in ->dealloc.
* Re: [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps.
2023-03-07 2:11 ` Martin KaFai Lau
@ 2023-03-07 18:04 ` Kui-Feng Lee
0 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 18:04 UTC (permalink / raw)
To: Martin KaFai Lau, Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/6/23 18:11, Martin KaFai Lau wrote:
> On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index cb837f42b99d..b845be719422 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1396,6 +1396,11 @@ struct bpf_link {
>> struct work_struct work;
>> };
>> +struct bpf_struct_ops_link {
>> + struct bpf_link link;
>> + struct bpf_map __rcu *map;
>
> __rcu is only needed after the link_update change in patch 5?
> It is fine to keep it in this patch but please leave a comment in the
> commit message.
>
> Does 'struct bpf_struct_ops_link' have to be in bpf.h?
>
>> +};
>> +
>> struct bpf_link_ops {
>> void (*release)(struct bpf_link *link);
>> void (*dealloc)(struct bpf_link *link);
>> @@ -1964,6 +1969,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>> struct file *bpf_link_new_file(struct bpf_link *link, int
>> *reserved_fd);
>> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>> struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
>> +int bpf_struct_ops_link_create(union bpf_attr *attr);
>> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>> int bpf_obj_get_user(const char __user *pathname, int flags);
>> @@ -2308,6 +2314,11 @@ static inline void bpf_link_put(struct bpf_link
>> *link)
>> {
>> }
>> +static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> + return -EOPNOTSUPP;
>> +}
>
> Is this currently under '#ifdef CONFIG_BPF_SYSCALL' alone?
>
> Not sure if it is correct. Please double check.
>
> ifeq ($(CONFIG_BPF_JIT),y)
> obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
> obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
> obj-${CONFIG_BPF_LSM} += bpf_lsm.o
> endif
>
> obj-$(CONFIG_BPF_SYSCALL) += syscall.o ...
You are right! Should be under CONFIG_BPF_JIT.
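A sketch of what the corrected guard in include/linux/bpf.h could look
like (hedged; the exact condition just has to match when
bpf_struct_ops.o is built per the Makefile above):

#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
int bpf_struct_ops_link_create(union bpf_attr *attr);
#else
static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	return -EOPNOTSUPP;
}
#endif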
>
>> +
>> static inline int bpf_obj_get_user(const char __user *pathname, int
>> flags)
>> {
>> return -EOPNOTSUPP;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 17afd2b35ee5..cd0ff39981e8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1033,6 +1033,7 @@ enum bpf_attach_type {
>> BPF_PERF_EVENT,
>> BPF_TRACE_KPROBE_MULTI,
>> BPF_LSM_CGROUP,
>> + BPF_STRUCT_OPS,
>> __MAX_BPF_ATTACH_TYPE
>> };
>> @@ -1266,6 +1267,9 @@ enum {
>> /* Create a map that is suitable to be an inner map with dynamic max
>> entries */
>> BPF_F_INNER_MAP = (1U << 12),
>> +
>> +/* Create a map that will be registered/unregistered by the backing
>> bpf_link */
>> + BPF_F_LINK = (1U << 13),
>> };
>> /* Flags for BPF_PROG_QUERY. */
>> @@ -1507,7 +1511,10 @@ union bpf_attr {
>> } task_fd_query;
>> struct { /* struct used by BPF_LINK_CREATE command */
>> - __u32 prog_fd; /* eBPF program to attach */
>> + union {
>> + __u32 prog_fd; /* eBPF program to attach */
>> + __u32 map_fd; /* eBPF struct_ops to attach */
>
> nit. Remove eBPF. "struct_ops to attach"
>
>> + };
>> union {
>> __u32 target_fd; /* object to attach to */
>> __u32 target_ifindex; /* target ifindex */
>> @@ -6354,6 +6361,9 @@ struct bpf_link_info {
>> struct {
>> __u32 ifindex;
>> } xdp;
>> + struct {
>> + __u32 map_id;
>> + } struct_ops;
>> };
>> } __attribute__((aligned(8)));
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index bba03b6b010b..9ec675576d97 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -14,6 +14,7 @@
>> enum bpf_struct_ops_state {
>> BPF_STRUCT_OPS_STATE_INIT,
>> + BPF_STRUCT_OPS_STATE_READY,
>
> Please add it to the end. Although it is not in uapi, try not to disrupt
> the userspace introspection tool if it does not have to.
>
Got it!
>> BPF_STRUCT_OPS_STATE_INUSE,
>> BPF_STRUCT_OPS_STATE_TOBEFREE,
>> };
>> @@ -494,11 +495,19 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> *(unsigned long *)(udata + moff) = prog->aux->id;
>> }
>> - bpf_map_inc(map);
>> -
>> set_memory_rox((long)st_map->image, 1);
>> + if (st_map->map.map_flags & BPF_F_LINK) {
>> + /* Let bpf_link handle registration & unregistration.
>> + *
>> + * Pair with smp_load_acquire() during lookup_elem().
>> + */
>> + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
>> + goto unlock;
>> + }
>> +
>> err = st_ops->reg(kdata);
>> if (likely(!err)) {
>> + bpf_map_inc(map);
>> /* Pair with smp_load_acquire() during lookup_elem().
>> * It ensures the above udata updates (e.g. prog->aux->id)
>> * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
>> @@ -514,7 +523,6 @@ static int bpf_struct_ops_map_update_elem(struct
>> bpf_map *map, void *key,
>> */
>> set_memory_nx((long)st_map->image, 1);
>> set_memory_rw((long)st_map->image, 1);
>> - bpf_map_put(map);
>> reset_unlock:
>> bpf_struct_ops_map_put_progs(st_map);
>> @@ -532,10 +540,15 @@ static int bpf_struct_ops_map_delete_elem(struct
>> bpf_map *map, void *key)
>> struct bpf_struct_ops_map *st_map;
>> st_map = (struct bpf_struct_ops_map *)map;
>> + if (st_map->map.map_flags & BPF_F_LINK)
>> + return -EOPNOTSUPP;
>> +
>> prev_state = cmpxchg(&st_map->kvalue.state,
>> BPF_STRUCT_OPS_STATE_INUSE,
>> BPF_STRUCT_OPS_STATE_TOBEFREE);
>> switch (prev_state) {
>> + case BPF_STRUCT_OPS_STATE_READY:
>> + return -EOPNOTSUPP;
>
> If this case never happens, this case should be removed. The WARN in the
> default case at the end is better handling.
No, it never happens. I will remove it.
>
>> case BPF_STRUCT_OPS_STATE_INUSE:
>> st_map->st_ops->unreg(&st_map->kvalue.data);
>> bpf_map_put(map);
>> @@ -618,7 +631,7 @@ static void bpf_struct_ops_map_free_rcu(struct
>> bpf_map *map)
>> static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
>> {
>> if (attr->key_size != sizeof(unsigned int) || attr->max_entries
>> != 1 ||
>> - attr->map_flags || !attr->btf_vmlinux_value_type_id)
>> + (attr->map_flags & ~BPF_F_LINK) ||
>> !attr->btf_vmlinux_value_type_id)
>> return -EINVAL;
>> return 0;
>> }
>> @@ -714,3 +727,100 @@ void bpf_struct_ops_put(const void *kdata)
>> bpf_map_put(&st_map->map);
>> }
>> +
>> +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_struct_ops_map *st_map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + if (st_link->map) {
>
> Will map ever be NULL?
No, it is never NULL after removing the detach feature.
>
>> + st_map = (struct bpf_struct_ops_map *)st_link->map;
>> + st_map->st_ops->unreg(&st_map->kvalue.data);
>> + bpf_map_put(st_link->map);
>> + }
>> + kfree(st_link);
>> +}
>> +
>> +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link
>> *link,
>> + struct seq_file *seq)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_map *map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + rcu_read_lock_trace();
>
> Should it be rcu_read_lock()?
Got it!
>
>> + map = rcu_dereference(st_link->map);
>> + if (map)
>> + seq_printf(seq, "map_id:\t%d\n", map->id);
>> + rcu_read_unlock_trace();
>> +}
>> +
>> +static int bpf_struct_ops_map_link_fill_link_info(const struct
>> bpf_link *link,
>> + struct bpf_link_info *info)
>> +{
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_map *map;
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> + rcu_read_lock_trace();
>> + map = rcu_dereference(st_link->map);
>> + if (map)
>> + info->struct_ops.map_id = map->id;
>> + rcu_read_unlock_trace();
>> + return 0;
>> +}
>> +
>> +static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>> + .dealloc = bpf_struct_ops_map_link_dealloc,
>> + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
>> + .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
>> +};
>> +
>> +int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> + struct bpf_struct_ops_link *link = NULL;
>> + struct bpf_link_primer link_primer;
>> + struct bpf_struct_ops_map *st_map;
>> + struct bpf_map *map;
>> + int err;
>> +
>> + map = bpf_map_get(attr->link_create.map_fd);
>> + if (!map)
>> + return -EINVAL;
>> +
>> + st_map = (struct bpf_struct_ops_map *)map;
>> +
>> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags
>> & BPF_F_LINK) ||
>> + /* Pair with smp_store_release() during map_update */
>> + smp_load_acquire(&st_map->kvalue.state) !=
>> BPF_STRUCT_OPS_STATE_READY) {
>> + err = -EINVAL;
>> + goto err_out;
>> + }
>> +
>> + link = kzalloc(sizeof(*link), GFP_USER);
>> + if (!link) {
>> + err = -ENOMEM;
>> + goto err_out;
>> + }
>> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
>> &bpf_struct_ops_map_lops, NULL);
>> + link->map = map;
>
> RCU_INIT_POINTER().
Got it!
>
>> +
>> + err = bpf_link_prime(&link->link, &link_primer);
>> + if (err)
>> + goto err_out;
>> +
>> + err = st_map->st_ops->reg(st_map->kvalue.data);
>> + if (err) {
>> + bpf_link_cleanup(&link_primer);
>> + goto err_out;
>> + }
>> +
>> + return bpf_link_settle(&link_primer);
>> +
>> +err_out:
>> + bpf_map_put(map);
>> + kfree(link);
>> + return err;
>> +}
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 358a0e40555e..3db4938212d6 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2743,10 +2743,11 @@ void bpf_link_inc(struct bpf_link *link)
>> static void bpf_link_free(struct bpf_link *link)
>> {
>> bpf_link_free_id(link->id);
>> + /* detach BPF program, clean up used resources */
>> if (link->prog) {
>> - /* detach BPF program, clean up used resources */
>
> This comment move seems unnecessary.
>
>> link->ops->release(link);
>> bpf_prog_put(link->prog);
>> + /* The struct_ops links clean up the map by themselves. */
>
> This also seems unnecessary to only spell out for struct_ops link. Each
> specific link type does its cleanup in ->dealloc.
>
Got it!
>
* [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 1/8] bpf: Maintain the refcount of struct_ops maps directly Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 2/8] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-07 2:17 ` Martin KaFai Lau
2023-03-03 1:21 ` [PATCH bpf-next v3 4/8] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
` (4 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
This feature lets you immediately transition to another congestion
control algorithm or implementation with the same name. Once an
implementation is updated under a given name, new connections will
apply the new algorithm.
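A rough kernel-side sketch of the call pattern this introduces
(illustrative only; replace_ca() is a hypothetical caller):

static int replace_ca(struct tcp_congestion_ops *new_ca,
		      struct tcp_congestion_ops *old_ca)
{
	/* new_ca carries the same ->name as old_ca, which is already
	 * registered.  On success old_ca is unlinked and new_ca takes
	 * its slot: new connections resolve the name to new_ca, while
	 * established sockets keep the ops they already hold.
	 */
	return tcp_update_congestion_control(new_ca, old_ca);
}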
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 1 +
include/net/tcp.h | 2 ++
net/bpf/bpf_dummy_struct_ops.c | 6 ++++
net/ipv4/bpf_tcp_ca.c | 8 +++--
net/ipv4/tcp_cong.c | 57 +++++++++++++++++++++++++++++-----
5 files changed, 65 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b845be719422..acb18aa64b77 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1452,6 +1452,7 @@ struct bpf_struct_ops {
void *kdata, const void *udata);
int (*reg)(void *kdata);
void (*unreg)(void *kdata);
+ int (*update)(void *kdata, void *old_kdata);
const struct btf_type *type;
const struct btf_type *value_type;
const char *name;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..239cc0e2639c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,8 @@ struct tcp_congestion_ops {
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+ struct tcp_congestion_ops *old_type);
void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ff4f89a2b02a..158f14e240d0 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -222,12 +222,18 @@ static void bpf_dummy_unreg(void *kdata)
{
}
+static int bpf_dummy_update(void *kdata, void *old_kdata)
+{
+ return -EOPNOTSUPP;
+}
+
struct bpf_struct_ops bpf_bpf_dummy_ops = {
.verifier_ops = &bpf_dummy_verifier_ops,
.init = bpf_dummy_init,
.check_member = bpf_dummy_ops_check_member,
.init_member = bpf_dummy_init_member,
.reg = bpf_dummy_reg,
+ .update = bpf_dummy_update,
.unreg = bpf_dummy_unreg,
.name = "bpf_dummy_ops",
};
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..558b01d5250f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
- if (tcp_ca_find(utcp_ca->name))
- return -EEXIST;
return 1;
}
@@ -266,10 +264,16 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
.unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
.check_member = bpf_tcp_ca_check_member,
.init_member = bpf_tcp_ca_init_member,
.init = bpf_tcp_ca_init,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..981501871609 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
return NULL;
}
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+static int tcp_ca_validate(struct tcp_congestion_ops *ca)
{
- int ret = 0;
-
/* all algorithms must implement these */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret;
+
+ ret = tcp_ca_validate(ca);
+ if (ret)
+ return ret;
+
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,41 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, which has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+ struct tcp_congestion_ops *existing;
+ int ret;
+
+ ret = tcp_ca_validate(ca);
+ if (ret)
+ return ret;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key);
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) {
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
--
2.30.2
* Re: [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm.
2023-03-03 1:21 ` [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
@ 2023-03-07 2:17 ` Martin KaFai Lau
2023-03-07 19:17 ` Kui-Feng Lee
0 siblings, 1 reply; 20+ messages in thread
From: Martin KaFai Lau @ 2023-03-07 2:17 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
> diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
> index db8b4b488c31..981501871609 100644
> --- a/net/ipv4/tcp_cong.c
> +++ b/net/ipv4/tcp_cong.c
> @@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
> return NULL;
> }
>
> -/*
> - * Attach new congestion control algorithm to the list
> - * of available options.
> - */
> -int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
> +static int tcp_ca_validate(struct tcp_congestion_ops *ca)
It is useful to call tcp_ca_validate() before update_elem transits a
struct_ops to BPF_STRUCT_OPS_STATE_READY. Otherwise, the user space will end up
having a struct_ops that can never be used to create a link.
* Re: [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm.
2023-03-07 2:17 ` Martin KaFai Lau
@ 2023-03-07 19:17 ` Kui-Feng Lee
0 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 19:17 UTC (permalink / raw)
To: Martin KaFai Lau, Kui-Feng Lee; +Cc: bpf, ast, song, kernel-team, andrii, sdf
On 3/6/23 18:17, Martin KaFai Lau wrote:
> On 3/2/23 5:21 PM, Kui-Feng Lee wrote:
>> diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
>> index db8b4b488c31..981501871609 100644
>> --- a/net/ipv4/tcp_cong.c
>> +++ b/net/ipv4/tcp_cong.c
>> @@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
>> return NULL;
>> }
>> -/*
>> - * Attach new congestion control algorithm to the list
>> - * of available options.
>> - */
>> -int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
>> +static int tcp_ca_validate(struct tcp_congestion_ops *ca)
>
> It is useful to call tcp_ca_validate() before update_elem transits a
> struct_ops to BPF_STRUCT_OPS_STATE_READY. Otherwise, the user space will
> end up having a struct_ops that can never be used to create a link.
I will add a function pointer in struct bpf_struct_ops to expose this
check function. At the struct_ops side, it will call the function to
check kdata before transiting to READY if the function pointer is set.
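Roughly, as a sketch of that plan (the eventual hook name and wiring
in v4 may differ):

/* struct bpf_struct_ops gains an optional callback: */
	int (*validate)(void *kdata);

/* and bpf_struct_ops_map_update_elem() runs it before READY: */
	if (st_map->map.map_flags & BPF_F_LINK) {
		err = st_ops->validate ? st_ops->validate(kdata) : 0;
		if (err)
			goto reset_unlock;
		/* Pair with smp_load_acquire() during lookup_elem(). */
		smp_store_release(&kvalue->state,
				  BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}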
>
>
>
* [PATCH bpf-next v3 4/8] libbpf: Create a bpf_link in bpf_map__attach_struct_ops().
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
` (2 preceding siblings ...)
2023-03-03 1:21 ` [PATCH bpf-next v3 3/8] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 5/8] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
` (3 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
bpf_map__attach_struct_ops() was creating a dummy bpf_link as a
placeholder, but now it is constructing an authentic one by calling
bpf_link_create() if the map has the BPF_F_LINK flag.
You can flag a struct_ops map with BPF_F_LINK by calling
bpf_map__set_map_flags().
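A hedged usage sketch ("ca_update" is a hypothetical struct_ops map
name; error handling elided):

#include <bpf/libbpf.h>

static struct bpf_link *attach_with_link(struct bpf_object *obj)
{
	struct bpf_map *map;

	map = bpf_object__find_map_by_name(obj, "ca_update");
	if (!map)
		return NULL;

	/* Must be set before load so the kernel creates the map
	 * with BPF_F_LINK.
	 */
	bpf_map__set_map_flags(map, bpf_map__map_flags(map) | BPF_F_LINK);

	if (bpf_object__load(obj))
		return NULL;

	/* With the flag set this returns a real link backed by
	 * BPF_LINK_CREATE; without it, the old map-fd placeholder.
	 */
	return bpf_map__attach_struct_ops(map);
}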
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 84 +++++++++++++++++++++++++++++++-----------
1 file changed, 62 insertions(+), 22 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 35a698eb825d..a67efc3b3763 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -115,6 +115,7 @@ static const char * const attach_type_name[] = {
[BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate",
[BPF_PERF_EVENT] = "perf_event",
[BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
+ [BPF_STRUCT_OPS] = "struct_ops",
};
static const char * const link_type_name[] = {
@@ -7677,6 +7678,26 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
return 0;
}
+static void bpf_map_prepare_vdata(const struct bpf_map *map)
+{
+ struct bpf_struct_ops *st_ops;
+ __u32 i;
+
+ st_ops = map->st_ops;
+ for (i = 0; i < btf_vlen(st_ops->type); i++) {
+ struct bpf_program *prog = st_ops->progs[i];
+ void *kern_data;
+ int prog_fd;
+
+ if (!prog)
+ continue;
+
+ prog_fd = bpf_program__fd(prog);
+ kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
+ *(unsigned long *)kern_data = prog_fd;
+ }
+}
+
static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
{
int err, i;
@@ -7728,6 +7749,10 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
btf__free(obj->btf_vmlinux);
obj->btf_vmlinux = NULL;
+ for (i = 0; i < obj->nr_maps; i++)
+ if (bpf_map__is_struct_ops(&obj->maps[i]))
+ bpf_map_prepare_vdata(&obj->maps[i]);
+
obj->loaded = true; /* doesn't matter if successfully or not */
if (err)
@@ -11429,22 +11454,34 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
return link;
}
+struct bpf_link_struct_ops {
+ struct bpf_link link;
+ int map_fd;
+};
+
static int bpf_link__detach_struct_ops(struct bpf_link *link)
{
+ struct bpf_link_struct_ops *st_link;
__u32 zero = 0;
- if (bpf_map_delete_elem(link->fd, &zero))
- return -errno;
+ st_link = container_of(link, struct bpf_link_struct_ops, link);
- return 0;
+ if (st_link->map_fd < 0) {
+ /* Fake bpf_link */
+ if (bpf_map_delete_elem(link->fd, &zero))
+ return -errno;
+ return 0;
+ }
+
+ /* Doesn't support detaching. */
+ return -EOPNOTSUPP;
}
struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
{
- struct bpf_struct_ops *st_ops;
- struct bpf_link *link;
- __u32 i, zero = 0;
- int err;
+ struct bpf_link_struct_ops *link;
+ __u32 zero = 0;
+ int err, fd;
if (!bpf_map__is_struct_ops(map) || map->fd == -1)
return libbpf_err_ptr(-EINVAL);
@@ -11453,31 +11490,34 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
if (!link)
return libbpf_err_ptr(-EINVAL);
- st_ops = map->st_ops;
- for (i = 0; i < btf_vlen(st_ops->type); i++) {
- struct bpf_program *prog = st_ops->progs[i];
- void *kern_data;
- int prog_fd;
+ /* kern_vdata should be prepared during the loading phase. */
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ if (err) {
+ err = -errno;
+ free(link);
+ return libbpf_err_ptr(err);
+ }
- if (!prog)
- continue;
- prog_fd = bpf_program__fd(prog);
- kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
- *(unsigned long *)kern_data = prog_fd;
+ if (!(map->def.map_flags & BPF_F_LINK)) {
+ /* Fake bpf_link */
+ link->link.fd = map->fd;
+ link->map_fd = -1;
+ link->link.detach = bpf_link__detach_struct_ops;
+ return &link->link;
}
- err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
- if (err) {
+ fd = bpf_link_create(map->fd, -1, BPF_STRUCT_OPS, NULL);
+ if (fd < 0) {
err = -errno;
free(link);
return libbpf_err_ptr(err);
}
- link->detach = bpf_link__detach_struct_ops;
- link->fd = map->fd;
+ link->link.fd = fd;
+ link->map_fd = map->fd;
- return link;
+ return &link->link;
}
typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
--
2.30.2
* [PATCH bpf-next v3 5/8] bpf: Update the struct_ops of a bpf_link.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
` (3 preceding siblings ...)
2023-03-03 1:21 ` [PATCH bpf-next v3 4/8] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 6/8] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
` (2 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Improve the BPF_LINK_UPDATE command of bpf() to allow you to
conveniently switch between different struct_ops on a single
bpf_link. This enables smoother transitions from one struct_ops to
another.
The struct_ops maps passed along with BPF_LINK_UPDATE should have the
BPF_F_LINK flag.
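A hedged syscall-level sketch (switch_struct_ops() is a hypothetical
helper; new_map_fd rides in the same union slot as new_prog_fd, so the
existing libbpf wrapper can carry it):

#include <bpf/bpf.h>

static int switch_struct_ops(int link_fd, int new_map_fd)
{
	/* flags must be 0: a struct_ops link update always replaces.
	 * Both maps must be the same struct_ops type, created with
	 * BPF_F_LINK, and already in the READY state.
	 */
	return bpf_link_update(link_fd, new_map_fd, NULL);
}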
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 8 +++++--
kernel/bpf/bpf_struct_ops.c | 46 +++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 43 ++++++++++++++++++++++++++++++----
4 files changed, 92 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index acb18aa64b77..643f94432b65 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1410,6 +1410,7 @@ struct bpf_link_ops {
void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
int (*fill_link_info)(const struct bpf_link *link,
struct bpf_link_info *info);
+ int (*update_map)(struct bpf_link *link, struct bpf_map *new_map);
};
struct bpf_tramp_link {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cd0ff39981e8..0702e88f7c08 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1555,8 +1555,12 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
/* expected link's program fd; is specified only if
* BPF_F_REPLACE flag is set in flags */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 9ec675576d97..ce856811359f 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -771,10 +771,56 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
return 0;
}
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *old_map;
+ int err = 0;
+
+ if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ !(new_map->map_flags & BPF_F_LINK))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+
+ /* The new and old struct_ops must be the same type. */
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ old_map = st_link->map;
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ if (st_map->st_ops != old_st_map->st_ops ||
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ kvalue = &st_map->kvalue;
+
+ err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3db4938212d6..a640a280bff9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4655,6 +4655,30 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map;
+ int ret = 0;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return -EINVAL;
+
+ if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto out_put_map;
+ }
+
+ if (link->ops->update_map)
+ ret = link->ops->update_map(link, new_map);
+ else
+ ret = -EINVAL;
+
+out_put_map:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4667,14 +4691,25 @@ static int link_update(union bpf_attr *attr)
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
- flags = attr->link_update.flags;
- if (flags & ~BPF_F_REPLACE)
- return -EINVAL;
-
link = bpf_link_get_from_fd(attr->link_update.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
+ flags = attr->link_update.flags;
+
+ if (link->ops->update_map) {
+ if (flags) /* always replace the existing one */
+ ret = -EINVAL;
+ else
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
+ if (flags & ~BPF_F_REPLACE) {
+ ret = -EINVAL;
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
--
2.30.2
* [PATCH bpf-next v3 6/8] libbpf: Update a bpf_link with another struct_ops.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
` (4 preceding siblings ...)
2023-03-03 1:21 ` [PATCH bpf-next v3 5/8] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 7/8] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 8/8] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
7 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Introduce bpf_link__update_map(), which allows replacing the
struct_ops map backing a given bpf_link with an alternative map.
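A hedged usage sketch (skeleton and map names are hypothetical, error
handling elided); both maps must share the same struct_ops type and be
created with BPF_F_LINK:
struct bpf_link *link;
link = bpf_map__attach_struct_ops(skel->maps.ca_v1);
if (!link)
	return -1;
/* Replace ca_v1 with ca_v2 behind the same link. */
if (bpf_link__update_map(link, skel->maps.ca_v2))
	return -1;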
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 36 ++++++++++++++++++++++++++++++++++++
tools/lib/bpf/libbpf.h | 1 +
tools/lib/bpf/libbpf.map | 2 ++
3 files changed, 39 insertions(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a67efc3b3763..247de39d136f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -11520,6 +11520,42 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
return &link->link;
}
+/*
+ * Swap the backing struct_ops map of a link with a new struct_ops map.
+ */
+int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
+{
+ struct bpf_link_struct_ops *st_ops_link;
+ __u32 zero = 0;
+ int err, fd;
+
+ if (!bpf_map__is_struct_ops(map) || map->fd == -1)
+ return -EINVAL;
+
+ st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
+ /* Ensure the link is a struct_ops link */
+ if (st_ops_link->map_fd < 0)
+ return -EINVAL;
+
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ if (err && errno != EBUSY) {
+ err = -errno;
+ free(link);
+ return err;
+ }
+
+ fd = bpf_link_update(link->fd, map->fd, NULL);
+ if (fd < 0) {
+ err = -errno;
+ free(link);
+ return err;
+ }
+
+ st_ops_link->map_fd = map->fd;
+
+ return 0;
+}
+
typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
void *private_data);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 2efd80f6f7b9..5e62878d184c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -695,6 +695,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
struct bpf_iter_attach_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 11c36a3c1a9f..e83571b04c19 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -384,4 +384,6 @@ LIBBPF_1.1.0 {
} LIBBPF_1.0.0;
LIBBPF_1.2.0 {
+ global:
+ bpf_link__update_map;
} LIBBPF_1.1.0;
--
2.30.2
* [PATCH bpf-next v3 7/8] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
` (5 preceding siblings ...)
2023-03-03 1:21 ` [PATCH bpf-next v3 6/8] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
2023-03-03 1:21 ` [PATCH bpf-next v3 8/8] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
7 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Flag that a struct_ops is to back a bpf_link by placing it in the
".struct_ops.link" section. Once flagged, the created struct_ops map
can be used to create a bpf_link, or to update a bpf_link already
backed by another struct_ops.
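For example, a BPF object can declare a link-backed struct_ops as below
(a sketch mirroring the selftest in patch 8; the type and callback names
are only illustrative):
/* Placed in ".struct_ops.link", so libbpf creates this map with
 * BPF_F_LINK and it can back a bpf_link. */
SEC(".struct_ops.link")
struct tcp_congestion_ops my_ca = {
	.ssthresh	= (void *)my_ssthresh,
	.undo_cwnd	= (void *)my_undo_cwnd,
	.name		= "my_ca",
};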
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 64 +++++++++++++++++++++++++++++++++---------
1 file changed, 50 insertions(+), 14 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 247de39d136f..d66acd2fdbaa 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -467,6 +467,7 @@ struct bpf_struct_ops {
#define KCONFIG_SEC ".kconfig"
#define KSYMS_SEC ".ksyms"
#define STRUCT_OPS_SEC ".struct_ops"
+#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
enum libbpf_map_type {
LIBBPF_MAP_UNSPEC,
@@ -596,6 +597,7 @@ struct elf_state {
Elf64_Ehdr *ehdr;
Elf_Data *symbols;
Elf_Data *st_ops_data;
+ Elf_Data *st_ops_link_data;
size_t shstrndx; /* section index for section name strings */
size_t strtabidx;
struct elf_sec_desc *secs;
@@ -605,6 +607,7 @@ struct elf_state {
int text_shndx;
int symbols_shndx;
int st_ops_shndx;
+ int st_ops_link_shndx;
};
struct usdt_manager;
@@ -1119,7 +1122,7 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
return 0;
}
-static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+static int bpf_object__init_struct_ops_maps_link(struct bpf_object *obj, bool link)
{
const struct btf_type *type, *datasec;
const struct btf_var_secinfo *vsi;
@@ -1127,18 +1130,33 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
const char *tname, *var_name;
__s32 type_id, datasec_id;
const struct btf *btf;
+ const char *sec_name;
struct bpf_map *map;
- __u32 i;
+ __u32 i, map_flags;
+ Elf_Data *data;
+ int shndx;
- if (obj->efile.st_ops_shndx == -1)
+ if (link) {
+ sec_name = STRUCT_OPS_LINK_SEC;
+ shndx = obj->efile.st_ops_link_shndx;
+ data = obj->efile.st_ops_link_data;
+ map_flags = BPF_F_LINK;
+ } else {
+ sec_name = STRUCT_OPS_SEC;
+ shndx = obj->efile.st_ops_shndx;
+ data = obj->efile.st_ops_data;
+ map_flags = 0;
+ }
+
+ if (shndx == -1)
return 0;
btf = obj->btf;
- datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
+ datasec_id = btf__find_by_name_kind(btf, sec_name,
BTF_KIND_DATASEC);
if (datasec_id < 0) {
pr_warn("struct_ops init: DATASEC %s not found\n",
- STRUCT_OPS_SEC);
+ sec_name);
return -EINVAL;
}
@@ -1151,7 +1169,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
type_id = btf__resolve_type(obj->btf, vsi->type);
if (type_id < 0) {
pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
- vsi->type, STRUCT_OPS_SEC);
+ vsi->type, sec_name);
return -EINVAL;
}
@@ -1170,7 +1188,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (IS_ERR(map))
return PTR_ERR(map);
- map->sec_idx = obj->efile.st_ops_shndx;
+ map->sec_idx = shndx;
map->sec_offset = vsi->offset;
map->name = strdup(var_name);
if (!map->name)
@@ -1180,6 +1198,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
map->def.key_size = sizeof(int);
map->def.value_size = type->size;
map->def.max_entries = 1;
+ map->def.map_flags = map_flags;
map->st_ops = calloc(1, sizeof(*map->st_ops));
if (!map->st_ops)
@@ -1192,14 +1211,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
return -ENOMEM;
- if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
+ if (vsi->offset + type->size > data->d_size) {
pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
- var_name, STRUCT_OPS_SEC);
+ var_name, sec_name);
return -EINVAL;
}
memcpy(st_ops->data,
- obj->efile.st_ops_data->d_buf + vsi->offset,
+ data->d_buf + vsi->offset,
type->size);
st_ops->tname = tname;
st_ops->type = type;
@@ -1212,6 +1231,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
return 0;
}
+static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+{
+ int err;
+
+ err = bpf_object__init_struct_ops_maps_link(obj, false);
+ err = err ?: bpf_object__init_struct_ops_maps_link(obj, true);
+ return err;
+}
+
static struct bpf_object *bpf_object__new(const char *path,
const void *obj_buf,
size_t obj_buf_sz,
@@ -1248,6 +1276,7 @@ static struct bpf_object *bpf_object__new(const char *path,
obj->efile.obj_buf_sz = obj_buf_sz;
obj->efile.btf_maps_shndx = -1;
obj->efile.st_ops_shndx = -1;
+ obj->efile.st_ops_link_shndx = -1;
obj->kconfig_map_idx = -1;
obj->kern_version = get_kernel_version();
@@ -1265,6 +1294,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
obj->efile.elf = NULL;
obj->efile.symbols = NULL;
obj->efile.st_ops_data = NULL;
+ obj->efile.st_ops_link_data = NULL;
zfree(&obj->efile.secs);
obj->efile.sec_cnt = 0;
@@ -2753,12 +2783,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
{
return obj->efile.btf_maps_shndx >= 0 ||
obj->efile.st_ops_shndx >= 0 ||
+ obj->efile.st_ops_link_shndx >= 0 ||
obj->nr_extern > 0;
}
static bool kernel_needs_btf(const struct bpf_object *obj)
{
- return obj->efile.st_ops_shndx >= 0;
+ return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
}
static int bpf_object__init_btf(struct bpf_object *obj,
@@ -3451,6 +3482,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
obj->efile.st_ops_data = data;
obj->efile.st_ops_shndx = idx;
+ } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
+ obj->efile.st_ops_link_data = data;
+ obj->efile.st_ops_link_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@@ -3465,6 +3499,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
/* Only do relo for section with exec instructions */
if (!section_have_execinstr(obj, targ_sec_idx) &&
strcmp(name, ".rel" STRUCT_OPS_SEC) &&
+ strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
strcmp(name, ".rel" MAPS_ELF_SEC)) {
pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
idx, name, targ_sec_idx,
@@ -6611,7 +6646,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
return -LIBBPF_ERRNO__INTERNAL;
}
- if (idx == obj->efile.st_ops_shndx)
+ if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
err = bpf_object__collect_st_ops_relos(obj, shdr, data);
else if (idx == obj->efile.btf_maps_shndx)
err = bpf_object__collect_map_relos(obj, shdr, data);
@@ -8954,8 +8989,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
}
/* struct_ops BPF prog can be re-used between multiple
- * .struct_ops as long as it's the same struct_ops struct
- * definition and the same function pointer field
+ * .struct_ops & .struct_ops.link as long as it's the
+ * same struct_ops struct definition and the same
+ * function pointer field
*/
if (prog->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != member_idx) {
--
2.30.2
* [PATCH bpf-next v3 8/8] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-03 1:21 [PATCH bpf-next v3 0/8] Transit between BPF TCP congestion controls Kui-Feng Lee
` (6 preceding siblings ...)
2023-03-03 1:21 ` [PATCH bpf-next v3 7/8] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
@ 2023-03-03 1:21 ` Kui-Feng Lee
7 siblings, 0 replies; 20+ messages in thread
From: Kui-Feng Lee @ 2023-03-03 1:21 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Create a pair of sockets that use the congestion control algorithm
registered under a particular name. Then switch that name to another
struct_ops implementation via bpf_link__update_map() and check whether
newly created connections using the same cc name now run the new
implementation.
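The switch is observable because a socket selects its congestion
control purely by name; a hedged sketch of the user-space side assumed
here (error handling elided):
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
/* Connections created after the link update still ask for
 * "tcp_ca_update" by name, but now resolve to the new struct_ops. */
static int set_cc(int fd, const char *name)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
}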
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
.../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
.../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
2 files changed, 100 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index e980188d4124..caaa9175ee36 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -8,6 +8,7 @@
#include "bpf_dctcp.skel.h"
#include "bpf_cubic.skel.h"
#include "bpf_tcp_nogpl.skel.h"
+#include "tcp_ca_update.skel.h"
#include "bpf_dctcp_release.skel.h"
#include "tcp_ca_write_sk_pacing.skel.h"
#include "tcp_ca_incompl_cong_ops.skel.h"
@@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
libbpf_set_print(old_print_fn);
}
+static void test_update_ca(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+ int saved_ca1_cnt;
+ int err;
+
+ skel = tcp_ca_update__open();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ err = tcp_ca_update__load(skel);
+ if (!ASSERT_OK(err, "load")) {
+ tcp_ca_update__destroy(skel);
+ return;
+ }
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ saved_ca1_cnt = skel->bss->ca1_cnt;
+ ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+
+ err = bpf_link__update_map(link, skel->maps.ca_update_2);
+ ASSERT_OK(err, "update_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+ ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
+
+ bpf_link__destroy(link);
+ tcp_ca_update__destroy(skel);
+}
+
void test_bpf_tcp_ca(void)
{
if (test__start_subtest("dctcp"))
@@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
test_incompl_cong_ops();
if (test__start_subtest("unsupp_cong_op"))
test_unsupp_cong_op();
+ if (test__start_subtest("update_ca"))
+ test_update_ca();
}
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
new file mode 100644
index 000000000000..36a04be95df5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int ca1_cnt = 0;
+int ca2_cnt = 0;
+
+#define USEC_PER_SEC 1000000UL
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+ return (struct tcp_sock *)sk;
+}
+
+SEC("struct_ops/ca_update_1_cong_control")
+void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
+ const struct rate_sample *rs)
+{
+ ca1_cnt++;
+}
+
+SEC("struct_ops/ca_update_2_cong_control")
+void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
+ const struct rate_sample *rs)
+{
+ ca2_cnt++;
+}
+
+SEC("struct_ops/ca_update_ssthresh")
+__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+SEC("struct_ops/ca_update_undo_cwnd")
+__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_1 = {
+ .cong_control = (void *)ca_update_1_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_2 = {
+ .cong_control = (void *)ca_update_2_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
--
2.30.2