* [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
@ 2024-11-01 2:38 mrpre
2024-11-01 2:38 ` [PATCH 2/2] bpf: implement libbpf sockmap cpu affinity mrpre
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: mrpre @ 2024-11-01 2:38 UTC (permalink / raw)
To: yonghong.song, john.fastabend, martin.lau, edumazet, jakub, davem,
dsahern, kuba, pabeni, netdev, bpf, linux-kernel
Cc: mrpre
Why we need cpu affinity:
Mainstream data planes, such as Nginx and HAProxy, use CPU affinity
by binding worker processes to specific CPUs, which avoids interference
between processes and isolates them from unrelated workloads.
Sockmap, as an optimization to accelerate such proxy programs,
currently lacks the ability to specify CPU affinity. Sockmap handles
its backlog through a workqueue, scheduled via 'schedule_delayed_work()',
which prefers to run on the local CPU, i.e., the CPU that handled the
packet in softirq context.
Under extremely high traffic with large numbers of packets,
'sk_psock_backlog' turns into a long-running loop.
For multi-threaded programs that share a single map, we expect
different sockets to run on different CPUs. It is important to note
that this feature is not a general performance optimization. Instead,
it gives users the ability to bind a socket's backlog processing to a
specific CPU, allowing them to improve overall system utilization
based on their own environments.
Implementation:
1. When updating the sockmap, support passing a CPU parameter and
save it in the psock (a user-space usage sketch follows below).
2. When scheduling the psock work, pick the CPU to run on from the
psock's saved CPU information.
3. For those sockmap entries without CPU affinity, keep the original
logic by using 'schedule_delayed_work()'.
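For reference, here is a minimal user-space sketch of the intended
usage via the raw bpf() syscall (the wrapper is only illustrative and
assumes headers that already contain the new 'target_cpu' attribute;
proper libbpf support is added in patch 2/2):
'''
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Bind the backlog work of one sockmap entry to 'cpu'. Passing a NULL
 * target_cpu pointer keeps the old, unbound behaviour.
 */
static int sock_map_update_on_cpu(int map_fd, __u32 key, int sock_fd, __u64 cpu)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&sock_fd;
	attr.flags = BPF_NOEXIST;
	attr.target_cpu = (__u64)(unsigned long)&cpu;	/* pointer to the CPU id */

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
'''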
Performance Testing:
'client <-> sockmap proxy <-> server'
Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
client bound to CPU6, performance without using CPU affinity is
around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
'''
[ 5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
[ ID] Interval Transfer Bitrate
[ 5] 0.00-1.00 sec 3.95 GBytes 33.9 Gbits/sec
[ 5] 1.00-2.00 sec 3.95 GBytes 34.0 Gbits/sec
......
'''
With CPU affinity, the performance is close to a direct connection
(without any proxy).
'''
[ 5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
[ ID] Interval Transfer Bitrate
[ 5] 0.00-1.00 sec 7.76 GBytes 66.6 Gbits/sec
[ 5] 1.00-2.00 sec 7.76 GBytes 66.7 Gbits/sec
......
'''
Signed-off-by: Jiayuan Chen <mrpre@163.com>
---
include/linux/bpf.h | 3 ++-
include/linux/skmsg.h | 8 ++++++++
include/uapi/linux/bpf.h | 4 ++++
kernel/bpf/syscall.c | 23 +++++++++++++++++------
net/core/skmsg.c | 11 +++++++----
net/core/sock_map.c | 12 +++++++-----
6 files changed, 45 insertions(+), 16 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..a56028c389e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
-int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
+ s32 target_cpu);
int sock_map_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d9b03e0746e7..919425a92adf 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -117,6 +117,7 @@ struct sk_psock {
struct delayed_work work;
struct sock *sk_pair;
struct rcu_work rwork;
+ s32 target_cpu;
};
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
@@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
return !!psock->saved_data_ready;
}
+static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
+{
+ if (psock->target_cpu != -1)
+ return psock->target_cpu;
+ return WORK_CPU_UNBOUND;
+}
+
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
#define BPF_F_STRPARSER (1UL << 1)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..2019a87b5d4a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1509,6 +1509,10 @@ union bpf_attr {
__aligned_u64 next_key;
};
__u64 flags;
+ union {
+ /* specify the CPU where the sockmap job run on */
+ __aligned_u64 target_cpu;
+ };
};
struct { /* struct used by BPF_MAP_*_BATCH commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8254b2973157..95f719b9c3f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
}
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
- void *key, void *value, __u64 flags)
+ void *key, void *value, __u64 flags, s32 target_cpu)
{
int err;
-
/* Need to create a kthread, thus must support schedule */
if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
@@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- return sock_map_update_elem_sys(map, key, value, flags);
+ return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
} else if (IS_FD_PROG_ARRAY(map)) {
return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
@@ -1680,12 +1679,14 @@ static int map_lookup_elem(union bpf_attr *attr)
}
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD target_cpu
static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
+ bpfptr_t utarget_cpu = make_bpfptr(attr->target_cpu, uattr.is_kernel);
+ s64 target_cpu = 0;
struct bpf_map *map;
void *key, *value;
u32 value_size;
@@ -1723,7 +1724,17 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP &&
+ !bpfptr_is_null(utarget_cpu)) {
+ if (copy_from_bpfptr(&target_cpu, utarget_cpu, sizeof(target_cpu)) ||
+ target_cpu > nr_cpu_ids) {
+ err = -EINVAL;
+ goto err_put;
+ }
+ } else {
+ target_cpu = -1;
+ }
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags, (s32)target_cpu);
if (!err)
maybe_wait_bpf_programs(map);
@@ -1947,7 +1958,7 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
break;
err = bpf_map_update_value(map, map_file, key, value,
- attr->batch.elem_flags);
+ attr->batch.elem_flags, -1);
if (err)
break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index b1dcbd3be89e..d3b6f2468dab 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -679,7 +679,8 @@ static void sk_psock_backlog(struct work_struct *work)
* other work that might be here.
*/
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_delayed_work(&psock->work, 1);
+ schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+ &psock->work, 1);
goto end;
}
/* Hard errors break pipe and stop xmit. */
@@ -729,6 +730,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
psock->saved_destroy = prot->destroy;
psock->saved_close = prot->close;
psock->saved_write_space = sk->sk_write_space;
+ psock->target_cpu = -1;
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
@@ -934,7 +936,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
}
skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_delayed_work(&psock_other->work, 0);
+ schedule_delayed_work_on(sk_psock_strp_get_cpu(psock_other), &psock_other->work, 0);
spin_unlock_bh(&psock_other->ingress_lock);
return 0;
}
@@ -1012,7 +1014,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
spin_lock_bh(&psock->ingress_lock);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
skb_queue_tail(&psock->ingress_skb, skb);
- schedule_delayed_work(&psock->work, 0);
+ schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+ &psock->work, 0);
err = 0;
}
spin_unlock_bh(&psock->ingress_lock);
@@ -1044,7 +1047,7 @@ static void sk_psock_write_space(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_delayed_work(&psock->work, 0);
+ schedule_delayed_work_on(sk_psock_strp_get_cpu(psock), &psock->work, 0);
write_space = psock->saved_write_space;
}
rcu_read_unlock();
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 07d6aa4e39ef..36e9787c60de 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -465,7 +465,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
}
static int sock_map_update_common(struct bpf_map *map, u32 idx,
- struct sock *sk, u64 flags)
+ struct sock *sk, u64 flags, s32 target_cpu)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct sk_psock_link *link;
@@ -490,6 +490,8 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
psock = sk_psock(sk);
WARN_ON_ONCE(!psock);
+ psock->target_cpu = target_cpu;
+
spin_lock_bh(&stab->lock);
osk = stab->sks[idx];
if (osk && flags == BPF_NOEXIST) {
@@ -548,7 +550,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
- u64 flags)
+ u64 flags, s32 target_cpu)
{
struct socket *sock;
struct sock *sk;
@@ -579,7 +581,7 @@ int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
- ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags, target_cpu);
else
ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
@@ -605,7 +607,7 @@ static long sock_map_update_elem(struct bpf_map *map, void *key,
if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
- ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags, -1);
else
ret = sock_hash_update_common(map, key, sk, flags);
bh_unlock_sock(sk);
@@ -621,7 +623,7 @@ BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
if (likely(sock_map_sk_is_suitable(sops->sk) &&
sock_map_op_okay(sops)))
return sock_map_update_common(map, *(u32 *)key, sops->sk,
- flags);
+ flags, -1);
return -EOPNOTSUPP;
}
--
2.43.5
* [PATCH 2/2] bpf: implement libbpf sockmap cpu affinity
2024-11-01 2:38 [PATCH 1/2] bpf: Introduce cpu affinity for sockmap mrpre
@ 2024-11-01 2:38 ` mrpre
2024-11-01 13:20 ` [PATCH 1/2] bpf: Introduce cpu affinity for sockmap kernel test robot
` (3 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: mrpre @ 2024-11-01 2:38 UTC (permalink / raw)
To: yonghong.song, john.fastabend, martin.lau, edumazet, jakub, davem,
dsahern, kuba, pabeni, netdev, bpf, linux-kernel
Cc: mrpre
Implement libbpf support for the sockmap cpu affinity attribute by
adding a bpf_map_update_elem_opts() wrapper that can carry a target
CPU, and extend the sockmap_basic selftest to cover it.
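A short usage sketch of the new API, mirroring the selftest below
('sockmap_fd' and 'sock_fd' are assumed to be an existing sockmap and
an established socket):
'''
	__u64 target_cpu = 2;
	__u32 key = 0;
	LIBBPF_OPTS(bpf_map_update_opts, update_opts,
		.target_cpu = &target_cpu,
	);

	/* leaving opts (or target_cpu) unset keeps the old unbound behaviour */
	err = bpf_map_update_elem_opts(sockmap_fd, &key, &sock_fd,
				       BPF_NOEXIST, &update_opts);
'''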
Signed-off-by: Jiayuan Chen <mrpre@163.com>
---
tools/include/uapi/linux/bpf.h | 4 ++++
tools/lib/bpf/bpf.c | 22 +++++++++++++++++++
tools/lib/bpf/bpf.h | 9 ++++++++
tools/lib/bpf/libbpf.map | 1 +
.../selftests/bpf/prog_tests/sockmap_basic.c | 19 ++++++++++++----
5 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f28b6527e815..2019a87b5d4a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1509,6 +1509,10 @@ union bpf_attr {
__aligned_u64 next_key;
};
__u64 flags;
+ union {
+ /* specify the CPU where the sockmap job run on */
+ __aligned_u64 target_cpu;
+ };
};
struct { /* struct used by BPF_MAP_*_BATCH commands */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 2a4c71501a17..13c3f3cfe889 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -401,6 +401,28 @@ int bpf_map_update_elem(int fd, const void *key, const void *value,
return libbpf_err_errno(ret);
}
+int bpf_map_update_elem_opts(int fd, const void *key, const void *value,
+ __u64 flags, const struct bpf_map_update_opts *opts)
+{
+ union bpf_attr attr;
+ int ret;
+ __u64 *target_cpu;
+
+ if (!OPTS_VALID(opts, bpf_map_update_opts))
+ return libbpf_err(-EINVAL);
+
+ target_cpu = OPTS_GET(opts, target_cpu, NULL);
+ memset(&attr, 0, sizeof(attr));
+ attr.map_fd = fd;
+ attr.key = ptr_to_u64(key);
+ attr.value = ptr_to_u64(value);
+ attr.flags = flags;
+ attr.target_cpu = ptr_to_u64(target_cpu);
+
+ ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+ return libbpf_err_errno(ret);
+}
+
int bpf_map_lookup_elem(int fd, const void *key, void *value)
{
const size_t attr_sz = offsetofend(union bpf_attr, flags);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index a4a7b1ad1b63..aec6dfddf697 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -147,6 +147,15 @@ LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size,
LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value,
__u64 flags);
+struct bpf_map_update_opts {
+ size_t sz; /* size of this struct for forward/backward compatibility */
+ /* specify the CPU where the sockmap job run on */
+ __u64 *target_cpu;
+ size_t :0;
+};
+#define bpf_map_update_opts__last_field target_cpu
+LIBBPF_API int bpf_map_update_elem_opts(int fd, const void *key, const void *value,
+ __u64 flags, const struct bpf_map_update_opts *opts);
LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value);
LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 54b6f312cfa8..ab5ec29f542d 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -17,6 +17,7 @@ LIBBPF_0.0.1 {
bpf_map_lookup_and_delete_elem;
bpf_map_lookup_elem;
bpf_map_update_elem;
+ bpf_map_update_elem_opts;
bpf_obj_get;
bpf_obj_get_info_by_fd;
bpf_obj_pin;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 82bfb266741c..84a35cb4b9fe 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -190,13 +190,18 @@ static void test_skmsg_helpers_with_link(enum bpf_map_type map_type)
test_skmsg_load_helpers__destroy(skel);
}
-static void test_sockmap_update(enum bpf_map_type map_type)
+static void test_sockmap_update(enum bpf_map_type map_type, bool cpu_affinity)
{
int err, prog, src;
struct test_sockmap_update *skel;
struct bpf_map *dst_map;
const __u32 zero = 0;
char dummy[14] = {0};
+ __u64 target_cpu = 0;
+
+ LIBBPF_OPTS(bpf_map_update_opts, update_opts,
+ .target_cpu = &target_cpu,
+ );
LIBBPF_OPTS(bpf_test_run_opts, topts,
.data_in = dummy,
.data_size_in = sizeof(dummy),
@@ -219,7 +224,11 @@ static void test_sockmap_update(enum bpf_map_type map_type)
else
dst_map = skel->maps.dst_sock_hash;
- err = bpf_map_update_elem(src, &zero, &sk, BPF_NOEXIST);
+ if (cpu_affinity)
+ err = bpf_map_update_elem_opts(src, &zero, &sk, BPF_NOEXIST, &update_opts);
+ else
+ err = bpf_map_update_elem(src, &zero, &sk, BPF_NOEXIST);
+
if (!ASSERT_OK(err, "update_elem(src)"))
goto out;
@@ -896,9 +905,11 @@ void test_sockmap_basic(void)
if (test__start_subtest("sockhash sk_msg load helpers"))
test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH);
if (test__start_subtest("sockmap update"))
- test_sockmap_update(BPF_MAP_TYPE_SOCKMAP);
+ test_sockmap_update(BPF_MAP_TYPE_SOCKMAP, false);
+ if (test__start_subtest("sockmap update cpu affinity"))
+ test_sockmap_update(BPF_MAP_TYPE_SOCKMAP, true);
if (test__start_subtest("sockhash update"))
- test_sockmap_update(BPF_MAP_TYPE_SOCKHASH);
+ test_sockmap_update(BPF_MAP_TYPE_SOCKHASH, false);
if (test__start_subtest("sockmap update in unsafe context"))
test_sockmap_invalid_update();
if (test__start_subtest("sockmap copy"))
--
2.43.5
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-01 2:38 [PATCH 1/2] bpf: Introduce cpu affinity for sockmap mrpre
2024-11-01 2:38 ` [PATCH 2/2] bpf: implement libbpf sockmap cpu affinity mrpre
@ 2024-11-01 13:20 ` kernel test robot
2024-11-01 14:02 ` kernel test robot
` (2 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2024-11-01 13:20 UTC (permalink / raw)
To: mrpre, yonghong.song, john.fastabend, martin.lau, edumazet, jakub,
davem, dsahern, kuba, pabeni, netdev, bpf, linux-kernel
Cc: llvm, oe-kbuild-all, mrpre
Hi mrpre,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master net-next/main net/main linus/master v6.12-rc5 next-20241101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mrpre/bpf-implement-libbpf-sockmap-cpu-affinity/20241101-104144
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20241101023832.32404-1-mrpre%40163.com
patch subject: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
config: i386-buildonly-randconfig-001-20241101 (https://download.01.org/0day-ci/archive/20241101/202411012119.5fFOfrH9-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411012119.5fFOfrH9-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411012119.5fFOfrH9-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from kernel/bpf/syscall.c:4:
In file included from include/linux/bpf.h:21:
In file included from include/linux/kallsyms.h:13:
In file included from include/linux/mm.h:2213:
include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
>> kernel/bpf/syscall.c:254:59: error: too many arguments to function call, expected 4, have 5
254 | return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
| ~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~
include/linux/bpf.h:3175:19: note: 'sock_map_update_elem_sys' declared here
3175 | static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3176 | u64 flags)
| ~~~~~~~~~
kernel/bpf/syscall.c:5961:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
5961 | .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
| ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
kernel/bpf/syscall.c:6011:41: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
6011 | .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
| ~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
3 warnings and 1 error generated.
vim +254 kernel/bpf/syscall.c
240
241 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
242 void *key, void *value, __u64 flags, s32 target_cpu)
243 {
244 int err;
245 /* Need to create a kthread, thus must support schedule */
246 if (bpf_map_is_offloaded(map)) {
247 return bpf_map_offload_update_elem(map, key, value, flags);
248 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
249 map->map_type == BPF_MAP_TYPE_ARENA ||
250 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
251 return map->ops->map_update_elem(map, key, value, flags);
252 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
253 map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> 254 return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
255 } else if (IS_FD_PROG_ARRAY(map)) {
256 return bpf_fd_array_map_update_elem(map, map_file, key, value,
257 flags);
258 }
259
260 bpf_disable_instrumentation();
261 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
262 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
263 err = bpf_percpu_hash_update(map, key, value, flags);
264 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
265 err = bpf_percpu_array_update(map, key, value, flags);
266 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
267 err = bpf_percpu_cgroup_storage_update(map, key, value,
268 flags);
269 } else if (IS_FD_ARRAY(map)) {
270 err = bpf_fd_array_map_update_elem(map, map_file, key, value,
271 flags);
272 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
273 err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
274 flags);
275 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
276 /* rcu_read_lock() is not needed */
277 err = bpf_fd_reuseport_array_update_elem(map, key, value,
278 flags);
279 } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
280 map->map_type == BPF_MAP_TYPE_STACK ||
281 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
282 err = map->ops->map_push_elem(map, value, flags);
283 } else {
284 err = bpf_obj_pin_uptrs(map->record, value);
285 if (!err) {
286 rcu_read_lock();
287 err = map->ops->map_update_elem(map, key, value, flags);
288 rcu_read_unlock();
289 if (err)
290 bpf_obj_unpin_uptrs(map->record, value);
291 }
292 }
293 bpf_enable_instrumentation();
294
295 return err;
296 }
297
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-01 2:38 [PATCH 1/2] bpf: Introduce cpu affinity for sockmap mrpre
2024-11-01 2:38 ` [PATCH 2/2] bpf: implement libbpf sockmap cpu affinity mrpre
2024-11-01 13:20 ` [PATCH 1/2] bpf: Introduce cpu affinity for sockmap kernel test robot
@ 2024-11-01 14:02 ` kernel test robot
2024-11-01 19:25 ` Andrii Nakryiko
2024-11-06 13:49 ` Simon Horman
4 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2024-11-01 14:02 UTC (permalink / raw)
To: mrpre, yonghong.song, john.fastabend, martin.lau, edumazet, jakub,
davem, dsahern, kuba, pabeni, netdev, bpf, linux-kernel
Cc: oe-kbuild-all, mrpre
Hi mrpre,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master net-next/main net/main linus/master v6.12-rc5 next-20241101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mrpre/bpf-implement-libbpf-sockmap-cpu-affinity/20241101-104144
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20241101023832.32404-1-mrpre%40163.com
patch subject: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
config: arc-randconfig-001-20241101 (https://download.01.org/0day-ci/archive/20241101/202411012135.447KNHZK-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411012135.447KNHZK-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411012135.447KNHZK-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/bpf/syscall.c: In function 'bpf_map_update_value':
>> kernel/bpf/syscall.c:254:24: error: too many arguments to function 'sock_map_update_elem_sys'
254 | return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
| ^~~~~~~~~~~~~~~~~~~~~~~~
In file included from kernel/bpf/syscall.c:4:
include/linux/bpf.h:3175:19: note: declared here
3175 | static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
| ^~~~~~~~~~~~~~~~~~~~~~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for GET_FREE_REGION
Depends on [n]: SPARSEMEM [=n]
Selected by [m]:
- RESOURCE_KUNIT_TEST [=m] && RUNTIME_TESTING_MENU [=y] && KUNIT [=m]
vim +/sock_map_update_elem_sys +254 kernel/bpf/syscall.c
240
241 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
242 void *key, void *value, __u64 flags, s32 target_cpu)
243 {
244 int err;
245 /* Need to create a kthread, thus must support schedule */
246 if (bpf_map_is_offloaded(map)) {
247 return bpf_map_offload_update_elem(map, key, value, flags);
248 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
249 map->map_type == BPF_MAP_TYPE_ARENA ||
250 map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
251 return map->ops->map_update_elem(map, key, value, flags);
252 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
253 map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> 254 return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
255 } else if (IS_FD_PROG_ARRAY(map)) {
256 return bpf_fd_array_map_update_elem(map, map_file, key, value,
257 flags);
258 }
259
260 bpf_disable_instrumentation();
261 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
262 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
263 err = bpf_percpu_hash_update(map, key, value, flags);
264 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
265 err = bpf_percpu_array_update(map, key, value, flags);
266 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
267 err = bpf_percpu_cgroup_storage_update(map, key, value,
268 flags);
269 } else if (IS_FD_ARRAY(map)) {
270 err = bpf_fd_array_map_update_elem(map, map_file, key, value,
271 flags);
272 } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
273 err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
274 flags);
275 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
276 /* rcu_read_lock() is not needed */
277 err = bpf_fd_reuseport_array_update_elem(map, key, value,
278 flags);
279 } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
280 map->map_type == BPF_MAP_TYPE_STACK ||
281 map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
282 err = map->ops->map_push_elem(map, value, flags);
283 } else {
284 err = bpf_obj_pin_uptrs(map->record, value);
285 if (!err) {
286 rcu_read_lock();
287 err = map->ops->map_update_elem(map, key, value, flags);
288 rcu_read_unlock();
289 if (err)
290 bpf_obj_unpin_uptrs(map->record, value);
291 }
292 }
293 bpf_enable_instrumentation();
294
295 return err;
296 }
297
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-01 2:38 [PATCH 1/2] bpf: Introduce cpu affinity for sockmap mrpre
` (2 preceding siblings ...)
2024-11-01 14:02 ` kernel test robot
@ 2024-11-01 19:25 ` Andrii Nakryiko
2024-11-04 6:12 ` Jiayuan Chen
2024-11-06 13:49 ` Simon Horman
4 siblings, 1 reply; 8+ messages in thread
From: Andrii Nakryiko @ 2024-11-01 19:25 UTC (permalink / raw)
To: mrpre
Cc: yonghong.song, john.fastabend, martin.lau, edumazet, jakub, davem,
dsahern, kuba, pabeni, netdev, bpf, linux-kernel
On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
>
> Why we need cpu affinity:
> Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> by binding user processes to specific CPUs. This avoids interference
> between processes and prevents impact from other processes.
>
> Sockmap, as an optimization to accelerate such proxy programs,
> currently lacks the ability to specify CPU affinity. The current
> implementation of sockmap handling backlog is based on workqueue,
> which operates by calling 'schedule_delayed_work()'. It's current
> implementation prefers to schedule on the local CPU, i.e., the CPU
> that handled the packet under softirq.
>
> For extremely high traffic with large numbers of packets,
> 'sk_psock_backlog' becomes a large loop.
>
> For multi-threaded programs with only one map, we expect different
> sockets to run on different CPUs. It is important to note that this
> feature is not a general performance optimization. Instead, it
> provides users with the ability to bind to specific CPU, allowing
> them to enhance overall operating system utilization based on their
> own system environments.
>
> Implementation:
> 1.When updating the sockmap, support passing a CPU parameter and
> save it to the psock.
> 2.When scheduling psock, determine which CPU to run on using the
> psock's CPU information.
> 3.For those sockmap without CPU affinity, keep original logic by using
> 'schedule_delayed_work()'.
>
> Performance Testing:
> 'client <-> sockmap proxy <-> server'
>
> Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> client bound to CPU6, performance without using CPU affinity is
> around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> '''
> [ 5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> [ ID] Interval Transfer Bitrate
> [ 5] 0.00-1.00 sec 3.95 GBytes 33.9 Gbits/sec
> [ 5] 1.00-2.00 sec 3.95 GBytes 34.0 Gbits/sec
> ......
> '''
>
> With using CPU affinity, the performance is close to direct connection
> (without any proxy).
> '''
> [ 5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> [ ID] Interval Transfer Bitrate
> [ 5] 0.00-1.00 sec 7.76 GBytes 66.6 Gbits/sec
> [ 5] 1.00-2.00 sec 7.76 GBytes 66.7 Gbits/sec
> ......
> '''
>
> Signed-off-by: Jiayuan Chen <mrpre@163.com>
> ---
> include/linux/bpf.h | 3 ++-
> include/linux/skmsg.h | 8 ++++++++
> include/uapi/linux/bpf.h | 4 ++++
> kernel/bpf/syscall.c | 23 +++++++++++++++++------
> net/core/skmsg.c | 11 +++++++----
> net/core/sock_map.c | 12 +++++++-----
> 6 files changed, 45 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c3ba4d475174..a56028c389e7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
>
> int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
> int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> + s32 target_cpu);
> int sock_map_bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr);
> int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index d9b03e0746e7..919425a92adf 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -117,6 +117,7 @@ struct sk_psock {
> struct delayed_work work;
> struct sock *sk_pair;
> struct rcu_work rwork;
> + s32 target_cpu;
> };
>
> int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> return !!psock->saved_data_ready;
> }
>
> +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> +{
> + if (psock->target_cpu != -1)
> + return psock->target_cpu;
> + return WORK_CPU_UNBOUND;
> +}
> +
> #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
>
> #define BPF_F_STRPARSER (1UL << 1)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f28b6527e815..2019a87b5d4a 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1509,6 +1509,10 @@ union bpf_attr {
> __aligned_u64 next_key;
> };
> __u64 flags;
> + union {
> + /* specify the CPU where the sockmap job run on */
> + __aligned_u64 target_cpu;
I have no opinion on the feature itself, I'll leave this to others.
But from a UAPI perspective:
a) why is this a u64 and not, say, an int?
b) maybe we should just encode this in flags and not have to update
all the UAPIs (including the libbpf side)? Just add a new
BPF_F_SOCKMAP_TARGET_CPU flag or something, and specify that the
highest 32 bits carry the CPU itself?
We have a similar scheme for some other helpers, so it's not *that*
unusual.
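Roughly, from the caller's side (the flag name and bit value below are
hypothetical, just to illustrate the encoding):
'''
/* hypothetical flag, not defined in any header yet */
#define BPF_F_SOCKMAP_TARGET_CPU	(1ULL << 5)

__u64 flags = BPF_NOEXIST | BPF_F_SOCKMAP_TARGET_CPU | ((__u64)cpu << 32);

bpf_map_update_elem(map_fd, &key, &sock_fd, flags);
'''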
> + };
> };
>
> struct { /* struct used by BPF_MAP_*_BATCH commands */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8254b2973157..95f719b9c3f3 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
> }
>
> static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> - void *key, void *value, __u64 flags)
> + void *key, void *value, __u64 flags, s32 target_cpu)
yeah, this is what I'm talking about. Think how ridiculous it is for a
generic "BPF map update" operation to accept the "target_cpu"
parameter.
pw-bot: cr
> {
> int err;
> -
why? don't break whitespace formatting
> /* Need to create a kthread, thus must support schedule */
> if (bpf_map_is_offloaded(map)) {
> return bpf_map_offload_update_elem(map, key, value, flags);
> @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> return map->ops->map_update_elem(map, key, value, flags);
> } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
> map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> - return sock_map_update_elem_sys(map, key, value, flags);
> + return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
> } else if (IS_FD_PROG_ARRAY(map)) {
> return bpf_fd_array_map_update_elem(map, map_file, key, value,
> flags);
[...]
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-01 19:25 ` Andrii Nakryiko
@ 2024-11-04 6:12 ` Jiayuan Chen
2024-11-06 21:43 ` Andrii Nakryiko
0 siblings, 1 reply; 8+ messages in thread
From: Jiayuan Chen @ 2024-11-04 6:12 UTC (permalink / raw)
To: Andrii Nakryiko; +Cc: netdev, bpf, linux-kernel, martin.lau
On Fri, Nov 01, 2024 at 12:25:51PM -0700, Andrii Nakryiko wrote:
> On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
> >
> > Why we need cpu affinity:
> > Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> > by binding user processes to specific CPUs. This avoids interference
> > between processes and prevents impact from other processes.
> >
> > Sockmap, as an optimization to accelerate such proxy programs,
> > currently lacks the ability to specify CPU affinity. The current
> > implementation of sockmap handling backlog is based on workqueue,
> > which operates by calling 'schedule_delayed_work()'. It's current
> > implementation prefers to schedule on the local CPU, i.e., the CPU
> > that handled the packet under softirq.
> >
> > For extremely high traffic with large numbers of packets,
> > 'sk_psock_backlog' becomes a large loop.
> >
> > For multi-threaded programs with only one map, we expect different
> > sockets to run on different CPUs. It is important to note that this
> > feature is not a general performance optimization. Instead, it
> > provides users with the ability to bind to specific CPU, allowing
> > them to enhance overall operating system utilization based on their
> > own system environments.
> >
> > Implementation:
> > 1.When updating the sockmap, support passing a CPU parameter and
> > save it to the psock.
> > 2.When scheduling psock, determine which CPU to run on using the
> > psock's CPU information.
> > 3.For those sockmap without CPU affinity, keep original logic by using
> > 'schedule_delayed_work()'.
> >
> > Performance Testing:
> > 'client <-> sockmap proxy <-> server'
> >
> > Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> > client bound to CPU6, performance without using CPU affinity is
> > around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> > '''
> > [ 5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> > [ ID] Interval Transfer Bitrate
> > [ 5] 0.00-1.00 sec 3.95 GBytes 33.9 Gbits/sec
> > [ 5] 1.00-2.00 sec 3.95 GBytes 34.0 Gbits/sec
> > ......
> > '''
> >
> > With using CPU affinity, the performance is close to direct connection
> > (without any proxy).
> > '''
> > [ 5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> > [ ID] Interval Transfer Bitrate
> > [ 5] 0.00-1.00 sec 7.76 GBytes 66.6 Gbits/sec
> > [ 5] 1.00-2.00 sec 7.76 GBytes 66.7 Gbits/sec
> > ......
> > '''
> >
> > Signed-off-by: Jiayuan Chen <mrpre@163.com>
> > ---
> > include/linux/bpf.h | 3 ++-
> > include/linux/skmsg.h | 8 ++++++++
> > include/uapi/linux/bpf.h | 4 ++++
> > kernel/bpf/syscall.c | 23 +++++++++++++++++------
> > net/core/skmsg.c | 11 +++++++----
> > net/core/sock_map.c | 12 +++++++-----
> > 6 files changed, 45 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index c3ba4d475174..a56028c389e7 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
> >
> > int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
> > int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> > -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> > +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> > + s32 target_cpu);
> > int sock_map_bpf_prog_query(const union bpf_attr *attr,
> > union bpf_attr __user *uattr);
> > int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index d9b03e0746e7..919425a92adf 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -117,6 +117,7 @@ struct sk_psock {
> > struct delayed_work work;
> > struct sock *sk_pair;
> > struct rcu_work rwork;
> > + s32 target_cpu;
> > };
> >
> > int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> > @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> > return !!psock->saved_data_ready;
> > }
> >
> > +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> > +{
> > + if (psock->target_cpu != -1)
> > + return psock->target_cpu;
> > + return WORK_CPU_UNBOUND;
> > +}
> > +
> > #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> >
> > #define BPF_F_STRPARSER (1UL << 1)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index f28b6527e815..2019a87b5d4a 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1509,6 +1509,10 @@ union bpf_attr {
> > __aligned_u64 next_key;
> > };
> > __u64 flags;
> > + union {
> > + /* specify the CPU where the sockmap job run on */
> > + __aligned_u64 target_cpu;
>
> I have no opinion on the feature itself, I'll leave this to others.
> But from UAPI perspective:
>
> a) why is this a u64 and not, say, int?
> b) maybe we should just specify this as flags and not have to update
> all the UAPIs (including libbpf-side)? Just add a new
> BPF_F_SOCKNMAP_TARGET_CPU flag or something, and specify that highest
> 32 bits specify the CPU itself?
>
> We have similar schema for some other helpers, so not *that* unusual.
>
Thank you for your response. I should clarify my thoughts:
My idea is to pass a user-space pointer, with a NULL pointer indicating
that the user has not provided anything. For example, when users call
the old interface 'bpf_map_update_elem', the new u64 field stays 0,
which means no CPU was specified. If a u32 or another plain value were
used instead, a value of 0 would be ambiguous: it could mean target
CPU 0 or that the user did not provide a value at all. That is why my
design passes a user-space pointer.
I also considered using the highest 32 bits of the flags as target_cpu,
but that approach runs into the same ambiguity. For programs using
libbpf I could of course default all of the higher 32 bits to 1 to
indicate that no CPU was specified, but that is incompatible with
programs not using libbpf. Another approach would be to let a value of
1 in the higher 32 bits mean CPU 0, 2 mean CPU 1, and so on, but this
seems odd and would require a helper to assist users in passing the
argument.
There is another method, like providing an extra 'attr', to replace the
passed 'target_cpu', which maintains the general nature of
'map_update_elem' interface, like:
'''
+struct extra_bpf_attr {
+ u32 target_cpu;
+};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
__u32 map_fd;
__aligned_u64 key;
union {
__aligned_u64 value;
__aligned_u64 next_key;
};
__u64 flags;
+struct extra_bpf_attr extra;
};
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
- void *key, void *value, __u64 flags)
+ void *key, void *value, __u64 flags, struct extra_bpf_attr *extra);
'''
> > + };
> > };
> >
> > struct { /* struct used by BPF_MAP_*_BATCH commands */
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 8254b2973157..95f719b9c3f3 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
> > }
> >
> > static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > - void *key, void *value, __u64 flags)
> > + void *key, void *value, __u64 flags, s32 target_cpu)
>
> yeah, this is what I'm talking about. Think how ridiculous it is for a
> generic "BPF map update" operation to accept the "target_cpu"
> parameter.
>
> pw-bot: cr
>
> > {
> > int err;
> > -
>
> why? don't break whitespace formatting
>
> > /* Need to create a kthread, thus must support schedule */
> > if (bpf_map_is_offloaded(map)) {
> > return bpf_map_offload_update_elem(map, key, value, flags);
> > @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > return map->ops->map_update_elem(map, key, value, flags);
> > } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
> > map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> > - return sock_map_update_elem_sys(map, key, value, flags);
> > + return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
> > } else if (IS_FD_PROG_ARRAY(map)) {
> > return bpf_fd_array_map_update_elem(map, map_file, key, value,
> > flags);
>
> [...]
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-04 6:12 ` Jiayuan Chen
@ 2024-11-06 21:43 ` Andrii Nakryiko
0 siblings, 0 replies; 8+ messages in thread
From: Andrii Nakryiko @ 2024-11-06 21:43 UTC (permalink / raw)
To: Jiayuan Chen; +Cc: netdev, bpf, linux-kernel, martin.lau
On Sun, Nov 3, 2024 at 10:12 PM Jiayuan Chen <mrpre@163.com> wrote:
>
> On Fri, Nov 01, 2024 at 12:25:51PM -0700, Andrii Nakryiko wrote:
> > On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
> > >
> > > Why we need cpu affinity:
> > > Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> > > by binding user processes to specific CPUs. This avoids interference
> > > between processes and prevents impact from other processes.
> > >
> > > Sockmap, as an optimization to accelerate such proxy programs,
> > > currently lacks the ability to specify CPU affinity. The current
> > > implementation of sockmap handling backlog is based on workqueue,
> > > which operates by calling 'schedule_delayed_work()'. It's current
> > > implementation prefers to schedule on the local CPU, i.e., the CPU
> > > that handled the packet under softirq.
> > >
> > > For extremely high traffic with large numbers of packets,
> > > 'sk_psock_backlog' becomes a large loop.
> > >
> > > For multi-threaded programs with only one map, we expect different
> > > sockets to run on different CPUs. It is important to note that this
> > > feature is not a general performance optimization. Instead, it
> > > provides users with the ability to bind to specific CPU, allowing
> > > them to enhance overall operating system utilization based on their
> > > own system environments.
> > >
> > > Implementation:
> > > 1.When updating the sockmap, support passing a CPU parameter and
> > > save it to the psock.
> > > 2.When scheduling psock, determine which CPU to run on using the
> > > psock's CPU information.
> > > 3.For those sockmap without CPU affinity, keep original logic by using
> > > 'schedule_delayed_work()'.
> > >
> > > Performance Testing:
> > > 'client <-> sockmap proxy <-> server'
> > >
> > > Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> > > client bound to CPU6, performance without using CPU affinity is
> > > around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> > > '''
> > > [ 5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> > > [ ID] Interval Transfer Bitrate
> > > [ 5] 0.00-1.00 sec 3.95 GBytes 33.9 Gbits/sec
> > > [ 5] 1.00-2.00 sec 3.95 GBytes 34.0 Gbits/sec
> > > ......
> > > '''
> > >
> > > With using CPU affinity, the performance is close to direct connection
> > > (without any proxy).
> > > '''
> > > [ 5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> > > [ ID] Interval Transfer Bitrate
> > > [ 5] 0.00-1.00 sec 7.76 GBytes 66.6 Gbits/sec
> > > [ 5] 1.00-2.00 sec 7.76 GBytes 66.7 Gbits/sec
> > > ......
> > > '''
> > >
> > > Signed-off-by: Jiayuan Chen <mrpre@163.com>
> > > ---
> > > include/linux/bpf.h | 3 ++-
> > > include/linux/skmsg.h | 8 ++++++++
> > > include/uapi/linux/bpf.h | 4 ++++
> > > kernel/bpf/syscall.c | 23 +++++++++++++++++------
> > > net/core/skmsg.c | 11 +++++++----
> > > net/core/sock_map.c | 12 +++++++-----
> > > 6 files changed, 45 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index c3ba4d475174..a56028c389e7 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
> > >
> > > int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
> > > int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> > > -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> > > +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> > > + s32 target_cpu);
> > > int sock_map_bpf_prog_query(const union bpf_attr *attr,
> > > union bpf_attr __user *uattr);
> > > int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > > index d9b03e0746e7..919425a92adf 100644
> > > --- a/include/linux/skmsg.h
> > > +++ b/include/linux/skmsg.h
> > > @@ -117,6 +117,7 @@ struct sk_psock {
> > > struct delayed_work work;
> > > struct sock *sk_pair;
> > > struct rcu_work rwork;
> > > + s32 target_cpu;
> > > };
> > >
> > > int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> > > @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> > > return !!psock->saved_data_ready;
> > > }
> > >
> > > +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> > > +{
> > > + if (psock->target_cpu != -1)
> > > + return psock->target_cpu;
> > > + return WORK_CPU_UNBOUND;
> > > +}
> > > +
> > > #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> > >
> > > #define BPF_F_STRPARSER (1UL << 1)
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index f28b6527e815..2019a87b5d4a 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -1509,6 +1509,10 @@ union bpf_attr {
> > > __aligned_u64 next_key;
> > > };
> > > __u64 flags;
> > > + union {
> > > + /* specify the CPU where the sockmap job run on */
> > > + __aligned_u64 target_cpu;
> >
> > I have no opinion on the feature itself, I'll leave this to others.
> > But from UAPI perspective:
> >
> > a) why is this a u64 and not, say, int?
> > b) maybe we should just specify this as flags and not have to update
> > all the UAPIs (including libbpf-side)? Just add a new
> > BPF_F_SOCKNMAP_TARGET_CPU flag or something, and specify that highest
> > 32 bits specify the CPU itself?
> >
> > We have similar schema for some other helpers, so not *that* unusual.
> >
> Thank you for your response. I think I should clarify my thoughts:
>
> My idea is to pass a user-space pointer, with the pointer being null
> to indicate that the user has not provided anything.For example, when
> users use the old interface 'bpf_map_update_elem' and pass in u64 of
> 0, it means that the user hasn't specified a CPU. If a u32 or another
> type of value is passed in, when it is 0, it's ambiguous whether this
> indicates target CPU 0 or that the user hasn't provided a value. So
> my design involves passing a user-space pointer.
>
> I also considered using the highest 32 bits of the flag as target_cpu, but
> this approach still encounters the ambiguity mentioned above. Of course
> for programs using libbpf, I can naturally init all the higher 32 bits
> default to 1 to indicate the user hasn't specified a CPU, but this is
> incompatible with programs not using libbpf. Another approach could be
> that a value of 1 for the higher 32 bits indicates CPU 0, and 2 indicates
> CPU 1..., but this seems odd and would require a helper to assist users
> in passing arguments.
See the BPF_F_SOCKMAP_TARGET_CPU flag point in my reply. You need an
extra flag that specifies that the upper 32 bits carry a CPU number.
There is no ambiguity: no flag - no CPU, flag - CPU (even if zero).
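On the kernel side that would look roughly like this (flag name is
hypothetical, matching the sketch earlier in the thread):
'''
	if (attr->flags & BPF_F_SOCKMAP_TARGET_CPU)
		target_cpu = attr->flags >> 32;	/* CPU 0 stays unambiguous */
	else
		target_cpu = -1;		/* no affinity requested */
'''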
>
> There is another method, like providing an extra 'attr', to replace the
> passed 'target_cpu', which maintains the general nature of
> 'map_update_elem' interface, like:
> '''
> +struct extra_bpf_attr {
> + u32 target_cpu;
> +};
> struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> __u32 map_fd;
> __aligned_u64 key;
> union {
> __aligned_u64 value;
> __aligned_u64 next_key;
> };
> __u64 flags;
> +struct extra_bpf_attr extra;
> };
>
> static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> - void *key, void *value, __u64 flags)
> + void *key, void *value, __u64 flags, struct bpf_attr_extra *extra);
> '''
>
> > > + };
> > > };
> > >
> > > struct { /* struct used by BPF_MAP_*_BATCH commands */
> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > index 8254b2973157..95f719b9c3f3 100644
> > > --- a/kernel/bpf/syscall.c
> > > +++ b/kernel/bpf/syscall.c
> > > @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
> > > }
> > >
> > > static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > > - void *key, void *value, __u64 flags)
> > > + void *key, void *value, __u64 flags, s32 target_cpu)
> >
> > yeah, this is what I'm talking about. Think how ridiculous it is for a
> > generic "BPF map update" operation to accept the "target_cpu"
> > parameter.
> >
> > pw-bot: cr
> >
> > > {
> > > int err;
> > > -
> >
> > why? don't break whitespace formatting
> >
> > > /* Need to create a kthread, thus must support schedule */
> > > if (bpf_map_is_offloaded(map)) {
> > > return bpf_map_offload_update_elem(map, key, value, flags);
> > > @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > > return map->ops->map_update_elem(map, key, value, flags);
> > > } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
> > > map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> > > - return sock_map_update_elem_sys(map, key, value, flags);
> > > + return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
> > > } else if (IS_FD_PROG_ARRAY(map)) {
> > > return bpf_fd_array_map_update_elem(map, map_file, key, value,
> > > flags);
> >
> > [...]
>
>
* Re: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
2024-11-01 2:38 [PATCH 1/2] bpf: Introduce cpu affinity for sockmap mrpre
` (3 preceding siblings ...)
2024-11-01 19:25 ` Andrii Nakryiko
@ 2024-11-06 13:49 ` Simon Horman
4 siblings, 0 replies; 8+ messages in thread
From: Simon Horman @ 2024-11-06 13:49 UTC (permalink / raw)
To: mrpre
Cc: yonghong.song, john.fastabend, martin.lau, edumazet, jakub, davem,
dsahern, kuba, pabeni, netdev, bpf, linux-kernel
On Fri, Nov 01, 2024 at 10:38:31AM +0800, mrpre wrote:
...
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 07d6aa4e39ef..36e9787c60de 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -465,7 +465,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
> }
>
> static int sock_map_update_common(struct bpf_map *map, u32 idx,
> - struct sock *sk, u64 flags)
> + struct sock *sk, u64 flags, s32 target_cpu)
> {
> struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
> struct sk_psock_link *link;
> @@ -490,6 +490,8 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
> psock = sk_psock(sk);
> WARN_ON_ONCE(!psock);
>
> + psock->target_cpu = target_cpu;
> +
> spin_lock_bh(&stab->lock);
> osk = stab->sks[idx];
> if (osk && flags == BPF_NOEXIST) {
Hi Jiayuan Chen,
The code immediately following the hunk above is:
ret = -EEXIST;
goto out_unlock;
} else if (!osk && flags == BPF_EXIST) {
ret = -ENOENT;
goto out_unlock;
}
And it seems that these gotos are the only code paths that lead to
out_unlock, which looks like this:
out_unlock:
spin_unlock_bh(&stab->lock);
if (psock)
sk_psock_put(sk, psock);
out_free:
sk_psock_free_link(link);
return ret;
}
As you can see, the code under out_unlock expects that psock may be NULL.
But the code added to this function by your patch dereferences it
unconditionally. This seems inconsistent.
Flagged by Smatch.
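One way to keep this consistent (just a sketch, not tested) would be to
set the new field only once psock is known to be non-NULL:
'''
	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);
	if (psock)
		psock->target_cpu = target_cpu;
'''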
...