* [PATCH v2 bpf-next 0/2] bpf: Add bpf_iter_cpumask
@ 2024-01-10 6:00 Yafang Shao
2024-01-10 6:00 ` [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao
2024-01-10 6:00 ` [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter Yafang Shao
0 siblings, 2 replies; 8+ messages in thread
From: Yafang Shao @ 2024-01-10 6:00 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, Yafang Shao
Three new kfuncs, namely bpf_iter_cpumask_{new,next,destroy}, have been
added for the new bpf_iter_cpumask functionality. These kfuncs enable
iterating over the CPUs in a cpumask, which in turn makes it convenient to
walk percpu data such as runqueues, system_group_pcpu, and more. In our
specific use case, we combine the cgroup iterator with the cpumask iterator
to traverse percpu data, subsequently exposing it to userspace through a
seq file.
Refer to the test cases in patch #2 for further context and examples.
Changes:
- v1 -> v2:
- Avoid changing cgroup subsystem (Tejun)
- Remove bpf_cpumask_set_from_pid(), and use bpf_cpumask_copy()
instead (Tejun)
- Use `int cpu;` field in bpf_iter_cpumask_kern (Andrii)
- bpf: Add new bpf helper bpf_for_each_cpu
https://lwn.net/ml/bpf/20230801142912.55078-1-laoar.shao@gmail.com/
Yafang Shao (2):
bpf: Add bpf_iter_cpumask kfuncs
selftests/bpf: Add selftests for cpumask iter
kernel/bpf/cpumask.c | 69 +++++++++
.../selftests/bpf/prog_tests/cpumask_iter.c | 134 ++++++++++++++++++
.../selftests/bpf/progs/cpumask_common.h | 3 +
.../selftests/bpf/progs/test_cpumask_iter.c | 62 ++++++++
4 files changed, 268 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c
--
2.30.1 (Apple Git-130)
* [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs
2024-01-10 6:00 [PATCH v2 bpf-next 0/2] bpf: Add bpf_iter_cpumask Yafang Shao
@ 2024-01-10 6:00 ` Yafang Shao
2024-01-10 17:50 ` Alexei Starovoitov
2024-01-10 6:00 ` [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter Yafang Shao
1 sibling, 1 reply; 8+ messages in thread
From: Yafang Shao @ 2024-01-10 6:00 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, Yafang Shao
Add three new kfuncs for bpf_iter_cpumask.
- bpf_iter_cpumask_new
- bpf_iter_cpumask_next
- bpf_iter_cpumask_destroy
These new kfuncs facilitate iterating over the CPUs in a cpumask, which in
turn makes it convenient to walk percpu data, such as runqueues,
psi_group_cpu, and more.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811..366ebe604b1d 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
return cpumask_weight(cpumask);
}
+struct bpf_iter_cpumask {
+ __u64 __opaque[2];
+} __aligned(8);
+
+struct bpf_iter_cpumask_kern {
+ struct cpumask *mask;
+ int cpu;
+} __aligned(8);
+
+/**
+ * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask
+ * @it: Pointer to the newly created bpf_iter_cpumask structure.
+ * @mask: The cpumask to be iterated over.
+ *
+ * This function initializes a new bpf_iter_cpumask structure for iterating over
+ * the specified CPU mask. It assigns the provided cpumask to the newly created
+ * bpf_iter_cpumask @it for subsequent iteration operations.
+ *
+ * Return: 0 on success; a negative error code on failure.
+ */
+__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, struct cpumask *mask)
+{
+ struct bpf_iter_cpumask_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
+ __alignof__(struct bpf_iter_cpumask));
+
+ kit->mask = mask;
+ kit->cpu = -1;
+ return 0;
+}
+
+/**
+ * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
+ * @it: The bpf_iter_cpumask structure for iteration.
+ *
+ * This function retrieves a pointer to the id of the next CPU within the
+ * specified bpf_iter_cpumask, allowing sequential access to the CPUs in
+ * the cpumask.
+ *
+ * Return: Pointer to the id of the next CPU in the cpumask, or NULL if
+ * there are no further CPUs.
+ */
+__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
+{
+ struct bpf_iter_cpumask_kern *kit = (void *)it;
+ struct cpumask *mask = kit->mask;
+ int cpu;
+
+ cpu = cpumask_next(kit->cpu, mask);
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ kit->cpu = cpu;
+ return &kit->cpu;
+}
+
+/**
+ * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
+ * @it: Pointer to the bpf_iter_cpumask structure to be destroyed.
+ */
+__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
+{
+}
+
__bpf_kfunc_end_defs();
BTF_SET8_START(cpumask_kfunc_btf_ids)
@@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
--
2.30.1 (Apple Git-130)
* [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter
2024-01-10 6:00 [PATCH v2 bpf-next 0/2] bpf: Add bpf_iter_cpumask Yafang Shao
2024-01-10 6:00 ` [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao
@ 2024-01-10 6:00 ` Yafang Shao
2024-01-10 17:52 ` Alexei Starovoitov
2024-01-15 1:52 ` kernel test robot
1 sibling, 2 replies; 8+ messages in thread
From: Yafang Shao @ 2024-01-10 6:00 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, Yafang Shao
Within the BPF program, we leverage the cgroup iterator to walk through
percpu runqueue data, specifically the 'nr_running' metric. Subsequently,
we expose this data to userspace by means of a seq file.
The cpumask to be iterated over is determined by the PID of a task:
- PID of the init task (PID 1)
CPU affinity is typically not set for the init task, so we can iterate
across all possible CPUs. However, in scenarios where CPU affinity has
been set for the init task, you should set the cpumask of your current
task to all-F and then iterate through all possible CPUs using the
current task instead.
- PID of a task with defined CPU affinity
The aim here is to iterate through a specific cpumask. This scenario
aligns with tasks residing within a cpuset cgroup.
- Invalid PID (e.g., PID -1)
No cpumask is available in this case.
The results are as follows:
#62/1 cpumask_iter/init_pid:OK
#62/2 cpumask_iter/invalid_pid:OK
#62/3 cpumask_iter/self_pid_one_cpu:OK
#62/4 cpumask_iter/self_pid_multi_cpus:OK
#62 cpumask_iter:OK
Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
.../selftests/bpf/prog_tests/cpumask_iter.c | 134 ++++++++++++++++++
.../selftests/bpf/progs/cpumask_common.h | 3 +
.../selftests/bpf/progs/test_cpumask_iter.c | 62 ++++++++
3 files changed, 199 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c
diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
new file mode 100644
index 000000000000..689ccc4d3c3b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Yafang Shao <laoar.shao@gmail.com> */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "test_cpumask_iter.skel.h"
+
+static void verify_percpu_data(struct bpf_link *link, int nr_cpu_exp, int nr_running_exp)
+{
+ int iter_fd, len, item, nr_running, psi_running, nr_cpus;
+ static char buf[128];
+ size_t left;
+ char *p;
+
+ iter_fd = bpf_iter_create(bpf_link__fd(link));
+ if (!ASSERT_GE(iter_fd, 0, "iter_fd"))
+ return;
+
+ memset(buf, 0, sizeof(buf));
+ left = ARRAY_SIZE(buf);
+ p = buf;
+ while ((len = read(iter_fd, p, left)) > 0) {
+ p += len;
+ left -= len;
+ }
+
+ item = sscanf(buf, "nr_running %u nr_cpus %u psi_running %u\n",
+ &nr_running, &nr_cpus, &psi_running);
+ if (nr_cpu_exp == -1) {
+ ASSERT_EQ(item, -1, "seq_format");
+ goto out;
+ }
+
+ ASSERT_EQ(item, 3, "seq_format");
+ ASSERT_GE(nr_running, nr_running_exp, "nr_running");
+ ASSERT_GE(psi_running, nr_running_exp, "psi_running");
+ ASSERT_EQ(nr_cpus, nr_cpu_exp, "nr_cpus");
+
+ /* read() after iter finishes should be ok. */
+ if (len == 0)
+ ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read");
+
+out:
+ close(iter_fd);
+}
+
+void test_cpumask_iter(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ int nr_possible, cgrp_fd, pid, err, cnt, i;
+ struct test_cpumask_iter *skel = NULL;
+ union bpf_iter_link_info linfo;
+ int cpu_ids[] = {1, 3, 4, 5};
+ struct bpf_link *link;
+ cpu_set_t set;
+
+ skel = test_cpumask_iter__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "test_cpumask_iter__open_and_load"))
+ return;
+
+ if (setup_cgroup_environment())
+ goto destroy;
+
+ /* Utilize the cgroup iter */
+ cgrp_fd = get_root_cgroup();
+ if (!ASSERT_GE(cgrp_fd, 0, "create cgrp"))
+ goto cleanup;
+
+ memset(&linfo, 0, sizeof(linfo));
+ linfo.cgroup.cgroup_fd = cgrp_fd;
+ linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+
+ link = bpf_program__attach_iter(skel->progs.cpu_cgroup, &opts);
+ if (!ASSERT_OK_PTR(link, "attach_iter"))
+ goto close_fd;
+
+ skel->bss->target_pid = 1;
+ /* The init task may have had its CPU affinity set */
+ err = sched_getaffinity(1, sizeof(set), &set);
+ if (!ASSERT_OK(err, "getaffinity"))
+ goto close_fd;
+
+ cnt = CPU_COUNT(&set);
+ nr_possible = bpf_num_possible_cpus();
+ if (test__start_subtest("init_pid"))
+ /* current task is running. */
+ verify_percpu_data(link, cnt, cnt == nr_possible ? 1 : 0);
+
+ skel->bss->target_pid = -1;
+ if (test__start_subtest("invalid_pid"))
+ verify_percpu_data(link, -1, -1);
+
+ pid = getpid();
+ skel->bss->target_pid = pid;
+ CPU_ZERO(&set);
+ CPU_SET(0, &set);
+ err = sched_setaffinity(pid, sizeof(set), &set);
+ if (!ASSERT_OK(err, "setaffinity"))
+ goto free_link;
+
+ if (test__start_subtest("self_pid_one_cpu"))
+ verify_percpu_data(link, 1, 1);
+
+ /* Assume there are at least 8 CPUs on the testbed */
+ if (nr_possible < 8)
+ goto free_link;
+
+ CPU_ZERO(&set);
+ /* Set the CPU affinity: 1,3-5 */
+ for (i = 0; i < ARRAY_SIZE(cpu_ids); i++)
+ CPU_SET(cpu_ids[i], &set);
+ err = sched_setaffinity(pid, sizeof(set), &set);
+ if (!ASSERT_OK(err, "setaffinity"))
+ goto free_link;
+
+ if (test__start_subtest("self_pid_multi_cpus"))
+ verify_percpu_data(link, ARRAY_SIZE(cpu_ids), 1);
+
+free_link:
+ bpf_link__destroy(link);
+close_fd:
+ close(cgrp_fd);
+cleanup:
+ cleanup_cgroup_environment();
+destroy:
+ test_cpumask_iter__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
index 0cd4aebb97cf..5f2f44eca4c4 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_common.h
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -55,6 +55,9 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym;
u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym;
u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
+int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, struct cpumask *mask) __ksym;
+int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym;
+void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym;
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/test_cpumask_iter.c b/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
new file mode 100644
index 000000000000..68ebfa0963c7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Yafang Shao <laoar.shao@gmail.com> */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "task_kfunc_common.h"
+#include "cpumask_common.h"
+
+extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
+extern const struct rq runqueues __ksym __weak;
+
+int target_pid;
+
+SEC("iter/cgroup")
+int BPF_PROG(cpu_cgroup, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+ u32 *cpu, nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+ struct psi_group_cpu *groupc;
+ struct bpf_cpumask *mask;
+ struct task_struct *p;
+ struct rq *rq;
+
+ /* epilogue */
+ if (cgrp == NULL)
+ return 0;
+
+ mask = bpf_cpumask_create();
+ if (!mask)
+ return 1;
+
+ p = bpf_task_from_pid(target_pid);
+ if (!p) {
+ bpf_cpumask_release(mask);
+ return 1;
+ }
+
+ bpf_cpumask_copy(mask, p->cpus_ptr);
+ bpf_for_each(cpumask, cpu, &mask->cpumask) {
+ rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
+ if (!rq)
+ continue;
+ nr_running += rq->nr_running;
+ nr_cpus += 1;
+
+ groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
+ if (!groupc)
+ continue;
+ bpf_probe_read_kernel(&tasks, sizeof(tasks), &groupc->tasks);
+ psi_nr_running += tasks[NR_RUNNING];
+ }
+ BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
+ nr_running, nr_cpus, psi_nr_running);
+
+ bpf_task_release(p);
+ bpf_cpumask_release(mask);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.30.1 (Apple Git-130)
* Re: [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs
2024-01-10 6:00 ` [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao
@ 2024-01-10 17:50 ` Alexei Starovoitov
2024-01-11 2:31 ` Yafang Shao
0 siblings, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2024-01-10 17:50 UTC (permalink / raw)
To: Yafang Shao
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Tejun Heo, bpf
On Tue, Jan 9, 2024 at 10:00 PM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, struct cpumask *mask)
> +{
> + struct bpf_iter_cpumask_kern *kit = (void *)it;
> +
> + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
> + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
> + __alignof__(struct bpf_iter_cpumask));
> +
> + kit->mask = mask;
> + kit->cpu = -1;
> + return 0;
> +}
> +
...
> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU)
this is not safe.
KF_RCU means that 'mask' pointer is valid in RCU CS,
but you're storing the pointer in the iterator that may leak
past RCU CS.
You need KF_RCU_PROTECTED at least.
KF_TRUSTED_ARGS might be necessary too. This needs to be thought through.
* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter
2024-01-10 6:00 ` [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter Yafang Shao
@ 2024-01-10 17:52 ` Alexei Starovoitov
2024-01-11 2:31 ` Yafang Shao
2024-01-15 1:52 ` kernel test robot
1 sibling, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2024-01-10 17:52 UTC (permalink / raw)
To: Yafang Shao
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Tejun Heo, bpf
On Tue, Jan 9, 2024 at 10:00 PM Yafang Shao <laoar.shao@gmail.com> wrote:
> +
> +SEC("iter/cgroup")
> +int BPF_PROG(cpu_cgroup, struct bpf_iter_meta *meta, struct cgroup *cgrp)
> +{
> + u32 *cpu, nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
> + unsigned int tasks[NR_PSI_TASK_COUNTS];
> + struct psi_group_cpu *groupc;
> + struct bpf_cpumask *mask;
> + struct task_struct *p;
> + struct rq *rq;
> +
> + /* epilogue */
> + if (cgrp == NULL)
> + return 0;
> +
> + mask = bpf_cpumask_create();
> + if (!mask)
> + return 1;
> +
> + p = bpf_task_from_pid(target_pid);
> + if (!p) {
> + bpf_cpumask_release(mask);
> + return 1;
> + }
> +
> + bpf_cpumask_copy(mask, p->cpus_ptr);
> + bpf_for_each(cpumask, cpu, &mask->cpumask) {
> + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
> + if (!rq)
> + continue;
> + nr_running += rq->nr_running;
> + nr_cpus += 1;
> +
> + groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
> + if (!groupc)
> + continue;
> + bpf_probe_read_kernel(&tasks, sizeof(tasks), &groupc->tasks);
> + psi_nr_running += tasks[NR_RUNNING];
> + }
Instead of probe_read_kernel (which is not fast) please use
bpf_rdonly_cast() and access groups->tasks.
array access should already be recognized by the verifier, but if not,
let's fix the verifier instead of falling back to probe_read.
* Re: [PATCH v2 bpf-next 1/2] bpf: Add bpf_iter_cpumask kfuncs
2024-01-10 17:50 ` Alexei Starovoitov
@ 2024-01-11 2:31 ` Yafang Shao
0 siblings, 0 replies; 8+ messages in thread
From: Yafang Shao @ 2024-01-11 2:31 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Tejun Heo, bpf
On Thu, Jan 11, 2024 at 1:50 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jan 9, 2024 at 10:00 PM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, struct cpumask *mask)
> > +{
> > + struct bpf_iter_cpumask_kern *kit = (void *)it;
> > +
> > + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
> > + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
> > + __alignof__(struct bpf_iter_cpumask));
> > +
> > + kit->mask = mask;
> > + kit->cpu = -1;
> > + return 0;
> > +}
> > +
>
> ...
>
> > +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU)
>
> this is not safe.
> KF_RCU means that 'mask' pointer is valid in RCU CS,
> but you're storing the pointer in the iterator that may leak
> past RCU CS.
>
> You need KF_RCU_PROTECTED at least.
> KF_TRUSTED_ARGS might be necessary too. This needs to be thought through.
Thanks for your detailed explanation. I will analyze it carefully.
--
Regards
Yafang
* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter
2024-01-10 17:52 ` Alexei Starovoitov
@ 2024-01-11 2:31 ` Yafang Shao
0 siblings, 0 replies; 8+ messages in thread
From: Yafang Shao @ 2024-01-11 2:31 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Tejun Heo, bpf
On Thu, Jan 11, 2024 at 1:52 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jan 9, 2024 at 10:00 PM Yafang Shao <laoar.shao@gmail.com> wrote:
> > +
> > +SEC("iter/cgroup")
> > +int BPF_PROG(cpu_cgroup, struct bpf_iter_meta *meta, struct cgroup *cgrp)
> > +{
> > + u32 *cpu, nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
> > + unsigned int tasks[NR_PSI_TASK_COUNTS];
> > + struct psi_group_cpu *groupc;
> > + struct bpf_cpumask *mask;
> > + struct task_struct *p;
> > + struct rq *rq;
> > +
> > + /* epilogue */
> > + if (cgrp == NULL)
> > + return 0;
> > +
> > + mask = bpf_cpumask_create();
> > + if (!mask)
> > + return 1;
> > +
> > + p = bpf_task_from_pid(target_pid);
> > + if (!p) {
> > + bpf_cpumask_release(mask);
> > + return 1;
> > + }
> > +
> > + bpf_cpumask_copy(mask, p->cpus_ptr);
> > + bpf_for_each(cpumask, cpu, &mask->cpumask) {
> > + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
> > + if (!rq)
> > + continue;
> > + nr_running += rq->nr_running;
> > + nr_cpus += 1;
> > +
> > + groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
> > + if (!groupc)
> > + continue;
> > + bpf_probe_read_kernel(&tasks, sizeof(tasks), &groupc->tasks);
> > + psi_nr_running += tasks[NR_RUNNING];
> > + }
>
> Instead of probe_read_kernel (which is not fast) please use
> bpf_rdonly_cast() and access groups->tasks.
> array should already be recognized by the verifier, but if not let's
> fix the verifier instead of fallback to probe_read.
Thanks for your suggestion.
Will do it.
--
Regards
Yafang
* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter
2024-01-10 6:00 ` [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter Yafang Shao
2024-01-10 17:52 ` Alexei Starovoitov
@ 2024-01-15 1:52 ` kernel test robot
1 sibling, 0 replies; 8+ messages in thread
From: kernel test robot @ 2024-01-15 1:52 UTC (permalink / raw)
To: Yafang Shao, ast, daniel, john.fastabend, andrii, martin.lau,
song, yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: oe-kbuild-all, bpf, Yafang Shao
Hi Yafang,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Yafang-Shao/bpf-Add-bpf_iter_cpumask-kfuncs/20240110-140322
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20240110060037.4202-3-laoar.shao%40gmail.com
patch subject: [PATCH v2 bpf-next 2/2] selftests/bpf: Add selftests for cpumask iter
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240115/202401150914.Rcl50Ct9-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401150914.Rcl50Ct9-lkp@intel.com/
All errors (new ones prefixed by >>):
>> progs/test_cpumask_iter.c:20:21: error: use of undeclared identifier 'NR_PSI_TASK_COUNTS'
20 | unsigned int tasks[NR_PSI_TASK_COUNTS];
| ^
>> progs/test_cpumask_iter.c:41:2: error: assigning to 'u32 *' (aka 'unsigned int *') from 'int *' converts between pointers to integer types with different sign [-Werror,-Wpointer-sign]
41 | bpf_for_each(cpumask, cpu, &mask->cpumask) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/tools/include/bpf/bpf_helpers.h:348:10: note: expanded from macro 'bpf_for_each'
348 | (((cur) = bpf_iter_##type##_next(&___it))); \
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> progs/test_cpumask_iter.c:51:55: error: incomplete definition of type 'struct psi_group_cpu'
51 | bpf_probe_read_kernel(&tasks, sizeof(tasks), &groupc->tasks);
| ~~~~~~^
progs/test_cpumask_iter.c:11:21: note: forward declaration of 'struct psi_group_cpu'
11 | extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
| ^
>> progs/test_cpumask_iter.c:52:27: error: use of undeclared identifier 'NR_RUNNING'; did you mean 'T_RUNNING'?
52 | psi_nr_running += tasks[NR_RUNNING];
| ^~~~~~~~~~
| T_RUNNING
/tools/include/vmlinux.h:11216:3: note: 'T_RUNNING' declared here
11216 | T_RUNNING = 0,
| ^
4 errors generated.
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki