* [PATCH v3 bpf-next 0/3] bpf: Add bpf_iter_cpumask
@ 2024-01-17 2:48 Yafang Shao
2024-01-17 2:48 ` [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Yafang Shao @ 2024-01-17 2:48 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, lkp, Yafang Shao
Three new kfuncs, namely bpf_iter_cpumask_{new,next,destroy}, have been
added to implement the new bpf_iter_cpumask functionality. These kfuncs
make it possible to iterate over the CPUs in a cpumask and, for each CPU,
access percpu data such as runqueues, system_group_pcpu, and more.
In our specific use case, we leverage the cgroup iterator to traverse
percpu data, subsequently exposing it to userspace through a seq file.
Refer to the test cases in patch #3 for further context and examples.
Changes:
- v2 -> v3:
- Define KF_RCU_PROTECTED for bpf_iter_cpumask_new (Alexei)
- Code improvement in selftests
- Fix build error in selftest due to CONFIG_PSI=n
reported by kernel test robot <lkp@intel.com>
- v1 -> v2:
- Avoid changing cgroup subsystem (Tejun)
- Remove bpf_cpumask_set_from_pid(), and use bpf_cpumask_copy()
instead (Tejun)
- Use `int cpu;` field in bpf_iter_cpumask_kern (Andrii)
- v1: bpf: Add new bpf helper bpf_for_each_cpu
https://lwn.net/ml/bpf/20230801142912.55078-1-laoar.shao@gmail.com/
Yafang Shao (3):
bpf: Add bpf_iter_cpumask kfuncs
bpf, doc: Add document for cpumask iter
selftests/bpf: Add selftests for cpumask iter
Documentation/bpf/cpumasks.rst | 17 +++
kernel/bpf/cpumask.c | 69 +++++++++
tools/testing/selftests/bpf/config | 1 +
.../selftests/bpf/prog_tests/cpumask_iter.c | 134 ++++++++++++++++++
.../selftests/bpf/progs/cpumask_common.h | 3 +
.../selftests/bpf/progs/test_cpumask_iter.c | 56 ++++++++
6 files changed, 280 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c
--
2.39.1
^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs
2024-01-17 2:48 [PATCH v3 bpf-next 0/3] bpf: Add bpf_iter_cpumask Yafang Shao
@ 2024-01-17 2:48 ` Yafang Shao
2024-01-18 22:27 ` Yonghong Song
2024-01-17 2:48 ` [PATCH v3 bpf-next 2/3] bpf, doc: Add document for cpumask iter Yafang Shao
2024-01-17 2:48 ` [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests " Yafang Shao
2 siblings, 1 reply; 10+ messages in thread
From: Yafang Shao @ 2024-01-17 2:48 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, lkp, Yafang Shao

Add three new kfuncs for bpf_iter_cpumask.

- bpf_iter_cpumask_new
  It is defined with KF_RCU_PROTECTED and KF_RCU. KF_RCU_PROTECTED is
  set because the kfunc must be called under RCU protection; KF_RCU is
  set because the cpumask argument must be an RCU-trusted pointer, such
  as task->cpus_ptr.
- bpf_iter_cpumask_next
- bpf_iter_cpumask_destroy

These new kfuncs facilitate the iteration of percpu data, such as
runqueues, psi_cgroup_cpu, and more.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811..1840e48e6142 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
 	return cpumask_weight(cpumask);
 }
 
+struct bpf_iter_cpumask {
+	__u64 __opaque[2];
+} __aligned(8);
+
+struct bpf_iter_cpumask_kern {
+	const struct cpumask *mask;
+	int cpu;
+} __aligned(8);
+
+/**
+ * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask
+ * @it: The new bpf_iter_cpumask to be created.
+ * @mask: The cpumask to be iterated over.
+ *
+ * This function initializes a new bpf_iter_cpumask structure for iterating over
+ * the specified CPU mask. It assigns the provided cpumask to the newly created
+ * bpf_iter_cpumask @it for subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, an error code is returned.
+ */
+__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask)
+{
+	struct bpf_iter_cpumask_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) !=
+		     __alignof__(struct bpf_iter_cpumask));
+
+	kit->mask = mask;
+	kit->cpu = -1;
+	return 0;
+}
+
+/**
+ * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask
+ * @it: The bpf_iter_cpumask
+ *
+ * This function retrieves a pointer to the number of the next CPU within the
+ * specified bpf_iter_cpumask. It allows sequential access to CPUs within the
+ * cpumask. If there are no further CPUs available, it returns NULL.
+ *
+ * Returns a pointer to the number of the next CPU in the cpumask or NULL if no
+ * further CPUs are available.
+ */
+__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it)
+{
+	struct bpf_iter_cpumask_kern *kit = (void *)it;
+	const struct cpumask *mask = kit->mask;
+	int cpu;
+
+	cpu = cpumask_next(kit->cpu, mask);
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+
+	kit->cpu = cpu;
+	return &kit->cpu;
+}
+
+/**
+ * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask
+ * @it: The bpf_iter_cpumask to be destroyed.
+ */
+__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it)
+{
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
@@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {
-- 
2.39.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs 2024-01-17 2:48 ` [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao @ 2024-01-18 22:27 ` Yonghong Song 2024-01-19 0:51 ` Hou Tao 2024-01-19 9:50 ` Yafang Shao 0 siblings, 2 replies; 10+ messages in thread From: Yonghong Song @ 2024-01-18 22:27 UTC (permalink / raw) To: Yafang Shao, ast, daniel, john.fastabend, andrii, martin.lau, song, kpsingh, sdf, haoluo, jolsa, tj Cc: bpf, lkp On 1/16/24 6:48 PM, Yafang Shao wrote: > Add three new kfuncs for bpf_iter_cpumask. > - bpf_iter_cpumask_new > It is defined with KF_RCU_PROTECTED and KF_RCU. > KF_RCU_PROTECTED is defined because we must use it under the > protection of RCU. > KF_RCU is defined because the cpumask must be a RCU trusted pointer > such as task->cpus_ptr. I am not sure whether we need both or not. KF_RCU_PROTECTED means the function call needs within the rcu cs. KF_RCU means the argument usage needs within the rcu cs. We only need one of them (preferrably KF_RCU). > - bpf_iter_cpumask_next > - bpf_iter_cpumask_destroy > > These new kfuncs facilitate the iteration of percpu data, such as > runqueues, psi_cgroup_cpu, and more. > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com> > --- > kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 69 insertions(+) > > diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c > index 2e73533a3811..1840e48e6142 100644 > --- a/kernel/bpf/cpumask.c > +++ b/kernel/bpf/cpumask.c > @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) > return cpumask_weight(cpumask); > } > > +struct bpf_iter_cpumask { > + __u64 __opaque[2]; > +} __aligned(8); > + > +struct bpf_iter_cpumask_kern { > + const struct cpumask *mask; > + int cpu; > +} __aligned(8); > + > +/** > + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask > + * @it: The new bpf_iter_cpumask to be created. 
> + * @mask: The cpumask to be iterated over. > + * > + * This function initializes a new bpf_iter_cpumask structure for iterating over > + * the specified CPU mask. It assigns the provided cpumask to the newly created > + * bpf_iter_cpumask @it for subsequent iteration operations. > + * > + * On success, 0 is returen. On failure, ERR is returned. > + */ > +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) > +{ > + struct bpf_iter_cpumask_kern *kit = (void *)it; > + > + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask)); > + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) != > + __alignof__(struct bpf_iter_cpumask)); > + > + kit->mask = mask; > + kit->cpu = -1; > + return 0; > +} We have problem here. Let us say bpf_iter_cpumask_new() is called inside rcu cs. Once the control goes out of rcu cs, 'mask' could be freed, right? Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs as bpf_iter_cpumask_new(). But such a requirement seems odd. I think we can do things similar to bpf_iter_task_vma. You can allocate memory with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. This way, you do not need to worry about potential use-after-free issue. The memory can be freed with bpf_iter_cpumask_destroy(). > + > +/** > + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask > + * @it: The bpf_iter_cpumask > + * > + * This function retrieves a pointer to the number of the next CPU within the > + * specified bpf_iter_cpumask. It allows sequential access to CPUs within the > + * cpumask. If there are no further CPUs available, it returns NULL. > + * > + * Returns a pointer to the number of the next CPU in the cpumask or NULL if no > + * further CPUs. 
> + */ > +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) > +{ > + struct bpf_iter_cpumask_kern *kit = (void *)it; > + const struct cpumask *mask = kit->mask; > + int cpu; > + > + cpu = cpumask_next(kit->cpu, mask); > + if (cpu >= nr_cpu_ids) > + return NULL; > + > + kit->cpu = cpu; > + return &kit->cpu; > +} > + > +/** > + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask > + * @it: The bpf_iter_cpumask to be destroyed. > + */ > +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) > +{ > +} > + > __bpf_kfunc_end_defs(); > > BTF_SET8_START(cpumask_kfunc_btf_ids) > @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) > BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) > BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) > BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) > +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU) > +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL) > +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY) > BTF_SET8_END(cpumask_kfunc_btf_ids) > > static const struct btf_kfunc_id_set cpumask_kfunc_set = { ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs 2024-01-18 22:27 ` Yonghong Song @ 2024-01-19 0:51 ` Hou Tao 2024-01-19 3:45 ` Yonghong Song 2024-01-19 9:50 ` Yafang Shao 1 sibling, 1 reply; 10+ messages in thread From: Hou Tao @ 2024-01-19 0:51 UTC (permalink / raw) To: Yonghong Song Cc: Yafang Shao, tj, andrii, kpsingh, song, martin.lau, daniel, ast, bpf, lkp, john.fastabend, sdf, haoluo, jolsa Hi, On 1/19/2024 6:27 AM, Yonghong Song wrote: > > On 1/16/24 6:48 PM, Yafang Shao wrote: >> Add three new kfuncs for bpf_iter_cpumask. >> - bpf_iter_cpumask_new >> It is defined with KF_RCU_PROTECTED and KF_RCU. >> KF_RCU_PROTECTED is defined because we must use it under the >> protection of RCU. >> KF_RCU is defined because the cpumask must be a RCU trusted pointer >> such as task->cpus_ptr. > > I am not sure whether we need both or not. > > KF_RCU_PROTECTED means the function call needs within the rcu cs. > KF_RCU means the argument usage needs within the rcu cs. > We only need one of them (preferrably KF_RCU). > >> - bpf_iter_cpumask_next >> - bpf_iter_cpumask_destroy >> >> These new kfuncs facilitate the iteration of percpu data, such as >> runqueues, psi_cgroup_cpu, and more. 
>> >> Signed-off-by: Yafang Shao <laoar.shao@gmail.com> >> --- >> kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 69 insertions(+) >> >> diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c >> index 2e73533a3811..1840e48e6142 100644 >> --- a/kernel/bpf/cpumask.c >> +++ b/kernel/bpf/cpumask.c >> @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct >> cpumask *cpumask) >> return cpumask_weight(cpumask); >> } >> +struct bpf_iter_cpumask { >> + __u64 __opaque[2]; >> +} __aligned(8); >> + >> +struct bpf_iter_cpumask_kern { >> + const struct cpumask *mask; >> + int cpu; >> +} __aligned(8); >> + >> +/** >> + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a >> specified cpumask >> + * @it: The new bpf_iter_cpumask to be created. >> + * @mask: The cpumask to be iterated over. >> + * >> + * This function initializes a new bpf_iter_cpumask structure for >> iterating over >> + * the specified CPU mask. It assigns the provided cpumask to the >> newly created >> + * bpf_iter_cpumask @it for subsequent iteration operations. >> + * >> + * On success, 0 is returen. On failure, ERR is returned. >> + */ >> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, >> const struct cpumask *mask) >> +{ >> + struct bpf_iter_cpumask_kern *kit = (void *)it; >> + >> + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > >> sizeof(struct bpf_iter_cpumask)); >> + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) != >> + __alignof__(struct bpf_iter_cpumask)); >> + >> + kit->mask = mask; >> + kit->cpu = -1; >> + return 0; >> +} > > We have problem here. Let us say bpf_iter_cpumask_new() is called > inside rcu cs. > Once the control goes out of rcu cs, 'mask' could be freed, right? > Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs > as bpf_iter_cpumask_new(). But such a requirement seems odd. 
So the case is possible when using bpf_iter_cpumask_new() and bpf_iter_cpumask_next() in sleepable program and these two kfuncs are used in two different rcu_read_lock/rcu_read_unlock code blocks, right ? > > I think we can do things similar to bpf_iter_task_vma. You can > allocate memory > with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. > This > way, you do not need to worry about potential use-after-free issue. > The memory can be freed with bpf_iter_cpumask_destroy(). > >> + >> +/** >> + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask >> + * @it: The bpf_iter_cpumask >> + * >> + * This function retrieves a pointer to the number of the next CPU >> within the >> + * specified bpf_iter_cpumask. It allows sequential access to CPUs >> within the >> + * cpumask. If there are no further CPUs available, it returns NULL. >> + * >> + * Returns a pointer to the number of the next CPU in the cpumask or >> NULL if no >> + * further CPUs. >> + */ >> +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) >> +{ >> + struct bpf_iter_cpumask_kern *kit = (void *)it; >> + const struct cpumask *mask = kit->mask; >> + int cpu; >> + >> + cpu = cpumask_next(kit->cpu, mask); >> + if (cpu >= nr_cpu_ids) >> + return NULL; >> + >> + kit->cpu = cpu; >> + return &kit->cpu; >> +} >> + >> +/** >> + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask >> + * @it: The bpf_iter_cpumask to be destroyed. 
>> + */ >> +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) >> +{ >> +} >> + >> __bpf_kfunc_end_defs(); >> BTF_SET8_START(cpumask_kfunc_btf_ids) >> @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) >> BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) >> BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) >> BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) >> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | >> KF_RCU_PROTECTED | KF_RCU) >> +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL) >> +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY) >> BTF_SET8_END(cpumask_kfunc_btf_ids) >> static const struct btf_kfunc_id_set cpumask_kfunc_set = { > > . ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs 2024-01-19 0:51 ` Hou Tao @ 2024-01-19 3:45 ` Yonghong Song 0 siblings, 0 replies; 10+ messages in thread From: Yonghong Song @ 2024-01-19 3:45 UTC (permalink / raw) To: Hou Tao Cc: Yafang Shao, tj, andrii, kpsingh, song, martin.lau, daniel, ast, bpf, lkp, john.fastabend, sdf, haoluo, jolsa On 1/18/24 4:51 PM, Hou Tao wrote: > Hi, > > On 1/19/2024 6:27 AM, Yonghong Song wrote: >> On 1/16/24 6:48 PM, Yafang Shao wrote: >>> Add three new kfuncs for bpf_iter_cpumask. >>> - bpf_iter_cpumask_new >>> It is defined with KF_RCU_PROTECTED and KF_RCU. >>> KF_RCU_PROTECTED is defined because we must use it under the >>> protection of RCU. >>> KF_RCU is defined because the cpumask must be a RCU trusted pointer >>> such as task->cpus_ptr. >> I am not sure whether we need both or not. >> >> KF_RCU_PROTECTED means the function call needs within the rcu cs. >> KF_RCU means the argument usage needs within the rcu cs. >> We only need one of them (preferrably KF_RCU). >> >>> - bpf_iter_cpumask_next >>> - bpf_iter_cpumask_destroy >>> >>> These new kfuncs facilitate the iteration of percpu data, such as >>> runqueues, psi_cgroup_cpu, and more. 
>>> >>> Signed-off-by: Yafang Shao <laoar.shao@gmail.com> >>> --- >>> kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 69 insertions(+) >>> >>> diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c >>> index 2e73533a3811..1840e48e6142 100644 >>> --- a/kernel/bpf/cpumask.c >>> +++ b/kernel/bpf/cpumask.c >>> @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct >>> cpumask *cpumask) >>> return cpumask_weight(cpumask); >>> } >>> +struct bpf_iter_cpumask { >>> + __u64 __opaque[2]; >>> +} __aligned(8); >>> + >>> +struct bpf_iter_cpumask_kern { >>> + const struct cpumask *mask; >>> + int cpu; >>> +} __aligned(8); >>> + >>> +/** >>> + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a >>> specified cpumask >>> + * @it: The new bpf_iter_cpumask to be created. >>> + * @mask: The cpumask to be iterated over. >>> + * >>> + * This function initializes a new bpf_iter_cpumask structure for >>> iterating over >>> + * the specified CPU mask. It assigns the provided cpumask to the >>> newly created >>> + * bpf_iter_cpumask @it for subsequent iteration operations. >>> + * >>> + * On success, 0 is returen. On failure, ERR is returned. >>> + */ >>> +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, >>> const struct cpumask *mask) >>> +{ >>> + struct bpf_iter_cpumask_kern *kit = (void *)it; >>> + >>> + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > >>> sizeof(struct bpf_iter_cpumask)); >>> + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) != >>> + __alignof__(struct bpf_iter_cpumask)); >>> + >>> + kit->mask = mask; >>> + kit->cpu = -1; >>> + return 0; >>> +} >> We have problem here. Let us say bpf_iter_cpumask_new() is called >> inside rcu cs. >> Once the control goes out of rcu cs, 'mask' could be freed, right? >> Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs >> as bpf_iter_cpumask_new(). But such a requirement seems odd. 
> So the case is possible when using bpf_iter_cpumask_new() and > bpf_iter_cpumask_next() in sleepable program and these two kfuncs are > used in two different rcu_read_lock/rcu_read_unlock code blocks, right ? Right, or bpf_iter_cpumask_new() inside rcu cs and bpf_iter_cpumask_next() not. >> I think we can do things similar to bpf_iter_task_vma. You can >> allocate memory >> with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. >> This >> way, you do not need to worry about potential use-after-free issue. >> The memory can be freed with bpf_iter_cpumask_destroy(). >> >>> + >>> +/** >>> + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask >>> + * @it: The bpf_iter_cpumask >>> + * >>> + * This function retrieves a pointer to the number of the next CPU >>> within the >>> + * specified bpf_iter_cpumask. It allows sequential access to CPUs >>> within the >>> + * cpumask. If there are no further CPUs available, it returns NULL. >>> + * >>> + * Returns a pointer to the number of the next CPU in the cpumask or >>> NULL if no >>> + * further CPUs. >>> + */ >>> +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) >>> +{ >>> + struct bpf_iter_cpumask_kern *kit = (void *)it; >>> + const struct cpumask *mask = kit->mask; >>> + int cpu; >>> + >>> + cpu = cpumask_next(kit->cpu, mask); >>> + if (cpu >= nr_cpu_ids) >>> + return NULL; >>> + >>> + kit->cpu = cpu; >>> + return &kit->cpu; >>> +} >>> + >>> +/** >>> + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask >>> + * @it: The bpf_iter_cpumask to be destroyed. 
>>> + */ >>> +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) >>> +{ >>> +} >>> + >>> __bpf_kfunc_end_defs(); >>> BTF_SET8_START(cpumask_kfunc_btf_ids) >>> @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) >>> BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) >>> BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) >>> BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) >>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | >>> KF_RCU_PROTECTED | KF_RCU) >>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL) >>> +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY) >>> BTF_SET8_END(cpumask_kfunc_btf_ids) >>> static const struct btf_kfunc_id_set cpumask_kfunc_set = { >> . ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs 2024-01-18 22:27 ` Yonghong Song 2024-01-19 0:51 ` Hou Tao @ 2024-01-19 9:50 ` Yafang Shao 1 sibling, 0 replies; 10+ messages in thread From: Yafang Shao @ 2024-01-19 9:50 UTC (permalink / raw) To: Yonghong Song Cc: ast, daniel, john.fastabend, andrii, martin.lau, song, kpsingh, sdf, haoluo, jolsa, tj, bpf, lkp On Fri, Jan 19, 2024 at 6:27 AM Yonghong Song <yonghong.song@linux.dev> wrote: > > > On 1/16/24 6:48 PM, Yafang Shao wrote: > > Add three new kfuncs for bpf_iter_cpumask. > > - bpf_iter_cpumask_new > > It is defined with KF_RCU_PROTECTED and KF_RCU. > > KF_RCU_PROTECTED is defined because we must use it under the > > protection of RCU. > > KF_RCU is defined because the cpumask must be a RCU trusted pointer > > such as task->cpus_ptr. > > I am not sure whether we need both or not. > > KF_RCU_PROTECTED means the function call needs within the rcu cs. > KF_RCU means the argument usage needs within the rcu cs. > We only need one of them (preferrably KF_RCU). As you explained below, KF_RCU_PROTECTED is actually for bpf_iter_cpumask_next(). > > > - bpf_iter_cpumask_next > > - bpf_iter_cpumask_destroy > > > > These new kfuncs facilitate the iteration of percpu data, such as > > runqueues, psi_cgroup_cpu, and more. 
> > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com> > > --- > > kernel/bpf/cpumask.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 69 insertions(+) > > > > diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c > > index 2e73533a3811..1840e48e6142 100644 > > --- a/kernel/bpf/cpumask.c > > +++ b/kernel/bpf/cpumask.c > > @@ -422,6 +422,72 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) > > return cpumask_weight(cpumask); > > } > > > > +struct bpf_iter_cpumask { > > + __u64 __opaque[2]; > > +} __aligned(8); > > + > > +struct bpf_iter_cpumask_kern { > > + const struct cpumask *mask; > > + int cpu; > > +} __aligned(8); > > + > > +/** > > + * bpf_iter_cpumask_new() - Create a new bpf_iter_cpumask for a specified cpumask > > + * @it: The new bpf_iter_cpumask to be created. > > + * @mask: The cpumask to be iterated over. > > + * > > + * This function initializes a new bpf_iter_cpumask structure for iterating over > > + * the specified CPU mask. It assigns the provided cpumask to the newly created > > + * bpf_iter_cpumask @it for subsequent iteration operations. > > + * > > + * On success, 0 is returen. On failure, ERR is returned. > > + */ > > +__bpf_kfunc int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) > > +{ > > + struct bpf_iter_cpumask_kern *kit = (void *)it; > > + > > + BUILD_BUG_ON(sizeof(struct bpf_iter_cpumask_kern) > sizeof(struct bpf_iter_cpumask)); > > + BUILD_BUG_ON(__alignof__(struct bpf_iter_cpumask_kern) != > > + __alignof__(struct bpf_iter_cpumask)); > > + > > + kit->mask = mask; > > + kit->cpu = -1; > > + return 0; > > +} > > We have problem here. Let us say bpf_iter_cpumask_new() is called inside rcu cs. > Once the control goes out of rcu cs, 'mask' could be freed, right? > Or you require bpf_iter_cpumask_next() needs to be in the same rcu cs > as bpf_iter_cpumask_new(). But such a requirement seems odd. > > I think we can do things similar to bpf_iter_task_vma. 
You can allocate memory > with bpf_mem_alloc() in bpf_iter_cpumask_new() to keep a copy of mask. This > way, you do not need to worry about potential use-after-free issue. > The memory can be freed with bpf_iter_cpumask_destroy(). Good suggestion. That seems better. > > > + > > +/** > > + * bpf_iter_cpumask_next() - Get the next CPU in a bpf_iter_cpumask > > + * @it: The bpf_iter_cpumask > > + * > > + * This function retrieves a pointer to the number of the next CPU within the > > + * specified bpf_iter_cpumask. It allows sequential access to CPUs within the > > + * cpumask. If there are no further CPUs available, it returns NULL. > > + * > > + * Returns a pointer to the number of the next CPU in the cpumask or NULL if no > > + * further CPUs. > > + */ > > +__bpf_kfunc int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) > > +{ > > + struct bpf_iter_cpumask_kern *kit = (void *)it; > > + const struct cpumask *mask = kit->mask; > > + int cpu; > > + > > + cpu = cpumask_next(kit->cpu, mask); > > + if (cpu >= nr_cpu_ids) > > + return NULL; > > + > > + kit->cpu = cpu; > > + return &kit->cpu; > > +} > > + > > +/** > > + * bpf_iter_cpumask_destroy() - Destroy a bpf_iter_cpumask > > + * @it: The bpf_iter_cpumask to be destroyed. 
> > + */ > > +__bpf_kfunc void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) > > +{ > > +} > > + > > __bpf_kfunc_end_defs(); > > > > BTF_SET8_START(cpumask_kfunc_btf_ids) > > @@ -450,6 +516,9 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) > > BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) > > BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) > > BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) > > +BTF_ID_FLAGS(func, bpf_iter_cpumask_new, KF_ITER_NEW | KF_RCU_PROTECTED | KF_RCU) > > +BTF_ID_FLAGS(func, bpf_iter_cpumask_next, KF_ITER_NEXT | KF_RET_NULL) > > +BTF_ID_FLAGS(func, bpf_iter_cpumask_destroy, KF_ITER_DESTROY) > > BTF_SET8_END(cpumask_kfunc_btf_ids) > > > > static const struct btf_kfunc_id_set cpumask_kfunc_set = { -- Regards Yafang ^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v3 bpf-next 2/3] bpf, doc: Add document for cpumask iter
2024-01-17 2:48 [PATCH v3 bpf-next 0/3] bpf: Add bpf_iter_cpumask Yafang Shao
2024-01-17 2:48 ` [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao
@ 2024-01-17 2:48 ` Yafang Shao
2024-01-17 2:48 ` [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests " Yafang Shao
2 siblings, 0 replies; 10+ messages in thread
From: Yafang Shao @ 2024-01-17 2:48 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, lkp, Yafang Shao

This patch adds the document for the newly added cpumask iterator kfuncs.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 Documentation/bpf/cpumasks.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/bpf/cpumasks.rst b/Documentation/bpf/cpumasks.rst
index b5d47a04da5d..523f377afc6e 100644
--- a/Documentation/bpf/cpumasks.rst
+++ b/Documentation/bpf/cpumasks.rst
@@ -372,6 +372,23 @@ used.
 .. _tools/testing/selftests/bpf/progs/cpumask_success.c:
    https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/cpumask_success.c
 
+3.3 cpumask iterator
+--------------------
+
+The cpumask iterator enables the iteration of percpu data, such as runqueues,
+system_group_pcpu, and more.
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+   :identifiers: bpf_iter_cpumask_new bpf_iter_cpumask_next
+                 bpf_iter_cpumask_destroy
+
+----
+
+Some example usages of the cpumask iterator can be found in
+`tools/testing/selftests/bpf/progs/test_cpumask_iter.c`_.
+
+.. _tools/testing/selftests/bpf/progs/test_cpumask_iter.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
 
 4. Adding BPF cpumask kfuncs
 ============================
-- 
2.39.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
* [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests for cpumask iter 2024-01-17 2:48 [PATCH v3 bpf-next 0/3] bpf: Add bpf_iter_cpumask Yafang Shao 2024-01-17 2:48 ` [PATCH v3 bpf-next 1/3] bpf: Add bpf_iter_cpumask kfuncs Yafang Shao 2024-01-17 2:48 ` [PATCH v3 bpf-next 2/3] bpf, doc: Add document for cpumask iter Yafang Shao @ 2024-01-17 2:48 ` Yafang Shao 2024-01-18 23:46 ` Yonghong Song 2 siblings, 1 reply; 10+ messages in thread From: Yafang Shao @ 2024-01-17 2:48 UTC (permalink / raw) To: ast, daniel, john.fastabend, andrii, martin.lau, song, yonghong.song, kpsingh, sdf, haoluo, jolsa, tj Cc: bpf, lkp, Yafang Shao Within the BPF program, we leverage the cgroup iterator to iterate through percpu runqueue data, specifically the 'nr_running' metric. Subsequently we expose this data to userspace by means of a sequence file. The CPU affinity for the cpumask is determined by the PID of a task: - PID of the init task (PID 1) We typically don't set CPU affinity for init task and thus we can iterate across all possible CPUs. However, in scenarios where you've set CPU affinity for the init task, you should set the cpumask of your current task to full-F. Then proceed to iterate through all possible CPUs using the current task. - PID of a task with defined CPU affinity The aim here is to iterate through a specific cpumask. This scenario aligns with tasks residing within a cpuset cgroup. - Invalid PID (e.g., PID -1) No cpumask is available in this case. The result as follows, #65/1 cpumask_iter/init_pid:OK #65/2 cpumask_iter/invalid_pid:OK #65/3 cpumask_iter/self_pid_one_cpu:OK #65/4 cpumask_iter/self_pid_multi_cpus:OK #65 cpumask_iter:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED CONFIG_PSI=y is required for this testcase. 
Signed-off-by: Yafang Shao <laoar.shao@gmail.com> --- tools/testing/selftests/bpf/config | 1 + .../selftests/bpf/prog_tests/cpumask_iter.c | 134 ++++++++++++++++++ .../selftests/bpf/progs/cpumask_common.h | 3 + .../selftests/bpf/progs/test_cpumask_iter.c | 56 ++++++++ 4 files changed, 194 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index c125c441abc7..9c42568ed376 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -78,6 +78,7 @@ CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_DEFRAG_IPV4=y CONFIG_NF_DEFRAG_IPV6=y CONFIG_NF_NAT=y +CONFIG_PSI=y CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c new file mode 100644 index 000000000000..984d01d09d79 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */ + +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <unistd.h> + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "test_cpumask_iter.skel.h" + +static void verify_percpu_data(struct bpf_link *link, int nr_cpu_exp, int nr_running_exp) +{ + int iter_fd, len, item, nr_running, psi_running, nr_cpus; + static char buf[128]; + size_t left; + char *p; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "iter_fd")) + return; + + memset(buf, 0, sizeof(buf)); + left = ARRAY_SIZE(buf); + p = buf; + while ((len = read(iter_fd, p, left)) > 0) { + p += len; + left -= len; + } + + item = sscanf(buf, "nr_running %u nr_cpus %u psi_running %u\n", + &nr_running, &nr_cpus, &psi_running); + if (nr_cpu_exp == -1) { + ASSERT_EQ(item, -1, 
"seq_format");
+		goto out;
+	}
+
+	ASSERT_EQ(item, 3, "seq_format");
+	ASSERT_GE(nr_running, nr_running_exp, "nr_running");
+	ASSERT_GE(psi_running, nr_running_exp, "psi_running");
+	ASSERT_EQ(nr_cpus, nr_cpu_exp, "nr_cpus");
+
+	/* read() after iter finishes should be ok. */
+	if (len == 0)
+		ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read");
+
+out:
+	close(iter_fd);
+}
+
+void test_cpumask_iter(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	int nr_possible, cgrp_fd, pid, err, cnt, i;
+	struct test_cpumask_iter *skel = NULL;
+	union bpf_iter_link_info linfo;
+	int cpu_ids[] = {1, 3, 4, 5};
+	struct bpf_link *link;
+	cpu_set_t set;
+
+	skel = test_cpumask_iter__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "test_for_each_cpu__open_and_load"))
+		return;
+
+	if (setup_cgroup_environment())
+		goto destroy;
+
+	/* Utilize the cgroup iter */
+	cgrp_fd = get_root_cgroup();
+	if (!ASSERT_GE(cgrp_fd, 0, "create cgrp"))
+		goto cleanup;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.cgroup.cgroup_fd = cgrp_fd;
+	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.cpu_cgroup, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		goto close_fd;
+
+	skel->bss->target_pid = 1;
+	/* In case init task is set CPU affinity */
+	err = sched_getaffinity(1, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity"))
+		goto close_fd;
+
+	cnt = CPU_COUNT(&set);
+	nr_possible = bpf_num_possible_cpus();
+	if (test__start_subtest("init_pid"))
+		/* current task is running. */
+		verify_percpu_data(link, cnt, cnt == nr_possible ?
1 : 0);
+
+	skel->bss->target_pid = -1;
+	if (test__start_subtest("invalid_pid"))
+		verify_percpu_data(link, -1, -1);
+
+	pid = getpid();
+	skel->bss->target_pid = pid;
+	CPU_ZERO(&set);
+	CPU_SET(0, &set);
+	err = sched_setaffinity(pid, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity"))
+		goto free_link;
+
+	if (test__start_subtest("self_pid_one_cpu"))
+		verify_percpu_data(link, 1, 1);
+
+	/* Assume there are at least 8 CPUs on the testbed */
+	if (nr_possible < 8)
+		goto free_link;
+
+	CPU_ZERO(&set);
+	/* Set the CPU affinity: 1,3-5 */
+	for (i = 0; i < ARRAY_SIZE(cpu_ids); i++)
+		CPU_SET(cpu_ids[i], &set);
+	err = sched_setaffinity(pid, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity"))
+		goto free_link;
+
+	if (test__start_subtest("self_pid_multi_cpus"))
+		verify_percpu_data(link, ARRAY_SIZE(cpu_ids), 1);
+
+free_link:
+	bpf_link__destroy(link);
+close_fd:
+	close(cgrp_fd);
+cleanup:
+	cleanup_cgroup_environment();
+destroy:
+	test_cpumask_iter__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
index 0cd4aebb97cf..cdb9dc95e9d9 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_common.h
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -55,6 +55,9 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
 u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym;
 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym;
 u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
+int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) __ksym;
+int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym;
+void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym;
 
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
b/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
new file mode 100644
index 000000000000..cb8b8359516b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cpumask_iter.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "task_kfunc_common.h"
+#include "cpumask_common.h"
+
+extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
+extern const struct rq runqueues __ksym __weak;
+
+int target_pid;
+
+SEC("iter.s/cgroup")
+int BPF_PROG(cpu_cgroup, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	u32 nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
+	struct psi_group_cpu *groupc;
+	struct task_struct *p;
+	struct rq *rq;
+	int *cpu;
+
+	/* epilogue */
+	if (cgrp == NULL)
+		return 0;
+
+	bpf_rcu_read_lock();
+	p = bpf_task_from_pid(target_pid);
+	if (!p) {
+		bpf_rcu_read_unlock();
+		return 1;
+	}
+
+	bpf_for_each(cpumask, cpu, p->cpus_ptr) {
+		rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
+		if (!rq)
+			continue;
+		nr_running += rq->nr_running;
+		nr_cpus += 1;
+
+		groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
+		if (!groupc)
+			continue;
+		psi_nr_running += groupc->tasks[NR_RUNNING];
+	}
+	BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
+		       nr_running, nr_cpus, psi_nr_running);
+
+	bpf_task_release(p);
+	bpf_rcu_read_unlock();
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.39.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests for cpumask iter
  2024-01-17  2:48 ` [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests " Yafang Shao
@ 2024-01-18 23:46   ` Yonghong Song
  2024-01-21  2:45     ` Yafang Shao
  0 siblings, 1 reply; 10+ messages in thread
From: Yonghong Song @ 2024-01-18 23:46 UTC (permalink / raw)
To: Yafang Shao, ast, daniel, john.fastabend, andrii, martin.lau,
	song, kpsingh, sdf, haoluo, jolsa, tj
Cc: bpf, lkp

On 1/16/24 6:48 PM, Yafang Shao wrote:
> Within the BPF program, we leverage the cgroup iterator to iterate through
> percpu runqueue data, specifically the 'nr_running' metric. Subsequently
> we expose this data to userspace by means of a sequence file.
>
> The CPU affinity for the cpumask is determined by the PID of a task:
>
> - PID of the init task (PID 1)
> We typically don't set CPU affinity for the init task and thus we can iterate
> across all possible CPUs. However, in scenarios where you've set CPU
> affinity for the init task, you should set the cpumask of your current
> task to full-F. Then proceed to iterate through all possible CPUs using

What is full-F? It would be good if you can clarify in the commit message.

> the current task.
> - PID of a task with defined CPU affinity
> The aim here is to iterate through a specific cpumask. This scenario
> aligns with tasks residing within a cpuset cgroup.
> - Invalid PID (e.g., PID -1)
> No cpumask is available in this case.
>
> The results are as follows:
> #65/1 cpumask_iter/init_pid:OK
> #65/2 cpumask_iter/invalid_pid:OK
> #65/3 cpumask_iter/self_pid_one_cpu:OK
> #65/4 cpumask_iter/self_pid_multi_cpus:OK
> #65 cpumask_iter:OK
> Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED
>
> CONFIG_PSI=y is required for this testcase.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  tools/testing/selftests/bpf/config            |   1 +
>  .../selftests/bpf/prog_tests/cpumask_iter.c   | 134 ++++++++++++++++++
>  .../selftests/bpf/progs/cpumask_common.h      |   3 +
>  .../selftests/bpf/progs/test_cpumask_iter.c   |  56 ++++++++
>  4 files changed, 194 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c
>
> diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
> index c125c441abc7..9c42568ed376 100644
> --- a/tools/testing/selftests/bpf/config
> +++ b/tools/testing/selftests/bpf/config
> @@ -78,6 +78,7 @@ CONFIG_NF_CONNTRACK_MARK=y
>  CONFIG_NF_DEFRAG_IPV4=y
>  CONFIG_NF_DEFRAG_IPV6=y
>  CONFIG_NF_NAT=y
> +CONFIG_PSI=y
>  CONFIG_RC_CORE=y
>  CONFIG_SECURITY=y
>  CONFIG_SECURITYFS=y
> diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
> new file mode 100644
> index 000000000000..984d01d09d79
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
> @@ -0,0 +1,134 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
> +
> +#define _GNU_SOURCE
> +#include <sched.h>
> +#include <stdio.h>
> +#include <unistd.h>
> +
> +#include <test_progs.h>
> +#include "cgroup_helpers.h"
> +#include "test_cpumask_iter.skel.h"
> +
> +static void verify_percpu_data(struct bpf_link *link, int nr_cpu_exp, int nr_running_exp)
> +{
> +	int iter_fd, len, item, nr_running, psi_running, nr_cpus;
> +	static char buf[128];

why static?
> +	size_t left;
> +	char *p;
> +
> +	iter_fd = bpf_iter_create(bpf_link__fd(link));
> +	if (!ASSERT_GE(iter_fd, 0, "iter_fd"))
> +		return;
> +
> +	memset(buf, 0, sizeof(buf));
> +	left = ARRAY_SIZE(buf);
> +	p = buf;
> +	while ((len = read(iter_fd, p, left)) > 0) {
> +		p += len;
> +		left -= len;
> +	}
> +
> +	item = sscanf(buf, "nr_running %u nr_cpus %u psi_running %u\n",
> +		      &nr_running, &nr_cpus, &psi_running);
> +	if (nr_cpu_exp == -1) {
> +		ASSERT_EQ(item, -1, "seq_format");
> +		goto out;
> +	}
> +
> +	ASSERT_EQ(item, 3, "seq_format");
> +	ASSERT_GE(nr_running, nr_running_exp, "nr_running");
> +	ASSERT_GE(psi_running, nr_running_exp, "psi_running");
> +	ASSERT_EQ(nr_cpus, nr_cpu_exp, "nr_cpus");
> +
> +	/* read() after iter finishes should be ok. */
> +	if (len == 0)
> +		ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read");

The above 'if' statement is irrelevant to the main purpose of this test
and can be removed.

> +
> +out:
> +	close(iter_fd);
> +}
> +
> +void test_cpumask_iter(void)
> +{
> +	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
> +	int nr_possible, cgrp_fd, pid, err, cnt, i;
> +	struct test_cpumask_iter *skel = NULL;

= NULL is not needed.
> +	union bpf_iter_link_info linfo;
> +	int cpu_ids[] = {1, 3, 4, 5};
> +	struct bpf_link *link;
> +	cpu_set_t set;
> +
> +	skel = test_cpumask_iter__open_and_load();
> +	if (!ASSERT_OK_PTR(skel, "test_for_each_cpu__open_and_load"))
> +		return;
> +
> +	if (setup_cgroup_environment())
> +		goto destroy;
> +
> +	/* Utilize the cgroup iter */
> +	cgrp_fd = get_root_cgroup();
> +	if (!ASSERT_GE(cgrp_fd, 0, "create cgrp"))
> +		goto cleanup;
> +
> +	memset(&linfo, 0, sizeof(linfo));
> +	linfo.cgroup.cgroup_fd = cgrp_fd;
> +	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
> +	opts.link_info = &linfo;
> +	opts.link_info_len = sizeof(linfo);
> +
> +	link = bpf_program__attach_iter(skel->progs.cpu_cgroup, &opts);
> +	if (!ASSERT_OK_PTR(link, "attach_iter"))
> +		goto close_fd;
> +
> +	skel->bss->target_pid = 1;
> +	/* In case init task is set CPU affinity */
> +	err = sched_getaffinity(1, sizeof(set), &set);
> +	if (!ASSERT_OK(err, "setaffinity"))
> +		goto close_fd;

goto free_link.

> +
> +	cnt = CPU_COUNT(&set);
> +	nr_possible = bpf_num_possible_cpus();
> +	if (test__start_subtest("init_pid"))
> +		/* current task is running. */
> +		verify_percpu_data(link, cnt, cnt == nr_possible ? 1 : 0);

[...]

^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH v3 bpf-next 3/3] selftests/bpf: Add selftests for cpumask iter
  2024-01-18 23:46   ` Yonghong Song
@ 2024-01-21  2:45     ` Yafang Shao
  0 siblings, 0 replies; 10+ messages in thread
From: Yafang Shao @ 2024-01-21 2:45 UTC (permalink / raw)
To: Yonghong Song
Cc: ast, daniel, john.fastabend, andrii, martin.lau, song, kpsingh,
	sdf, haoluo, jolsa, tj, bpf, lkp

On Fri, Jan 19, 2024 at 7:46 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 1/16/24 6:48 PM, Yafang Shao wrote:
> > Within the BPF program, we leverage the cgroup iterator to iterate through
> > percpu runqueue data, specifically the 'nr_running' metric. Subsequently
> > we expose this data to userspace by means of a sequence file.
> >
> > The CPU affinity for the cpumask is determined by the PID of a task:
> >
> > - PID of the init task (PID 1)
> > We typically don't set CPU affinity for the init task and thus we can iterate
> > across all possible CPUs. However, in scenarios where you've set CPU
> > affinity for the init task, you should set the cpumask of your current
> > task to full-F. Then proceed to iterate through all possible CPUs using
>
> What is full-F? It would be good if you can clarify in the commit message.

I mean set all available CPUs for the task. Will clarify it in the next
version.

>
> > the current task.
> > - PID of a task with defined CPU affinity
> > The aim here is to iterate through a specific cpumask. This scenario
> > aligns with tasks residing within a cpuset cgroup.
> > - Invalid PID (e.g., PID -1)
> > No cpumask is available in this case.
> >
> > The results are as follows:
> > #65/1 cpumask_iter/init_pid:OK
> > #65/2 cpumask_iter/invalid_pid:OK
> > #65/3 cpumask_iter/self_pid_one_cpu:OK
> > #65/4 cpumask_iter/self_pid_multi_cpus:OK
> > #65 cpumask_iter:OK
> > Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED
> >
> > CONFIG_PSI=y is required for this testcase.
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  tools/testing/selftests/bpf/config            |   1 +
> >  .../selftests/bpf/prog_tests/cpumask_iter.c   | 134 ++++++++++++++++++
> >  .../selftests/bpf/progs/cpumask_common.h      |   3 +
> >  .../selftests/bpf/progs/test_cpumask_iter.c   |  56 ++++++++
> >  4 files changed, 194 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/test_cpumask_iter.c
> >
> > diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
> > index c125c441abc7..9c42568ed376 100644
> > --- a/tools/testing/selftests/bpf/config
> > +++ b/tools/testing/selftests/bpf/config
> > @@ -78,6 +78,7 @@ CONFIG_NF_CONNTRACK_MARK=y
> >  CONFIG_NF_DEFRAG_IPV4=y
> >  CONFIG_NF_DEFRAG_IPV6=y
> >  CONFIG_NF_NAT=y
> > +CONFIG_PSI=y
> >  CONFIG_RC_CORE=y
> >  CONFIG_SECURITY=y
> >  CONFIG_SECURITYFS=y
> > diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
> > new file mode 100644
> > index 000000000000..984d01d09d79
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/cpumask_iter.c
> > @@ -0,0 +1,134 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
> > +
> > +#define _GNU_SOURCE
> > +#include <sched.h>
> > +#include <stdio.h>
> > +#include <unistd.h>
> > +
> > +#include <test_progs.h>
> > +#include "cgroup_helpers.h"
> > +#include "test_cpumask_iter.skel.h"
> > +
> > +static void verify_percpu_data(struct bpf_link *link, int nr_cpu_exp, int nr_running_exp)
> > +{
> > +	int iter_fd, len, item, nr_running, psi_running, nr_cpus;
> > +	static char buf[128];
>
> why static?

Will remove it.
>
> > +	size_t left;
> > +	char *p;
> > +
> > +	iter_fd = bpf_iter_create(bpf_link__fd(link));
> > +	if (!ASSERT_GE(iter_fd, 0, "iter_fd"))
> > +		return;
> > +
> > +	memset(buf, 0, sizeof(buf));
> > +	left = ARRAY_SIZE(buf);
> > +	p = buf;
> > +	while ((len = read(iter_fd, p, left)) > 0) {
> > +		p += len;
> > +		left -= len;
> > +	}
> > +
> > +	item = sscanf(buf, "nr_running %u nr_cpus %u psi_running %u\n",
> > +		      &nr_running, &nr_cpus, &psi_running);
> > +	if (nr_cpu_exp == -1) {
> > +		ASSERT_EQ(item, -1, "seq_format");
> > +		goto out;
> > +	}
> > +
> > +	ASSERT_EQ(item, 3, "seq_format");
> > +	ASSERT_GE(nr_running, nr_running_exp, "nr_running");
> > +	ASSERT_GE(psi_running, nr_running_exp, "psi_running");
> > +	ASSERT_EQ(nr_cpus, nr_cpu_exp, "nr_cpus");
> > +
> > +	/* read() after iter finishes should be ok. */
> > +	if (len == 0)
> > +		ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read");
>
> The above 'if' statement is irrelevant to the main purpose of this test
> and can be removed.

Will remove it.

> > +
> > +out:
> > +	close(iter_fd);
> > +}
> > +
> > +void test_cpumask_iter(void)
> > +{
> > +	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
> > +	int nr_possible, cgrp_fd, pid, err, cnt, i;
> > +	struct test_cpumask_iter *skel = NULL;
>
> = NULL is not needed.

Will change it.
>
> > +	union bpf_iter_link_info linfo;
> > +	int cpu_ids[] = {1, 3, 4, 5};
> > +	struct bpf_link *link;
> > +	cpu_set_t set;
> > +
> > +	skel = test_cpumask_iter__open_and_load();
> > +	if (!ASSERT_OK_PTR(skel, "test_for_each_cpu__open_and_load"))
> > +		return;
> > +
> > +	if (setup_cgroup_environment())
> > +		goto destroy;
> > +
> > +	/* Utilize the cgroup iter */
> > +	cgrp_fd = get_root_cgroup();
> > +	if (!ASSERT_GE(cgrp_fd, 0, "create cgrp"))
> > +		goto cleanup;
> > +
> > +	memset(&linfo, 0, sizeof(linfo));
> > +	linfo.cgroup.cgroup_fd = cgrp_fd;
> > +	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
> > +	opts.link_info = &linfo;
> > +	opts.link_info_len = sizeof(linfo);
> > +
> > +	link = bpf_program__attach_iter(skel->progs.cpu_cgroup, &opts);
> > +	if (!ASSERT_OK_PTR(link, "attach_iter"))
> > +		goto close_fd;
> > +
> > +	skel->bss->target_pid = 1;
> > +	/* In case init task is set CPU affinity */
> > +	err = sched_getaffinity(1, sizeof(set), &set);
> > +	if (!ASSERT_OK(err, "setaffinity"))
> > +		goto close_fd;
>
> goto free_link.

Nice catch. Will change it.

>
> > +
> > +	cnt = CPU_COUNT(&set);
> > +	nr_possible = bpf_num_possible_cpus();
> > +	if (test__start_subtest("init_pid"))
> > +		/* current task is running. */
> > +		verify_percpu_data(link, cnt, cnt == nr_possible ? 1 : 0);
> [...]

--
Regards
Yafang

^ permalink raw reply	[flat|nested] 10+ messages in thread