Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [PATCH bpf-next v11 8/8] selftests/bpf: Add test cases for bpf_list_del/add/is_first/is_last/empty
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Extend refcounted_kptr with tests for bpf_list_add (including prev from
bpf_list_front and bpf_refcount_acquire), bpf_list_del (including node
from bpf_list_front, bpf_rbtree_remove and bpf_refcount_acquire),
bpf_list_empty, bpf_list_is_first/last, and push_back on uninit head.

To verify the validity of bpf_list_del/add, the test also expects the
verifier to reject calls to bpf_list_del/add made without holding the
spin_lock.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
---
 .../selftests/bpf/progs/refcounted_kptr.c     | 421 ++++++++++++++++++
 1 file changed, 421 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index c847398837cc..13de169ad68f 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -367,6 +367,427 @@ long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx)		\
 INSERT_STASH_READ(true, "insert_stash_read: remove from tree");
 INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree");
 
+SEC("tc")
+__description("list_empty_test: list empty before add, non-empty after add")
+__success __retval(0)
+int list_empty_test(void *ctx)
+{
+	struct node_data *node_new;
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -1;
+	}
+	bpf_spin_unlock(&lock);
+
+	node_new = bpf_obj_new(typeof(*node_new));
+	if (!node_new)
+		return -2;
+
+	bpf_spin_lock(&lock);
+	bpf_list_push_front(&head, &node_new->l);
+
+	if (bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -3;
+	}
+	bpf_spin_unlock(&lock);
+	return 0;
+}
+
+static struct node_data *__add_in_list(struct bpf_list_head *head,
+				       struct bpf_spin_lock *lock)
+{
+	struct node_data *node_new, *node_ref;
+
+	node_new = bpf_obj_new(typeof(*node_new));
+	if (!node_new)
+		return NULL;
+
+	node_ref = bpf_refcount_acquire(node_new);
+
+	bpf_spin_lock(lock);
+	bpf_list_push_front(head, &node_new->l);
+	bpf_spin_unlock(lock);
+	return node_ref;
+}
+
+SEC("tc")
+__description("list_is_edge_test1: is_first on first node, is_last on last node")
+__success __retval(0)
+int list_is_edge_test1(void *ctx)
+{
+	struct node_data *node_first, *node_last;
+	int err = 0;
+
+	node_last = __add_in_list(&head, &lock);
+	if (!node_last)
+		return -1;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first) {
+		bpf_obj_drop(node_last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_is_first(&head, &node_first->l)) {
+		err = -3;
+		goto fail;
+	}
+	if (!bpf_list_is_last(&head, &node_last->l))
+		err = -4;
+
+fail:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(node_first);
+	bpf_obj_drop(node_last);
+	return err;
+}
+
+SEC("tc")
+__description("list_is_edge_test2: accept list_front/list_back return value")
+__success __retval(0)
+int list_is_edge_test2(void *ctx)
+{
+	struct bpf_list_node *front, *back;
+	struct node_data *a, *b;
+	long err = 0;
+
+	a = __add_in_list(&head, &lock);
+	if (!a)
+		return -1;
+
+	b = __add_in_list(&head, &lock);
+	if (!b) {
+		bpf_obj_drop(a);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	front = bpf_list_front(&head);
+	back = bpf_list_back(&head);
+	if (!front || !back) {
+		err = -3;
+		goto out_unlock;
+	}
+
+	if (!bpf_list_is_first(&head, front) || bpf_list_is_last(&head, front)) {
+		err = -4;
+		goto out_unlock;
+	}
+
+	if (!bpf_list_is_last(&head, back) || bpf_list_is_first(&head, back)) {
+		err = -5;
+		goto out_unlock;
+	}
+
+out_unlock:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(a);
+	bpf_obj_drop(b);
+	return err;
+}
+
+SEC("tc")
+__description("list_is_edge_test3: single node is both first and last")
+__success __retval(0)
+int list_is_edge_test3(void *ctx)
+{
+	struct node_data *tmp;
+	struct bpf_list_node *node;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	node = bpf_list_front(&head);
+	if (!node) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(tmp);
+		return -2;
+	}
+
+	if (!bpf_list_is_first(&head, node) || !bpf_list_is_last(&head, node))
+		err = -3;
+	bpf_spin_unlock(&lock);
+
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+SEC("tc")
+__description("list_del_test1: del returns removed nodes")
+__success __retval(0)
+int list_del_test1(void *ctx)
+{
+	struct node_data *node_first, *node_last;
+	struct bpf_list_node *bpf_node_first, *bpf_node_last;
+	int err = 0;
+
+	node_last = __add_in_list(&head, &lock);
+	if (!node_last)
+		return -1;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first) {
+		bpf_obj_drop(node_last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	bpf_node_last = bpf_list_del(&head, &node_last->l);
+	bpf_node_first = bpf_list_del(&head, &node_first->l);
+	bpf_spin_unlock(&lock);
+
+	if (bpf_node_first)
+		bpf_obj_drop(container_of(bpf_node_first, struct node_data, l));
+	else
+		err = -3;
+
+	if (bpf_node_last)
+		bpf_obj_drop(container_of(bpf_node_last, struct node_data, l));
+	else
+		err = -4;
+
+	bpf_obj_drop(node_first);
+	bpf_obj_drop(node_last);
+	return err;
+}
+
+SEC("tc")
+__description("list_del_test2: remove an arbitrary node from the list")
+__success __retval(0)
+int list_del_test2(void *ctx)
+{
+	struct bpf_rb_node *rb;
+	struct bpf_list_node *l;
+	struct node_data *n;
+	long err;
+
+	err = __insert_in_tree_and_list(&head, &root, &lock);
+	if (err)
+		return err;
+
+	bpf_spin_lock(&lock);
+	rb = bpf_rbtree_first(&root);
+	if (!rb) {
+		bpf_spin_unlock(&lock);
+		return -4;
+	}
+
+	rb = bpf_rbtree_remove(&root, rb);
+	if (!rb) {
+		bpf_spin_unlock(&lock);
+		return -5;
+	}
+
+	n = container_of(rb, struct node_data, r);
+	l = bpf_list_del(&head, &n->l);
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(n);
+	if (!l)
+		return -6;
+
+	bpf_obj_drop(container_of(l, struct node_data, l));
+	return 0;
+}
+
+SEC("tc")
+__description("list_del_test3: list_del accepts list_front return value as node")
+__success __retval(0)
+int list_del_test3(void *ctx)
+{
+	struct node_data *tmp;
+	struct bpf_list_node *bpf_node, *l;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	bpf_node = bpf_list_front(&head);
+	if (!bpf_node) {
+		bpf_spin_unlock(&lock);
+		err = -2;
+		goto fail;
+	}
+
+	l = bpf_list_del(&head, bpf_node);
+	bpf_spin_unlock(&lock);
+	if (!l) {
+		err = -3;
+		goto fail;
+	}
+
+	bpf_obj_drop(container_of(l, struct node_data, l));
+	bpf_obj_drop(tmp);
+	return 0;
+
+fail:
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+SEC("tc")
+__description("list_add_test1: insert new node after prev")
+__success __retval(0)
+int list_add_test1(void *ctx)
+{
+	struct node_data *node_first;
+	struct node_data *new_node;
+	long err = 0;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first)
+		return -1;
+
+	new_node = bpf_obj_new(typeof(*new_node));
+	if (!new_node) {
+		err = -2;
+		goto fail;
+	}
+
+	bpf_spin_lock(&lock);
+	err = bpf_list_add(&head, &new_node->l, &node_first->l);
+	bpf_spin_unlock(&lock);
+	if (err) {
+		err = -3;
+		goto fail;
+	}
+
+fail:
+	bpf_obj_drop(node_first);
+	return err;
+}
+
+SEC("tc")
+__description("list_add_test2: list_add accepts list_front return value as prev")
+__success __retval(0)
+int list_add_test2(void *ctx)
+{
+	struct node_data *new_node, *tmp;
+	struct bpf_list_node *bpf_node;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	new_node = bpf_obj_new(typeof(*new_node));
+	if (!new_node) {
+		err = -2;
+		goto fail;
+	}
+
+	bpf_spin_lock(&lock);
+	bpf_node = bpf_list_front(&head);
+	if (!bpf_node) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(new_node);
+		err = -3;
+		goto fail;
+	}
+
+	err = bpf_list_add(&head, &new_node->l, bpf_node);
+	bpf_spin_unlock(&lock);
+	if (err) {
+		err = -4;
+		goto fail;
+	}
+
+fail:
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+struct uninit_head_val {
+	struct bpf_spin_lock lock;
+	struct bpf_list_head head __contains(node_data, l);
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct uninit_head_val);
+	__uint(max_entries, 1);
+} uninit_head_map SEC(".maps");
+
+SEC("tc")
+__description("list_push_back_uninit_head: push_back on 0-initialized list head")
+__success __retval(0)
+int list_push_back_uninit_head(void *ctx)
+{
+	struct uninit_head_val *st;
+	struct node_data *node;
+	int ret = -1, key = 0;
+
+	st = bpf_map_lookup_elem(&uninit_head_map, &key);
+	if (!st)
+		return -1;
+
+	node = bpf_obj_new(typeof(*node));
+	if (!node)
+		return -1;
+
+	bpf_spin_lock(&st->lock);
+	ret = bpf_list_push_back(&st->head, &node->l);
+	bpf_spin_unlock(&st->lock);
+
+	return ret;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_del_without_lock_fail(void *ctx)
+{
+	struct node_data *n;
+	struct bpf_list_node *l;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	/* Error case: delete list node without holding lock */
+	l = bpf_list_del(&head, &n->l);
+	bpf_obj_drop(n);
+	if (!l)
+		return -2;
+	bpf_obj_drop(container_of(l, struct node_data, l));
+
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_add_without_lock_fail(void *ctx)
+{
+	struct node_data *n, *prev;
+	long err;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	prev = bpf_obj_new(typeof(*prev));
+	if (!prev) {
+		bpf_obj_drop(n);
+		return -1;
+	}
+
+	/* Error case: add list node without holding lock */
+	err = bpf_list_add(&head, &n->l, &prev->l);
+	bpf_obj_drop(prev);
+	if (err)
+		return -2;
+
+	return 0;
+}
+
 SEC("tc")
 __success
 long rbtree_refcounted_node_ref_escapes(void *ctx)
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* Re: [PATCH bpf-next v11 8/8] selftests/bpf: Add test cases for bpf_list_del/add/is_first/is_last/empty
From: bot+bpf-ci @ 2026-05-21  4:08 UTC (permalink / raw)
  To: kaitao.cheng, ast, corbet, martin.lau, daniel, andrii, eddyz87,
	song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	shuah, chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521032306.97118-9-kaitao.cheng@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 1758 bytes --]


commit 75130d2969168e50252cc473cfaa880a9795c0ef
Author: Kaitao Cheng <chengkaitao@kylinos.cn>

    selftests/bpf: Add test cases for bpf_list_del/add/is_first/is_last/empty

    Extend refcounted_kptr with tests for bpf_list_add (including prev from
    bpf_list_front and bpf_refcount_acquire), bpf_list_del (including node
    from bpf_list_front, bpf_rbtree_remove and bpf_refcount_acquire),
    bpf_list_empty, bpf_list_is_first/last, and push_back on uninit head.

    To verify the validity of bpf_list_del/add, the test also expects the
    verifier to reject calls to bpf_list_del/add made without holding the
    spin_lock.

    Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>

The commit message mentions adding test cases for bpf_list_is_first,
bpf_list_is_last, and bpf_list_empty, but these test functions already
exist in the file and are not added by this patch. The existing tests
are list_empty_test, list_is_edge_test1, list_is_edge_test2, and
list_is_edge_test3.

This patch actually adds:
- list_del_test3 (new test for bpf_list_del with list_front return value)
- list_add_test1 (new test for bpf_list_add after prev node)
- list_add_test2 (new test for bpf_list_add with list_front as prev)
- list_push_back_uninit_head (new test for push_back on uninitialized head)
- list_del_without_lock_fail (negative test for del without lock)
- list_add_without_lock_fail (negative test for add without lock)

Should the commit message be adjusted to accurately reflect which tests
are being added?


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

^ permalink raw reply

* Re: [PATCH bpf-next v11 3/8] bpf: allow non-owning list-node args via __nonown_allowed
From: bot+bpf-ci @ 2026-05-21  4:08 UTC (permalink / raw)
  To: kaitao.cheng, ast, corbet, martin.lau, daniel, andrii, eddyz87,
	song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	shuah, chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521032306.97118-4-kaitao.cheng@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 2076 bytes --]

> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..3a9db1108b95 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst

[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 8dd79b735a69..f3cf8d85bea0 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[ ... ]

> @@ -12244,6 +12249,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>  			return ret;
>  		break;
>  	case KF_ARG_PTR_TO_LIST_NODE:
> +		if (is_kfunc_arg_nonown_allowed(btf, &args[i]) &&
> +		    type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
> +			/* Allow bpf_list_front/back return value for
> +			 * __nonown_allowed list-node arguments.
> +			 */
> +			goto check_ok;
> +		}
>  		if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
>  			verbose(env, "%s expected pointer to allocated object\n",
>  				reg_arg_name(env, argno));
>  @@ -12253,6 +12265,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>  			verbose(env, "allocated object must be referenced\n");
>  			return -EINVAL;
>  		}
> +check_ok:
>  		ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta);
>  		if (ret < 0)
>  			return ret;

This isn't a bug, but the multi-line comment doesn't match the BPF
subsystem style. The BPF guide requires multi-line comments to have the
opening /* on its own line for files under kernel/bpf/. Should this be:

  /*
   * Allow bpf_list_front/back return value for
   * __nonown_allowed list-node arguments.
   */

Eduard Zingerman raised a related nit in v10 RESEND: the check
'!reg->ref_obj_id' appears redundant. Would type_is_non_owning_ref() be
sufficient here?

Reference: https://lore.kernel.org/bpf/0419643c9a04bf0824066742e52e3f197b43909b.camel@gmail.com/


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

^ permalink raw reply

* Re: [PATCH bpf-next v11 2/8] bpf: clear list node owner and unlink before drop
From: bot+bpf-ci @ 2026-05-21  4:08 UTC (permalink / raw)
  To: kaitao.cheng, ast, corbet, martin.lau, daniel, andrii, eddyz87,
	song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	shuah, chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521032306.97118-3-kaitao.cheng@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 2927 bytes --]

> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 094457c3e6d3..59855b434f0b 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto);
>  void bpf_list_head_free(const struct btf_field *field, void *list_head,
>  			struct bpf_spin_lock *spin_lock)
>  {
> -	struct list_head *head = list_head, *orig_head = list_head;
> +	struct list_head *head = list_head, drain, *pos, *n;
>
>  	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
>  	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
> +	INIT_LIST_HEAD(&drain);
>
>  	/* Do the actual list draining outside the lock to not hold the lock for
>  	 * too long, and also prevent deadlocks if tracing programs end up
> @@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
>  	__bpf_spin_lock_irqsave(spin_lock);
>  	if (!head->next || list_empty(head))
>  		goto unlock;
> -	head = head->next;
> +	list_for_each_safe(pos, n, head) {
> +		struct bpf_list_node_kern *node;
> +
> +		node = container_of(pos, struct bpf_list_node_kern, list_head);
> +		WRITE_ONCE(node->owner, BPF_PTR_POISON);
> +		list_move_tail(pos, &drain);
> +	}

Does this introduce an unbounded delay with interrupts disabled?

The original code detached the head in O(1) time. Now there is an O(N)
loop inside __bpf_spin_lock_irqsave(). Since BPF lists can grow quite
large, could this cause soft lockups?

This also appears to contradict the comment just above the lock which
explicitly mentions doing the actual list draining outside the lock to
not hold it for too long.

Lore reference:
https://lore.kernel.org/bpf/20260513060221.DB2C7C2BCB7@smtp.kernel.org/
Reviewer: sashiko-bot@kernel.org
Date: Wed, 13 May 2026 06:02:21 +0000

>  unlock:
> -	INIT_LIST_HEAD(orig_head);
> +	INIT_LIST_HEAD(head);
>  	__bpf_spin_unlock_irqrestore(spin_lock);
>
> -	while (head != orig_head) {
> -		void *obj = head;
> +	while (!list_empty(&drain)) {
> +		struct bpf_list_node_kern *node;
>
> -		obj -= field->graph_root.node_offset;
> -		head = head->next;
> +		pos = drain.next;
> +		node = container_of(pos, struct bpf_list_node_kern, list_head);
> +		list_del_init(pos);
> +		/* Ensure __bpf_list_add() sees the node as unlinked. */
> +		smp_store_release(&node->owner, NULL);
>  		/* The contained type can also have resources, including a
>  		 * bpf_list_head which needs to be freed.
>  		 */
> -		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
> +		__bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
> +				    field->graph_root.value_rec, false);
>  	}
>  }


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

^ permalink raw reply

* Re: [PATCH bpf-next v11 4/8] bpf: Introduce the bpf_list_del kfunc.
From: bot+bpf-ci @ 2026-05-21  4:08 UTC (permalink / raw)
  To: kaitao.cheng, ast, corbet, martin.lau, daniel, andrii, eddyz87,
	song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	shuah, chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521032306.97118-5-kaitao.cheng@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 1269 bytes --]

> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 59855b434f0b..804c201c28f3 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c

[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f3cf8d85bea0..35eebb5e7769 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[ ... ]

> @@ -11671,7 +11674,8 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
>
>  	switch (node_field_type) {
>  	case BPF_LIST_NODE:
> -		ret = is_bpf_list_push_kfunc(kfunc_btf_id);
> +		ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
> +		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del];

This isn't a bug, but Eduard Zingerman suggested in v10 adding a
is_bpf_list_del_kfunc() helper for consistency with the existing code style
(similar to is_bpf_list_push_kfunc()). This would maintain the pattern used
throughout the codebase for checking kfunc types.

Reference: https://lore.kernel.org/bpf/ca14a6fd8a1f3ce1e64c70b6d9ea2baf8df913ae.camel@gmail.com/



---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

^ permalink raw reply

* Re: [PATCH bpf-next v11 6/8] bpf: Add bpf_list_add to insert node after a given list node
From: bot+bpf-ci @ 2026-05-21  4:08 UTC (permalink / raw)
  To: kaitao.cheng, ast, corbet, martin.lau, daniel, andrii, eddyz87,
	song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	shuah, chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521032306.97118-7-kaitao.cheng@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 3884 bytes --]

> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 1c69476c8a09..89579165ef4d 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2577,6 +2577,16 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
>  	return bpf_list_push_back(head, node, meta__ign, off);
>  }
>
> +__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
> +			     struct bpf_list_node *prev__nonown_allowed,
> +			     struct btf_struct_meta *meta, u64 off)
> +{
> +	struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed;
> +	struct list_head *prev_ptr = &p->list_head;
> +
> +	return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off);
> +}

If __bpf_list_add() fails at runtime (e.g., because prev is not in the
list indicated by head), it drops the 'new' node via __bpf_obj_drop_impl
in the fail path.

However, the verifier unconditionally converts 'new' into a non-owning
reference during check_kfunc_call(). If __bpf_list_add() fails, the BPF
program retains this non-owning reference to a freed node.

Could this allow the program to trigger a use-after-free within the lock
critical section?

(sashiko-bot@kernel.org, v10 RESEND)
https://lore.kernel.org/bpf/20260513204420.2BB22C19425@smtp.kernel.org/

>
>  static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
>  					    struct list_head *n)

[ ... ]

> @@ -4756,6 +4766,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
>  BTF_ID_FLAGS(func, bpf_list_push_front_impl)
>  BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
>  BTF_ID_FLAGS(func, bpf_list_push_back_impl)
> +BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)

Since bpf_list_add is marked with KF_IMPLICIT_ARGS, won't the verifier
look for a bpf_list_add_impl counterpart during program load?

Without defining and registering a bpf_list_add_impl function,
find_kfunc_impl_proto() will fail to find the required _impl suffix in
the vmlinux BTF.

Will this cause the verifier to unconditionally reject the kfunc?

(sashiko-bot@kernel.org, v10 RESEND)
https://lore.kernel.org/bpf/20260513204420.2BB22C19425@smtp.kernel.org/

>  BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)

[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 35eebb5e7769..662ad7312697 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[ ... ]

> @@ -19524,8 +19527,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  		int struct_meta_reg = BPF_REG_3;
>  		int node_offset_reg = BPF_REG_4;
>
> -		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
> -		if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
> +		/* list_add/rbtree_add have an extra arg (prev/less),
> +		 * so args-to-fixup are in diff regs.
> +		 */

This isn't a bug, but the multi-line comment format doesn't follow the
BPF subsystem style. The preferred form in kernel/bpf/ has the opening
/* on its own line.

Additionally, consider adding an is_bpf_list_add_kfunc() helper instead
of direct comparison with special_kfunc_list[KF_bpf_list_add].

(bot+bpf-ci@kernel.org and Eduard Zingerman, v10 RESEND)
https://lore.kernel.org/bpf/f573f7e0a5140fe6254d2ddfd3f69add455fa25c67c020c0312e2c2ceeca7c69@mail.kernel.org/

> +		if (desc->func_id == special_kfunc_list[KF_bpf_list_add] ||
> +		    is_bpf_rbtree_add_kfunc(desc->func_id)) {
>  			struct_meta_reg = BPF_REG_4;
>  			node_offset_reg = BPF_REG_5;
>  		}


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

^ permalink raw reply

* [PATCH bpf-next] bpf: Add kernel-doc for arena page kfuncs
From: Dhiraj Shah @ 2026-05-21  4:35 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, memxor, song,
	yonghong.song, jolsa, corbet, skhan, linux-doc, linux-kernel

The page-management kfuncs exposed by BPF arena -
bpf_arena_alloc_pages(), bpf_arena_free_pages() and
bpf_arena_reserve_pages() - are part of the BPF kfunc ABI but lack
rendered documentation. Their contracts (valid argument ranges,
sleepable-only context, and the set of error returns) are today only
discoverable by reading kernel/bpf/arena.c.

Add a kernel-doc comment block above each of the three kfuncs and
render them under a new "BPF arena kfuncs" subsection in
Documentation/bpf/kfuncs.rst, alongside the existing core kfunc
subsections.

No functional change.

Signed-off-by: Dhiraj Shah <find.dhiraj@gmail.com>
---
 Documentation/bpf/kfuncs.rst | 27 +++++++++++++++
 kernel/bpf/arena.c           | 64 ++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..fe0df1e16453 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -732,3 +732,30 @@ the verifier. bpf_cgroup_ancestor() can be used as follows:
 BPF provides a set of kfuncs that can be used to query, allocate, mutate, and
 destroy struct cpumask * objects. Please refer to :ref:`cpumasks-header-label`
 for more details.
+
+4.4 BPF arena kfuncs
+--------------------
+
+A BPF arena (``BPF_MAP_TYPE_ARENA``) is a sparsely-populated shared memory
+region that a BPF program and a user-space process can both address. The
+following kfuncs allow a sleepable BPF program to allocate, free, and reserve
+pages within an arena:
+
+.. kernel-doc:: kernel/bpf/arena.c
+   :identifiers: bpf_arena_alloc_pages bpf_arena_free_pages bpf_arena_reserve_pages
+
+A typical pattern is to allocate one or more pages, write to them from BPF,
+and let user space observe the same memory after a page fault populates its
+VMA:
+
+.. code-block:: c
+
+	void __arena *page;
+
+	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page)
+		return -ENOMEM;
+
+	/* ... use the page from BPF; user space sees the same bytes ... */
+
+	bpf_arena_free_pages(&arena, page, 1);
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 49a8f7b1beef..b8ec2953dee6 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -870,6 +870,33 @@ static void arena_free_irq(struct irq_work *iw)
 
 __bpf_kfunc_start_defs();
 
+/**
+ * bpf_arena_alloc_pages() - Allocate pages within a BPF arena.
+ * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
+ * @addr__ign: Page-aligned user-space address within the arena at which to
+ *	       place the allocation, or %NULL to let the kernel choose. When
+ *	       non-NULL the address must fall inside the arena's user VMA
+ *	       range; otherwise the allocation fails.
+ * @page_cnt: Number of pages to allocate. Must be non-zero and no greater
+ *	      than the arena's configured size in pages.
+ * @node_id: NUMA node hint for the backing pages, or %NUMA_NO_NODE.
+ * @flags: Reserved for future use; must be 0.
+ *
+ * Allocates @page_cnt physically-backed pages and inserts them into the
+ * arena's kernel VMA at the offset corresponding to @addr__ign (or at an
+ * arbitrary free offset when @addr__ign is %NULL). A subsequent user-space
+ * page fault on the matching user address populates the user VMA with the
+ * same pages, giving BPF and user space a shared view of the region.
+ *
+ * The underlying allocator may sleep, so this kfunc is only callable from
+ * sleepable BPF programs.
+ *
+ * Return:
+ * * Kernel pointer to the start of the allocated region on success.
+ * * %NULL if @p__map is not an arena, @flags is non-zero, @page_cnt is zero
+ *   or exceeds the arena size, @addr__ign is misaligned or outside the
+ *   arena, @node_id is invalid, or the kernel is out of memory.
+ */
 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
 					int node_id, u64 flags)
 {
@@ -893,6 +920,23 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
 
 	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
 }
+
+/**
+ * bpf_arena_free_pages() - Free a range of pages within a BPF arena.
+ * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
+ * @ptr__ign: User-space virtual address of the first page to free, as used
+ *	      to address the arena from BPF and user space. Typically the
+ *	      same address that was previously returned (in user-space form)
+ *	      by bpf_arena_alloc_pages().
+ * @page_cnt: Number of pages to free.
+ *
+ * Releases the backing pages, unmapping them from the arena's kernel VMA
+ * and from any user-space VMA that previously faulted them in. May sleep,
+ * so the kfunc is callable only from sleepable BPF programs.
+ *
+ * The call is a no-op when @p__map is not an arena, when @page_cnt is zero,
+ * or when @ptr__ign is %NULL.
+ */
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
@@ -913,6 +957,26 @@ void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_c
 	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
 }
 
+/**
+ * bpf_arena_reserve_pages() - Reserve a page range within a BPF arena.
+ * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
+ * @ptr__ign: Page-aligned user-space virtual address of the start of the
+ *	      range to reserve.
+ * @page_cnt: Number of pages to reserve. Zero is permitted and is a no-op.
+ *
+ * Marks @page_cnt pages starting at @ptr__ign as reserved so that subsequent
+ * bpf_arena_alloc_pages() calls will not place allocations in that range.
+ * No physical pages are allocated by this kfunc; the range is simply
+ * excluded from the arena's free space.
+ *
+ * Return:
+ * * 0 on success, or when @page_cnt is zero.
+ * * -EINVAL if @p__map is not an arena or the requested range falls outside
+ *   the arena's user VMA.
+ * * -EBUSY if any page in the requested range is already allocated, or if
+ *   contention on the arena's internal spinlock prevents the operation from
+ *   completing.
+ */
 __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH bpf-next] bpf: Add kernel-doc for arena page kfuncs
From: Emil Tsalapatis @ 2026-05-21  4:56 UTC (permalink / raw)
  To: Dhiraj Shah, bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, memxor, song,
	yonghong.song, jolsa, corbet, skhan, linux-doc, linux-kernel
In-Reply-To: <20260521043553.199781-1-find.dhiraj@gmail.com>

On Thu May 21, 2026 at 12:35 AM EDT, Dhiraj Shah wrote:
> The page-management kfuncs exposed by BPF arena -
> bpf_arena_alloc_pages(), bpf_arena_free_pages() and
> bpf_arena_reserve_pages() - are part of the BPF kfunc ABI but lack
> rendered documentation. Their contracts (valid argument ranges,
> sleepable-only context, and the set of error returns) are today only
> discoverable by reading kernel/bpf/arena.c.
>
> Add a kernel-doc comment block above each of the three kfuncs and
> render them under a new "BPF arena kfuncs" subsection in
> Documentation/bpf/kfuncs.rst, alongside the existing core kfunc
> subsections.
>
> No functional change.
>
> Signed-off-by: Dhiraj Shah <find.dhiraj@gmail.com>
> ---
>  Documentation/bpf/kfuncs.rst | 27 +++++++++++++++
>  kernel/bpf/arena.c           | 64 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 91 insertions(+)
>
> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..fe0df1e16453 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst
> @@ -732,3 +732,30 @@ the verifier. bpf_cgroup_ancestor() can be used as follows:
>  BPF provides a set of kfuncs that can be used to query, allocate, mutate, and
>  destroy struct cpumask * objects. Please refer to :ref:`cpumasks-header-label`
>  for more details.
> +
> +4.4 BPF arena kfuncs
> +--------------------
> +
> +A BPF arena (``BPF_MAP_TYPE_ARENA``) is a sparsely-populated shared memory
> +region that a BPF program and a user-space process can both address. The
> +following kfuncs allow a sleepable BPF program to allocate, free, and reserve
> +pages within an arena:
> +
> +.. kernel-doc:: kernel/bpf/arena.c
> +   :identifiers: bpf_arena_alloc_pages bpf_arena_free_pages bpf_arena_reserve_pages
> +
> +A typical pattern is to allocate one or more pages, write to them from BPF,
> +and let user space observe the same memory after a page fault populates its
> +VMA:

Maybe slight rephrase? This description is a bit dense. E.g.,

"...and let user space access the pages through a mapping in its address space."

> +
> +.. code-block:: c
> +
> +	void __arena *page;
> +
> +	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	/* ... use the page from BPF; user space sees the same bytes ... */
> +
> +	bpf_arena_free_pages(&arena, page, 1);
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 49a8f7b1beef..b8ec2953dee6 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -870,6 +870,33 @@ static void arena_free_irq(struct irq_work *iw)
>  
>  __bpf_kfunc_start_defs();
>  
> +/**
> + * bpf_arena_alloc_pages() - Allocate pages within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @addr__ign: Page-aligned user-space address within the arena at which to
> + *	       place the allocation, or %NULL to let the kernel choose. When
> + *	       non-NULL the address must fall inside the arena's user VMA
> + *	       range; otherwise the allocation fails.
> + * @page_cnt: Number of pages to allocate. Must be non-zero and no greater
> + *	      than the arena's configured size in pages.
> + * @node_id: NUMA node hint for the backing pages, or %NUMA_NO_NODE.
> + * @flags: Reserved for future use; must be 0.
> + *
> + * Allocates @page_cnt physically-backed pages and inserts them into the
> + * arena's kernel VMA at the offset corresponding to @addr__ign (or at an
> + * arbitrary free offset when @addr__ign is %NULL). A subsequent user-space
> + * page fault on the matching user address populates the user VMA with the
> + * same pages, giving BPF and user space a shared view of the region.
> + *
> + * The underlying allocator may sleep, so this kfunc is only callable from
> + * sleepable BPF programs.

I think this is half the story, since the verifier adjusts the call to
the function to the non-sleepable version when necessary. So the kfunc
is technically only callable from sleepable BPF programs but it never
will be thanks to the verifier.

> + *
> + * Return:
> + * * Kernel pointer to the start of the allocated region on success.
> + * * %NULL if @p__map is not an arena, @flags is non-zero, @page_cnt is zero
> + *   or exceeds the arena size, @addr__ign is misaligned or outside the
> + *   arena, @node_id is invalid, or the kernel is out of memory.
> + */
>  __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
>  					int node_id, u64 flags)
>  {
> @@ -893,6 +920,23 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
>  
>  	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
>  }
> +
> +/**
> + * bpf_arena_free_pages() - Free a range of pages within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @ptr__ign: User-space virtual address of the first page to free, as used
> + *	      to address the arena from BPF and user space. Typically the
> + *	      same address that was previously returned (in user-space form)
> + *	      by bpf_arena_alloc_pages().
> + * @page_cnt: Number of pages to free.
> + *
> + * Releases the backing pages, unmapping them from the arena's kernel VMA
> + * and from any user-space VMA that previously faulted them in. May sleep,
> + * so the kfunc is callable only from sleepable BPF programs.

Same here.

> + *
> + * The call is a no-op when @p__map is not an arena, when @page_cnt is zero,
> + * or when @ptr__ign is %NULL.
> + */
>  __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  	struct bpf_map *map = p__map;
> @@ -913,6 +957,26 @@ void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_c
>  	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
>  }
>  
> +/**
> + * bpf_arena_reserve_pages() - Reserve a page range within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @ptr__ign: Page-aligned user-space virtual address of the start of the
> + *	      range to reserve.
> + * @page_cnt: Number of pages to reserve. Zero is permitted and is a no-op.
> + *
> + * Marks @page_cnt pages starting at @ptr__ign as reserved so that subsequent
> + * bpf_arena_alloc_pages() calls will not place allocations in that range.
> + * No physical pages are allocated by this kfunc; the range is simply
> + * excluded from the arena's free space.
> + *
> + * Return:
> + * * 0 on success, or when @page_cnt is zero.
> + * * -EINVAL if @p__map is not an arena or the requested range falls outside
> + *   the arena's user VMA.
> + * * -EBUSY if any page in the requested range is already allocated, or if
> + *   contention on the arena's internal spinlock prevents the operation from
> + *   completing.
> + */
>  __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  	struct bpf_map *map = p__map;


^ permalink raw reply

* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Vernon Yang @ 2026-05-21  5:11 UTC (permalink / raw)
  To: Wei Yang
  Cc: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260521024654.2a7teoe665porz76@master>

On Thu, May 21, 2026 at 02:46:54AM +0000, Wei Yang wrote:
> On Thu, May 21, 2026 at 10:36:15AM +0800, Vernon Yang wrote:
> >On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
> >> Enable khugepaged to collapse to mTHP orders. This patch implements the
> >> main scanning logic using a bitmap to track occupied pages and a stack
> >> structure that allows us to find optimal collapse sizes.
> >>
> >> Previous to this patch, PMD collapse had 3 main phases, a light weight
> >> scanning phase (mmap_read_lock) that determines a potential PMD
> >> collapse, an alloc phase (mmap unlocked), then finally heavier collapse
> >> phase (mmap_write_lock).
> >>
> >> To enabled mTHP collapse we make the following changes:
> >>
> >> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> >> orders are enabled, we remove the restriction of max_ptes_none during the
> >> scan phase to avoid missing potential mTHP collapse candidates. Once we
> >> have scanned the full PMD range and updated the bitmap to track occupied
> >> pages, we use the bitmap to find the optimal mTHP size.
> >>
> >> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> >> and determine the best eligible order for the collapse. A stack structure
> >> is used instead of traditional recursion to manage the search. This also
> >> prevents a traditional recursive approach when the kernel stack struct is
> >> limited. The algorithm recursively splits the bitmap into smaller chunks to
> >> find the highest order mTHPs that satisfy the collapse criteria. We start
> >> by attempting the PMD order, then moved on the consecutively lower orders
> >> (mTHP collapse). The stack maintains a pair of variables (offset, order),
> >> indicating the number of PTEs from the start of the PMD, and the order of
> >> the potential collapse candidate.
> >>
> >> The algorithm for consuming the bitmap works as such:
> >>     1) push (0, HPAGE_PMD_ORDER) onto the stack
> >>     2) pop the stack
> >>     3) check if the number of set bits in that (offset,order) pair
> >>        statisfy the max_ptes_none threshold for that order
> >>     4) if yes, attempt collapse
> >>     5) if no (or collapse fails), push two new stack items representing
> >>        the left and right halves of the current bitmap range, at the
> >>        next lower order
> >>     6) repeat at step (2) until stack is empty.
> >>
> >> Below is a diagram representing the algorithm and stack items:
> >>
> >>                             offset   mid_offset
> >>                             |        |
> >>                             |        |
> >>                             v        v
> >>           ____________________________________
> >>          |          PTE Page Table            |
> >>          --------------------------------------
> >> 			    <-------><------->
> >>                              order-1  order-1
> >>
> >> mTHP collapses reject regions containing swapped out or shared pages.
> >> This is because adding new entries can lead to new none pages, and these
> >> may lead to constant promotion into a higher order mTHP. A similar
> >> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> >> introducing at least 2x the number of pages, and on a future scan will
> >> satisfy the promotion condition once again. This issue is prevented via
> >> the collapse_max_ptes_none() function which imposes the max_ptes_none
> >> restrictions above.
> >>
> >> We currently only support mTHP collapse for max_ptes_none values of 0
> >> and HPAGE_PMD_NR - 1. resulting in the following behavior:
> >>
> >>     - max_ptes_none=0: Never introduce new empty pages during collapse
> >>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
> >>       available mTHP order
> >>
> >> Any other max_ptes_none value will emit a warning and skip mTHP collapse
> >> attempts. There should be no behavior change for PMD collapse.
> >>
> >> Once we determine what mTHP sizes fits best in that PMD range a collapse
> >> is attempted. A minimum collapse order of 2 is used as this is the lowest
> >> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
> >>
> >> Currently madv_collapse is not supported and will only attempt PMD
> >> collapse.
> >>
> >> We can also remove the check for is_khugepaged inside the PMD scan as
> >> the collapse_max_ptes_none() function handles this logic now.
> >>
> >> Signed-off-by: Nico Pache <npache@redhat.com>
> >> ---
> >>  mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 174 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> index 3492b135d667..39bf7ea8a6e8 100644
> >> --- a/mm/khugepaged.c
> >> +++ b/mm/khugepaged.c
> >> @@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >>
> >>  static struct kmem_cache *mm_slot_cache __ro_after_init;
> >>
> >> +#define KHUGEPAGED_MIN_MTHP_ORDER	2
> >> +/*
> >> + * mthp_collapse() does an iterative DFS over a binary tree, from
> >> + * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
> >> + * size needed for a DFS on a binary tree is height + 1, where
> >> + * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
> >> + *
> >> + * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
> >> + * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
> >> + */
> >> +#define MTHP_STACK_SIZE	(ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
> >> +
> >> +/*
> >> + * Defines a range of PTE entries in a PTE page table which are being
> >> + * considered for mTHP collapse.
> >> + *
> >> + * @offset: the offset of the first PTE entry in a PMD range.
> >> + * @order: the order of the PTE entries being considered for collapse.
> >> + */
> >> +struct mthp_range {
> >> +	u16 offset;
> >> +	u8 order;
> >> +};
> >> +
> >>  struct collapse_control {
> >>  	bool is_khugepaged;
> >>
> >> @@ -111,6 +135,12 @@ struct collapse_control {
> >>
> >>  	/* nodemask for allocation fallback */
> >>  	nodemask_t alloc_nmask;
> >> +
> >> +	/* Each bit represents a single occupied (!none/zero) page. */
> >> +	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
> >> +	/* A mask of the current range being considered for mTHP collapse. */
> >> +	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> >> +	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
> >>  };
> >>
> >>  /**
> >> @@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
> >>  	return result;
> >>  }
> >>
> >> +static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
> >> +				     u16 offset, u8 order)
> >> +{
> >> +	const int size = *stack_size;
> >> +	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
> >> +
> >> +	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
> >> +	stack->order = order;
> >> +	stack->offset = offset;
> >> +	(*stack_size)++;
> >> +}
> >> +
> >> +static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
> >> +						 int *stack_size)
> >> +{
> >> +	const int size = *stack_size;
> >> +
> >> +	VM_WARN_ON_ONCE(size <= 0);
> >> +	(*stack_size)--;
> >> +	return cc->mthp_bitmap_stack[size - 1];
> >> +}
> >> +
> >> +static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
> >> +						u16 offset, unsigned int nr_ptes)
> >> +{
> >> +	bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> >> +	bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
> >> +	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> >> +}
> >> +
> >> +/*
> >> + * mthp_collapse() consumes the bitmap that is generated during
> >> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> >> + *
> >> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
> >> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> >> + * of the bitmap for collapse eligibility. The stack maintains a pair of
> >> + * variables (offset, order), indicating the number of PTEs from the start of
> >> + * the PMD, and the order of the potential collapse candidate respectively. We
> >> + * start at the PMD order and check if it is eligible for collapse; if not, we
> >> + * add two entries to the stack at a lower order to represent the left and right
> >> + * halves of the PTE page table we are examining.
> >> + *
> >> + *                         offset       mid_offset
> >> + *                         |         |
> >> + *                         |         |
> >> + *                         v         v
> >> + *      --------------------------------------
> >> + *      |          cc->mthp_bitmap            |
> >> + *      --------------------------------------
> >> + *                         <-------><------->
> >> + *                          order-1  order-1
> >> + *
> >> + * For each of these, we determine how many PTE entries are occupied in the
> >> + * range of PTE entries we propose to collapse, then we compare this to a
> >> + * threshold number of PTE entries which would need to be occupied for a
> >> + * collapse to be permitted at that order (accounting for max_ptes_none).
> >> + *
> >> + * If a collapse is permitted, we attempt to collapse the PTE range into a
> >> + * mTHP.
> >> + */
> >> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> >> +		int referenced, int unmapped, struct collapse_control *cc,
> >> +		unsigned long enabled_orders)
> >> +{
> >> +	unsigned int nr_occupied_ptes, nr_ptes;
> >> +	int max_ptes_none, collapsed = 0, stack_size = 0;
> >> +	unsigned long collapse_address;
> >> +	struct mthp_range range;
> >> +	u16 offset;
> >> +	u8 order;
> >> +
> >> +	collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
> >> +
> >> +	while (stack_size) {
> >> +		range = collapse_mthp_stack_pop(cc, &stack_size);
> >> +		order = range.order;
> >> +		offset = range.offset;
> >> +		nr_ptes = 1UL << order;
> >> +
> >> +		if (!test_bit(order, &enabled_orders))
> >> +			goto next_order;
> >> +
> >> +		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
> >> +
> >> +		if (max_ptes_none < 0)
> >> +			return collapsed;
> >> +
> >> +		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
> >> +							       nr_ptes);
> >> +
> >> +		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
> >> +			int ret;
> >> +
> >> +			collapse_address = address + offset * PAGE_SIZE;
> >> +			ret = collapse_huge_page(mm, collapse_address, referenced,
> >> +						 unmapped, cc, order);
> >> +			if (ret == SCAN_SUCCEED) {
> >> +				collapsed += nr_ptes;
> >> +				continue;
> >> +			}
> >> +		}
> >> +
> >> +next_order:
> >> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
> >
> >Hi Nico, thank you very much for your contributions to this series.
> >
> >I found a minor issue, for MADV_COLLAPSE, if collapse_huge_page() fails
> >for some reason (e.g. allocate folio), it goes to next_order and
> >continues splitting to the next small order. However, enabled_orders
> >only supports HPAGE_PMD_ORDER, so it keeps runing the split operations
> >without any effective work until KHUGEPAGED_MIN_MTHP_ORDER is reached
> >before exiting. For khugepaged, e.g. setting only 2MB to always, also
> >same phenomenon.
>
> Yes, but it does no actual work since it is checked after pop up.
>
> >
> >This does not affect the overall functionality of mthp collapse, just
> >redundant.
> >
> >The redundant operations can be easily skipped with the following
> >modification. If I miss some thing, please let me know. Thanks!
> >
> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >index 1a25af3d6d0f..fa407cce525c 100644
> >--- a/mm/khugepaged.c
> >+++ b/mm/khugepaged.c
> >@@ -1574,7 +1574,7 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> > 		}
> >
> > next_order:
> >-		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
> >+		if ((BIT(order) - 1) & enabled_orders) {
> > 			const u8 next_order = order - 1;
> > 			const u16 mid_offset = offset + (nr_ptes / 2);
> >
>
> This would stop the iteration if there are other lower enabled order, right?
             ^^^^                                  ^^^^^^^^^^^^^^^^^^^

NO :)

For more details, please refer to the following information.

|              Scenario               | Old Behavior (order > 2) | New Behavior ((BIT(order)-1) & enabled_orders) |
|-------------------------------------|--------------------------|------------------------------------------------|
| MADV_COLLAPSE                       | Splits 9,8,7,...,3       | No split                                       |
| khugepaged, only 2MB enabled        | Splits 9,8,7,...,3       | No split                                       |
| khugepaged, only 2MB + 64KB enabled | Splits 9,8,7,...,3       | Splits 9,8,7,...,5                             |
| khugepaged, only 32KB enabled       | Splits 9,8,7,...,3       | Splits 9,8,7,...,4                             |
| khugepaged, only 16KB enabled       | Splits 9,8,7,...,3       | Splits 9,8,7,...,3                             |
| khugepaged, all mTHP enabled        | Splits 9,8,7,...,3       | Splits 9,8,7,...,3                             |

--
Cheers,
Vernon

^ permalink raw reply

* Re: [PATCH v2] docs: submitting-patches: Clarify that "reviewer" is a person
From: Mauro Carvalho Chehab @ 2026-05-21  5:12 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Jonathan Corbet, Shuah Khan, workflows, linux-doc, linux-kernel,
	Greg Kroah-Hartman, Vlastimil Babka, Andrew Morton,
	David Hildenbrand, Linus Torvalds, Randy Dunlap, Mark Brown
In-Reply-To: <20260520154846.162170-2-krzysztof.kozlowski@oss.qualcomm.com>

On Wed, 20 May 2026 17:48:47 +0200
Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com> wrote:

> Common understanding of word "Reviewer" is: a person performing a review
> work [1]. Tools are not persons, thus cannot be reviewers in this term.
> Also tools cannot make statements and cannot take responsibility for the
> review.
> 
> Our docs already clearly mark that "Reviewed-by" must come from a
> person:
> 
>  - "By offering my Reviewed-by: tag, I state that:"
> 
>    Usage of first person "I" and word "state"
> 
>  - "A Reviewed-by tag is *a statement of opinion* that the patch is an
>     appropriate modification of the kernel without any remaining serious"
> 
>    Only a person can make a statement of opinion.
> 
>  - "Any interested reviewer (who has done the work) can offer a
>    Reviewed-by"
> 
>    A person can offer a tag thus above does not grant the tool
>    permission to offer a tag.
> 
> However this might not be enough, so let's clarify that only a person
> with a known identity can state the "Reviewer's statement of oversight".
> 
> Link: https://en.wiktionary.org/wiki/reviewer [1]
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Vlastimil Babka <vbabka@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@kernel.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
> Acked-by: Randy Dunlap <rdunlap@infradead.org>
> Reviewed-by: Mark Brown <broonie@kernel.org>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com>

Makes sense to me.

Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>

> ---
> 
> Changes in v2:
> 1. Add tags
> 2. Rephrase/simplify a bit commit msg. Rephrase title - drop "in
>    English".
> 3. Add "with known identity", suggested by David Hildenbrand. I retained
>    previous tags, assuming this change is within spirit of previous
>    version and there were no objections on the list.
> ---
>  Documentation/process/submitting-patches.rst | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
> index d7290e208e72..cc6a1f73d7f2 100644
> --- a/Documentation/process/submitting-patches.rst
> +++ b/Documentation/process/submitting-patches.rst
> @@ -581,12 +581,12 @@ By offering my Reviewed-by: tag, I state that:
>  
>  A Reviewed-by tag is a statement of opinion that the patch is an
>  appropriate modification of the kernel without any remaining serious
> -technical issues.  Any interested reviewer (who has done the work) can
> -offer a Reviewed-by tag for a patch.  This tag serves to give credit to
> -reviewers and to inform maintainers of the degree of review which has been
> -done on the patch.  Reviewed-by: tags, when supplied by reviewers known to
> -understand the subject area and to perform thorough reviews, will normally
> -increase the likelihood of your patch getting into the kernel.
> +technical issues.  Any interested reviewer (who has done the work and is a
> +person with known identity) can offer a Reviewed-by tag for a patch.  This tag
> +serves to give credit to reviewers and to inform maintainers of the degree of
> +review which has been done on the patch.  Reviewed-by: tags, when supplied by
> +reviewers known to understand the subject area and to perform thorough reviews,
> +will normally increase the likelihood of your patch getting into the kernel.
>  
>  Both Tested-by and Reviewed-by tags, once received on mailing list from tester
>  or reviewer, should be added by author to the applicable patches when sending



Thanks,
Mauro

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: Add kernel-doc for arena page kfuncs
From: bot+bpf-ci @ 2026-05-21  5:15 UTC (permalink / raw)
  To: find.dhiraj, bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, memxor, song,
	yonghong.song, jolsa, corbet, skhan, linux-doc, linux-kernel, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260521043553.199781-1-find.dhiraj@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3933 bytes --]

> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..fe0df1e16453 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst

[ ... ]

> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 49a8f7b1beef..b8ec2953dee6 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -870,6 +870,33 @@ static void arena_free_irq(struct irq_work *iw)
>
>  __bpf_kfunc_start_defs();
>
> +/**
> + * bpf_arena_alloc_pages() - Allocate pages within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @addr__ign: Page-aligned user-space address within the arena at which to
> + *	       place the allocation, or %NULL to let the kernel choose. When
> + *	       non-NULL the address must fall inside the arena's user VMA
> + *	       range; otherwise the allocation fails.
> + * @page_cnt: Number of pages to allocate. Must be non-zero and no greater
> + *	      than the arena's configured size in pages.
> + * @node_id: NUMA node hint for the backing pages, or %NUMA_NO_NODE.
> + * @flags: Reserved for future use; must be 0.
> + *
> + * Allocates @page_cnt physically-backed pages and inserts them into the
> + * arena's kernel VMA at the offset corresponding to @addr__ign (or at an
> + * arbitrary free offset when @addr__ign is %NULL). A subsequent user-space
> + * page fault on the matching user address populates the user VMA with the
> + * same pages, giving BPF and user space a shared view of the region.
> + *
> + * The underlying allocator may sleep, so this kfunc is only callable from
> + * sleepable BPF programs.
> + *
> + * Return:
> + * * Kernel pointer to the start of the allocated region on success.

Does the return value description match the implementation? Looking at
arena_alloc_pages() in kernel/bpf/arena.c, the function returns:

    return clear_lo32(arena->user_vm_start) + uaddr32;

which is a user-space virtual address derived from the arena's
user_vm_start. This matches the description for bpf_arena_free_pages()
later in this patch which notes the address is "in user-space form" when
previously returned by bpf_arena_alloc_pages().

Should the documentation state it returns a user-space pointer rather than
"Kernel pointer"?

> + * * %NULL if @p__map is not an arena, @flags is non-zero, @page_cnt is zero
> + *   or exceeds the arena size, @addr__ign is misaligned or outside the
> + *   arena, @node_id is invalid, or the kernel is out of memory.
> + */
>  __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
>  					int node_id, u64 flags)
>  {

[ ... ]

> @@ -913,6 +957,26 @@ void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_c
>  	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
>  }
>
> +/**
> + * bpf_arena_free_pages() - Free a range of pages within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @ptr__ign: User-space virtual address of the first page to free, as used
> + *	      to address the arena from BPF and user space. Typically the
> + *	      same address that was previously returned (in user-space form)
> + *	      by bpf_arena_alloc_pages().
> + * @page_cnt: Number of pages to free.
> + *
> + * Releases the backing pages, unmapping them from the arena's kernel VMA
> + * and from any user-space VMA that previously faulted them in. May sleep,
> + * so the kfunc is callable only from sleepable BPF programs.
> + *
> + * The call is a no-op when @p__map is not an arena, when @page_cnt is zero,
> + * or when @ptr__ign is %NULL.
> + */
>  __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
>  {

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26206146239

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: Add kernel-doc for arena page kfuncs
From: Alexei Starovoitov @ 2026-05-21  5:28 UTC (permalink / raw)
  To: Dhiraj Shah
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard, Kumar Kartikeya Dwivedi, Song Liu,
	Yonghong Song, Jiri Olsa, Jonathan Corbet, Shuah Khan,
	open list:DOCUMENTATION, LKML
In-Reply-To: <20260521043553.199781-1-find.dhiraj@gmail.com>

On Thu, May 21, 2026 at 6:36 AM Dhiraj Shah <find.dhiraj@gmail.com> wrote:
>
> The page-management kfuncs exposed by BPF arena -
> bpf_arena_alloc_pages(), bpf_arena_free_pages() and
> bpf_arena_reserve_pages() - are part of the BPF kfunc ABI but lack
> rendered documentation. Their contracts (valid argument ranges,
> sleepable-only context, and the set of error returns) are today only
> discoverable by reading kernel/bpf/arena.c.
>
> Add a kernel-doc comment block above each of the three kfuncs and
> render them under a new "BPF arena kfuncs" subsection in
> Documentation/bpf/kfuncs.rst, alongside the existing core kfunc
> subsections.
>
> No functional change.
>
> Signed-off-by: Dhiraj Shah <find.dhiraj@gmail.com>
> ---
>  Documentation/bpf/kfuncs.rst | 27 +++++++++++++++
>  kernel/bpf/arena.c           | 64 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 91 insertions(+)
>
> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..fe0df1e16453 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst
> @@ -732,3 +732,30 @@ the verifier. bpf_cgroup_ancestor() can be used as follows:
>  BPF provides a set of kfuncs that can be used to query, allocate, mutate, and
>  destroy struct cpumask * objects. Please refer to :ref:`cpumasks-header-label`
>  for more details.
> +
> +4.4 BPF arena kfuncs
> +--------------------
> +
> +A BPF arena (``BPF_MAP_TYPE_ARENA``) is a sparsely-populated shared memory
> +region that a BPF program and a user-space process can both address. The
> +following kfuncs allow a sleepable BPF program to allocate, free, and reserve
> +pages within an arena:
> +
> +.. kernel-doc:: kernel/bpf/arena.c
> +   :identifiers: bpf_arena_alloc_pages bpf_arena_free_pages bpf_arena_reserve_pages
> +
> +A typical pattern is to allocate one or more pages, write to them from BPF,
> +and let user space observe the same memory after a page fault populates its
> +VMA:
> +
> +.. code-block:: c
> +
> +       void __arena *page;
> +
> +       page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
> +       if (!page)
> +               return -ENOMEM;
> +
> +       /* ... use the page from BPF; user space sees the same bytes ... */
> +
> +       bpf_arena_free_pages(&arena, page, 1);
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 49a8f7b1beef..b8ec2953dee6 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -870,6 +870,33 @@ static void arena_free_irq(struct irq_work *iw)
>
>  __bpf_kfunc_start_defs();
>
> +/**
> + * bpf_arena_alloc_pages() - Allocate pages within a BPF arena.
> + * @p__map: Pointer to a ``BPF_MAP_TYPE_ARENA`` map.
> + * @addr__ign: Page-aligned user-space address within the arena at which to
> + *            place the allocation, or %NULL to let the kernel choose. When
> + *            non-NULL the address must fall inside the arena's user VMA
> + *            range; otherwise the allocation fails.
> + * @page_cnt: Number of pages to allocate. Must be non-zero and no greater
> + *           than the arena's configured size in pages.
> + * @node_id: NUMA node hint for the backing pages, or %NUMA_NO_NODE.
> + * @flags: Reserved for future use; must be 0.
> + *
> + * Allocates @page_cnt physically-backed pages and inserts them into the
> + * arena's kernel VMA at the offset corresponding to @addr__ign (or at an
> + * arbitrary free offset when @addr__ign is %NULL). A subsequent user-space
> + * page fault on the matching user address populates the user VMA with the
> + * same pages, giving BPF and user space a shared view of the region.
> + *
> + * The underlying allocator may sleep, so this kfunc is only callable from
> + * sleepable BPF programs.

what?

> + * Return:
> + * * Kernel pointer to the start of the allocated region on success.

what?

so much slop nowadays :(

pw-bot: cr

^ permalink raw reply

* Re: [PATCH bpf-next v11 3/8] bpf: allow non-owning list-node args via __nonown_allowed
From: Kaitao Cheng @ 2026-05-21  6:29 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: bpf, ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor, linux-kernel, linux-doc, vmalik,
	linux-kselftest, martin.lau, clm, ihor.solodrai
In-Reply-To: <96cd744d0a5a8756cb0c1e31e95fbe682956b97226e2159120785feb01013e53@mail.kernel.org>



在 2026/5/21 12:08, bot+bpf-ci@kernel.org 写道:
>> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
>> index 75e6c078e0e7..3a9db1108b95 100644
>> --- a/Documentation/bpf/kfuncs.rst
>> +++ b/Documentation/bpf/kfuncs.rst
> 
> [ ... ]
> 
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 8dd79b735a69..f3cf8d85bea0 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
> 
> [ ... ]
> 
>> @@ -12244,6 +12249,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>>  			return ret;
>>  		break;
>>  	case KF_ARG_PTR_TO_LIST_NODE:
>> +		if (is_kfunc_arg_nonown_allowed(btf, &args[i]) &&
>> +		    type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
>> +			/* Allow bpf_list_front/back return value for
>> +			 * __nonown_allowed list-node arguments.
>> +			 */
>> +			goto check_ok;
>> +		}
>>  		if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
>>  			verbose(env, "%s expected pointer to allocated object\n",
>>  				reg_arg_name(env, argno));
>>  @@ -12253,6 +12265,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
>>  			verbose(env, "allocated object must be referenced\n");
>>  			return -EINVAL;
>>  		}
>> +check_ok:
>>  		ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta);
>>  		if (ret < 0)
>>  			return ret;
> 
> This isn't a bug, but the multi-line comment doesn't match the BPF
> subsystem style. The BPF guide requires multi-line comments to have the
> opening /* on its own line for files under kernel/bpf/. Should this be:
> 
>   /*
>    * Allow bpf_list_front/back return value for
>    * __nonown_allowed list-node arguments.
>    */

This file uses this comment style extensively, so I followed the
existing convention here.

> Eduard Zingerman raised a related nit in v10 RESEND: the check
> '!reg->ref_obj_id' appears redundant. Would type_is_non_owning_ref() be
> sufficient here?
> 
> Reference: https://lore.kernel.org/bpf/0419643c9a04bf0824066742e52e3f197b43909b.camel@gmail.com/
> 

Sorry, I missed that. If needed, I can fix it in the next version.

> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* [PATCH 0/3] PCI: endpoint: Add PCI DMA endpoint function (part 3/3)
From: Koichiro Den @ 2026-05-21  6:36 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Krzysztof Wilczyński,
	Kishon Vijay Abraham I, Bjorn Helgaas, Jonathan Corbet,
	Shuah Khan, Vinod Koul, Frank Li, Arnd Bergmann, Damien Le Moal,
	Niklas Cassel
  Cc: Marek Vasut, Yoshihiro Shimoda, linux-pci, linux-doc,
	linux-kernel, dmaengine

Hi,

This is part 3 of three series for PCI endpoint DMA.

The three series are:

  * part 1: dmaengine: dw-edma: Prepare for PCI EP DMA
  * part 2: PCI: endpoint: Expose endpoint DMA resources
  * part 3: PCI: endpoint: Add PCI DMA endpoint function

This series adds the host-side metadata parser, the pci-epf-dma endpoint
function driver, and documentation.

The endpoint function exposes selected endpoint-integrated DMA channels as
a separate PCI DMA controller function. The host-side dw-edma-pcie driver
discovers the BAR metadata, requests the final layout, and registers the
exposed channels with DMAengine. Host clients then submit transfers through
the regular DMAengine API. The endpoint function keeps the metadata BAR
stable and uses a separate DMA window BAR for resources that need dynamic
subrange mappings.

No fixed PCI ID is assigned by this series. Users provide the PCI
vendor/device ID through configfs and bind dw-edma-pcie explicitly, for
example with driver_override.

Dependencies
============

This series depends on parts 1 and 2, applied on top of pci/endpoint:

  [PATCH 00/12] dmaengine: dw-edma: Prepare for PCI EP DMA (part 1/3)
  https://lore.kernel.org/all/20260521063115.2842238-1-den@valinux.co.jp/

  [PATCH 0/3] PCI: endpoint: Expose endpoint DMA resources (part 2/3)
  https://lore.kernel.org/all/20260521063405.2842644-1-den@valinux.co.jp/

Note
====

This series touches both dmaengine and PCI endpoint code. I kept the
dw-edma-pcie metadata parser together with the endpoint function so the
metadata producer and consumer can be reviewed in one place.

If the general direction looks acceptable, the dw-edma-pcie patch may need
a dmaengine Ack if this series is routed through the PCI endpoint tree.

Tested on
=========

The RC-to-EP data path was tested with a small out-of-tree DMAengine
client. The host submits a DMA_MEM_TO_DEV transfer through dw-edma-pcie,
which uses a DesignWare eDMA read channel to copy host memory into
endpoint memory.

Tested with:

  * R-Car S4 as endpoint and R-Car S4 as root complex
  * RK3588 as endpoint and CD8180 as root complex

Best regards,
Koichiro

Koichiro Den (3):
  dmaengine: dw-edma-pcie: Discover endpoint DMA metadata
  PCI: endpoint: Add DMA endpoint function
  Documentation: PCI: Add PCI DMA endpoint function documentation

 Documentation/PCI/endpoint/index.rst          |    2 +
 .../PCI/endpoint/pci-dma-function.rst         |  182 +++
 Documentation/PCI/endpoint/pci-dma-howto.rst  |  200 +++
 drivers/dma/dw-edma/dw-edma-pcie.c            |  369 ++++-
 drivers/pci/endpoint/functions/Kconfig        |   14 +
 drivers/pci/endpoint/functions/Makefile       |    1 +
 drivers/pci/endpoint/functions/pci-epf-dma.c  | 1361 +++++++++++++++++
 7 files changed, 2128 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/PCI/endpoint/pci-dma-function.rst
 create mode 100644 Documentation/PCI/endpoint/pci-dma-howto.rst
 create mode 100644 drivers/pci/endpoint/functions/pci-epf-dma.c

-- 
2.51.0

^ permalink raw reply

* [PATCH 1/3] dmaengine: dw-edma-pcie: Discover endpoint DMA metadata
From: Koichiro Den @ 2026-05-21  6:36 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Krzysztof Wilczyński,
	Kishon Vijay Abraham I, Bjorn Helgaas, Jonathan Corbet,
	Shuah Khan, Vinod Koul, Frank Li, Arnd Bergmann, Damien Le Moal,
	Niklas Cassel
  Cc: Marek Vasut, Yoshihiro Shimoda, linux-pci, linux-doc,
	linux-kernel, dmaengine
In-Reply-To: <20260521063638.2843021-1-den@valinux.co.jp>

Teach dw-edma-pcie to discover a PCI endpoint DMA function from
BAR-resident metadata. The metadata supplies the DMA register window,
channel counts, descriptor windows, optional auxiliary windows, and
endpoint-local descriptor and auxiliary addresses.

Endpoint-provided DMA channels use raw slave addresses because the host
programs transfers against endpoint physical addresses, not PCI BAR
addresses. Scope the default remote interrupt mode to the endpoint DMA
metadata match entry so EDDA and MDB keep their existing local interrupt
behavior.

Endpoint DMA metadata can be discovered after an explicit bind through
driver_override or a dynamic ID. For such binds, there is no static
match data, so the driver falls back to the generic endpoint DMA
metadata parser.

The endpoint polls HOST_REQ at a low idle rate before programming DMA
window submaps and setting READY. Let the host wait for several endpoint
poll periods before treating the READY handshake as timed out.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 drivers/dma/dw-edma/dw-edma-pcie.c | 369 ++++++++++++++++++++++++++++-
 1 file changed, 368 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index 2f752e8fb999..d4ae6df36858 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -11,9 +11,13 @@
 #include <linux/pci.h>
 #include <linux/device.h>
 #include <linux/dma/edma.h>
+#include <linux/iopoll.h>
 #include <linux/pci-epf.h>
 #include <linux/msi.h>
 #include <linux/bitfield.h>
+#include <linux/io.h>
+#include <linux/overflow.h>
+#include <linux/pci-ep-dma.h>
 #include <linux/sizes.h>
 
 #include "dw-edma-core.h"
@@ -44,6 +48,9 @@
 #define DW_PCIE_XILINX_MDB_DT_OFF_GAP		0x100000
 #define DW_PCIE_XILINX_MDB_DT_SIZE		0x800
 
+#define DW_PCIE_EP_DMA_READY_POLL_US		1000
+#define DW_PCIE_EP_DMA_READY_TIMEOUT_US		2000000
+
 #define DW_BLOCK(a, b, c) \
 	{ \
 		.bar = a, \
@@ -93,6 +100,12 @@ struct dw_edma_pcie_match_data {
 #define DW_EDMA_PCIE_F_RAW_SLAVE_ADDR	BIT(1)
 #define DW_EDMA_PCIE_F_REG_OFFSET	BIT(2)
 
+struct dw_edma_pcie_ep_dma_view {
+	struct pci_dev *pdev;
+	void __iomem *base;
+	resource_size_t limit;
+};
+
 static const struct dw_edma_pcie_data snps_edda_data = {
 	/* eDMA registers location */
 	.rg.bar				= BAR_0,
@@ -144,6 +157,13 @@ static const struct dw_edma_pcie_data xilinx_mdb_data = {
 	.rd_ch_cnt			= 8,
 };
 
+static const struct dw_edma_pcie_data ep_dma_data = {
+	.mf				= EDMA_MF_EDMA_UNROLL,
+	.irqs				= EDMA_MAX_WR_CH + EDMA_MAX_RD_CH,
+	.wr_ch_cnt			= EDMA_MAX_WR_CH,
+	.rd_ch_cnt			= EDMA_MAX_RD_CH,
+};
+
 static void dw_edma_set_chan_region_offset(struct dw_edma_pcie_data *pdata,
 					   enum pci_barno bar, off_t start_off,
 					   off_t ll_off_gap, size_t ll_size,
@@ -217,6 +237,82 @@ static const struct dw_edma_plat_ops dw_edma_pcie_raw_addr_plat_ops = {
 	.irq_vector = dw_edma_pcie_irq_vector,
 };
 
+static bool dw_edma_pcie_valid_bar(enum pci_barno bar)
+{
+	return bar >= BAR_0 && bar <= BAR_5;
+}
+
+static bool dw_edma_pcie_valid_bar_range(struct pci_dev *pdev,
+					 enum pci_barno bar, u64 off,
+					 size_t sz)
+{
+	resource_size_t bar_len;
+
+	if (!dw_edma_pcie_valid_bar(bar) || !sz)
+		return false;
+
+	bar_len = pci_resource_len(pdev, bar);
+
+	return off <= bar_len && sz <= bar_len - off;
+}
+
+static bool dw_edma_pcie_valid_block(struct pci_dev *pdev,
+				     const struct dw_edma_block *block)
+{
+	return dw_edma_pcie_valid_bar_range(pdev, block->bar, block->off,
+					    block->sz);
+}
+
+static bool dw_edma_pcie_ep_dma_bar_scannable(struct pci_dev *pdev,
+					      enum pci_barno bar)
+{
+	unsigned long flags = pci_resource_flags(pdev, bar);
+
+	if (!(flags & IORESOURCE_MEM))
+		return false;
+
+	if (flags & (IORESOURCE_UNSET | IORESOURCE_DISABLED))
+		return false;
+
+	return pci_resource_len(pdev, bar) >= PCI_EP_DMA_METADATA_HDR_LEN;
+}
+
+static u32 dw_edma_pcie_ep_dma_readl(struct dw_edma_pcie_ep_dma_view *view,
+				     u16 off)
+{
+	return readl(view->base + off);
+}
+
+static void dw_edma_pcie_ep_dma_writel(struct dw_edma_pcie_ep_dma_view *view,
+				       u16 off, u32 val)
+{
+	writel(val, view->base + off);
+}
+
+static u64 dw_edma_pcie_ep_dma_read64(struct dw_edma_pcie_ep_dma_view *view,
+				      u16 lo, u16 hi)
+{
+	u64 val;
+
+	val = dw_edma_pcie_ep_dma_readl(view, hi);
+
+	return (val << 32) | dw_edma_pcie_ep_dma_readl(view, lo);
+}
+
+static int dw_edma_pcie_ep_dma_read_off(struct dw_edma_pcie_ep_dma_view *view,
+					u16 lo, u16 hi, off_t *off)
+{
+	u64 val;
+
+	val = dw_edma_pcie_ep_dma_read64(view, lo, hi);
+	if (val > type_max(*off))
+		return -EINVAL;
+
+	*off = val;
+
+	return 0;
+}
+
 static void dw_edma_pcie_get_synopsys_dma_data(struct pci_dev *pdev,
 					       struct dw_edma_pcie_data *pdata)
 {
@@ -318,6 +414,265 @@ static void dw_edma_pcie_get_xilinx_dma_data(struct pci_dev *pdev,
 	pdata->devmem_phys_off = off;
 }
 
+static int
+dw_edma_pcie_parse_ep_dma_ch_table(struct dw_edma_pcie_ep_dma_view *view,
+				   struct dw_edma_pcie_data *pdata,
+				   u16 table_off, u16 entry_size, u16 ch_cnt,
+				   bool write)
+{
+	struct dw_edma_block *desc_blocks = write ? pdata->ll_wr : pdata->ll_rd;
+	struct dw_edma_block *data_blocks = write ? pdata->dt_wr : pdata->dt_rd;
+	u32 ctrl;
+	u16 i;
+	int ret;
+
+	for (i = 0; i < ch_cnt; i++) {
+		struct dw_edma_block *desc_block = &desc_blocks[i];
+		struct dw_edma_block *data_block = &data_blocks[i];
+		u16 off = table_off + i * entry_size;
+		u16 field, lo, hi;
+
+		field = off + PCI_EP_DMA_METADATA_CH_CTRL;
+		ctrl = dw_edma_pcie_ep_dma_readl(view, field);
+		if (FIELD_GET(PCI_EP_DMA_METADATA_CH_CTRL_HW_CH, ctrl) != i)
+			return -EOPNOTSUPP;
+
+		desc_block->bar =
+			FIELD_GET(PCI_EP_DMA_METADATA_CH_CTRL_DESC_BAR, ctrl);
+		lo = off + PCI_EP_DMA_METADATA_CH_DESC_OFF_LO;
+		hi = off + PCI_EP_DMA_METADATA_CH_DESC_OFF_HI;
+		ret = dw_edma_pcie_ep_dma_read_off(view, lo, hi,
+						   &desc_block->off);
+		if (ret)
+			return ret;
+		field = off + PCI_EP_DMA_METADATA_CH_DESC_SIZE;
+		desc_block->sz = dw_edma_pcie_ep_dma_readl(view, field);
+		lo = off + PCI_EP_DMA_METADATA_CH_DESC_ADDR_LO;
+		hi = off + PCI_EP_DMA_METADATA_CH_DESC_ADDR_HI;
+		desc_block->paddr =
+			dw_edma_pcie_ep_dma_read64(view, lo, hi);
+		desc_block->paddr_valid = true;
+		if (!dw_edma_pcie_valid_block(view->pdev, desc_block))
+			return -EINVAL;
+
+		*data_block = (struct dw_edma_block) { .bar = NO_BAR };
+		if (!(ctrl & PCI_EP_DMA_METADATA_CH_CTRL_AUX_VALID))
+			continue;
+
+		data_block->bar =
+			FIELD_GET(PCI_EP_DMA_METADATA_CH_CTRL_AUX_BAR, ctrl);
+		lo = off + PCI_EP_DMA_METADATA_CH_AUX_OFF_LO;
+		hi = off + PCI_EP_DMA_METADATA_CH_AUX_OFF_HI;
+		ret = dw_edma_pcie_ep_dma_read_off(view, lo, hi,
+						   &data_block->off);
+		if (ret)
+			return ret;
+		field = off + PCI_EP_DMA_METADATA_CH_AUX_SIZE;
+		data_block->sz = dw_edma_pcie_ep_dma_readl(view, field);
+		lo = off + PCI_EP_DMA_METADATA_CH_AUX_ADDR_LO;
+		hi = off + PCI_EP_DMA_METADATA_CH_AUX_ADDR_HI;
+		data_block->paddr =
+			dw_edma_pcie_ep_dma_read64(view, lo, hi);
+		data_block->paddr_valid = true;
+		if (!dw_edma_pcie_valid_block(view->pdev, data_block))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+dw_edma_pcie_ep_dma_wait_ready(struct dw_edma_pcie_ep_dma_view *view)
+{
+	u32 val;
+
+	return read_poll_timeout(dw_edma_pcie_ep_dma_readl, val,
+				 val & PCI_EP_DMA_METADATA_CTRL_READY,
+				 DW_PCIE_EP_DMA_READY_POLL_US,
+				 DW_PCIE_EP_DMA_READY_TIMEOUT_US, false,
+				 view, PCI_EP_DMA_METADATA_CTRL);
+}
+
+static int
+dw_edma_pcie_validate_ep_dma_metadata(struct dw_edma_pcie_ep_dma_view *view,
+				      u32 *metadata_ctrl, u8 *reg_layout_data)
+{
+	size_t table_size, table_end;
+	enum pci_barno reg_bar;
+	u16 len, entry_size;
+	u16 wr_ch_cnt, rd_ch_cnt;
+	u8 layout, layout_data;
+	u32 val;
+
+	val = dw_edma_pcie_ep_dma_readl(view, 0);
+	if (val != PCI_EP_DMA_METADATA_MAGIC)
+		return -ENODEV;
+
+	val = dw_edma_pcie_ep_dma_readl(view, PCI_EP_DMA_METADATA_HDR);
+	if (FIELD_GET(PCI_EP_DMA_METADATA_HDR_REV, val) !=
+	    PCI_EP_DMA_METADATA_REV)
+		return -EINVAL;
+
+	len = FIELD_GET(PCI_EP_DMA_METADATA_HDR_LEN_FIELD, val);
+	if (len < PCI_EP_DMA_METADATA_HDR_LEN)
+		return -EINVAL;
+	if (len > view->limit)
+		return -EINVAL;
+
+	val = dw_edma_pcie_ep_dma_readl(view, PCI_EP_DMA_METADATA_REG_LAYOUT);
+	layout = FIELD_GET(PCI_EP_DMA_METADATA_REG_LAYOUT_ID, val);
+	if (layout != PCI_EP_DMA_METADATA_REG_LAYOUT_DW_EDMA)
+		return -EOPNOTSUPP;
+
+	layout_data = FIELD_GET(PCI_EP_DMA_METADATA_REG_LAYOUT_DATA, val);
+	if (layout_data == EDMA_MF_EDMA_LEGACY ||
+	    layout_data == EDMA_MF_HDMA_NATIVE)
+		return -EOPNOTSUPP;
+	if (layout_data != EDMA_MF_EDMA_UNROLL &&
+	    layout_data != EDMA_MF_HDMA_COMPAT)
+		return -EINVAL;
+
+	val = dw_edma_pcie_ep_dma_readl(view, PCI_EP_DMA_METADATA_CTRL);
+	reg_bar = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_REG_BAR, val);
+	if (!dw_edma_pcie_valid_bar(reg_bar))
+		return -EINVAL;
+
+	wr_ch_cnt = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_WR_CH_COUNT, val);
+	rd_ch_cnt = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_RD_CH_COUNT, val);
+	if (!wr_ch_cnt && !rd_ch_cnt)
+		return -EINVAL;
+	if (wr_ch_cnt > EDMA_MAX_WR_CH || rd_ch_cnt > EDMA_MAX_RD_CH)
+		return -EINVAL;
+
+	entry_size = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_CH_ENTRY_SIZE, val);
+	if (entry_size < PCI_EP_DMA_METADATA_CH_ENTRY_SIZE ||
+	    entry_size % sizeof(u32))
+		return -EINVAL;
+
+	if (check_mul_overflow((size_t)(wr_ch_cnt + rd_ch_cnt),
+			       (size_t)entry_size, &table_size) ||
+	    check_add_overflow((size_t)PCI_EP_DMA_METADATA_HDR_LEN,
+			       table_size, &table_end) ||
+	    table_end > len)
+		return -EINVAL;
+
+	if (metadata_ctrl)
+		*metadata_ctrl = val;
+	if (reg_layout_data)
+		*reg_layout_data = layout_data;
+
+	return 0;
+}
+
+static int
+dw_edma_pcie_parse_ep_dma_data(struct dw_edma_pcie_ep_dma_view *view,
+			       struct dw_edma_pcie_data *pdata)
+{
+	u32 ctrl, reg_sz;
+	u8 reg_layout_data;
+	u64 reg_off;
+	u16 wr_table, rd_table, entry_size;
+	u16 wr_ch_cnt, rd_ch_cnt;
+	int ret;
+
+	ret = dw_edma_pcie_validate_ep_dma_metadata(view, &ctrl,
+						    &reg_layout_data);
+	if (ret)
+		return ret;
+
+	pci_dbg(view->pdev, "Detected PCI endpoint DMA BAR metadata\n");
+
+	pdata->mf = reg_layout_data;
+	pdata->rg.bar = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_REG_BAR, ctrl);
+
+	wr_ch_cnt = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_WR_CH_COUNT, ctrl);
+	rd_ch_cnt = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_RD_CH_COUNT, ctrl);
+	pdata->wr_ch_cnt = min_t(u16, pdata->wr_ch_cnt, wr_ch_cnt);
+	pdata->rd_ch_cnt = min_t(u16, pdata->rd_ch_cnt, rd_ch_cnt);
+	pdata->irqs = pdata->wr_ch_cnt + pdata->rd_ch_cnt;
+	reg_off = dw_edma_pcie_ep_dma_read64(view,
+					     PCI_EP_DMA_METADATA_REG_OFF_LO,
+					     PCI_EP_DMA_METADATA_REG_OFF_HI);
+	reg_sz = dw_edma_pcie_ep_dma_readl(view, PCI_EP_DMA_METADATA_REG_SIZE);
+	if (reg_off > type_max(pdata->rg.off) ||
+	    !dw_edma_pcie_valid_bar_range(view->pdev, pdata->rg.bar,
+					  reg_off, reg_sz))
+		return -EINVAL;
+	pdata->rg.off = reg_off;
+	pdata->rg.sz = reg_sz;
+
+	entry_size = FIELD_GET(PCI_EP_DMA_METADATA_CTRL_CH_ENTRY_SIZE, ctrl);
+	wr_table = PCI_EP_DMA_METADATA_HDR_LEN;
+	rd_table = PCI_EP_DMA_METADATA_HDR_LEN + wr_ch_cnt * entry_size;
+
+	ret = dw_edma_pcie_parse_ep_dma_ch_table(view, pdata, wr_table,
+						 entry_size, pdata->wr_ch_cnt,
+						 true);
+	if (ret)
+		return ret;
+
+	return dw_edma_pcie_parse_ep_dma_ch_table(view, pdata, rd_table,
+						  entry_size,
+						  pdata->rd_ch_cnt, false);
+}
+
+static int
+dw_edma_pcie_parse_ep_dma_caps(struct pci_dev *pdev,
+			       struct dw_edma_pcie_data *pdata, bool *non_ll)
+{
+	struct dw_edma_pcie_ep_dma_view metadata_view;
+	void __iomem *base;
+	resource_size_t bar_len;
+	enum pci_barno bar;
+	u32 ctrl;
+	int ret;
+
+	for (bar = BAR_0; bar < PCI_STD_NUM_BARS; bar++) {
+		if (!dw_edma_pcie_ep_dma_bar_scannable(pdev, bar))
+			continue;
+
+		bar_len = pci_resource_len(pdev, bar);
+		base = pci_iomap_range(pdev, bar, 0, 0);
+		if (!base)
+			continue;
+
+		metadata_view = (struct dw_edma_pcie_ep_dma_view) {
+			.pdev = pdev,
+			.base = base,
+			.limit = bar_len,
+		};
+		ret = dw_edma_pcie_validate_ep_dma_metadata(&metadata_view,
+							    NULL, NULL);
+		if (ret == -ENODEV) {
+			pci_iounmap(metadata_view.pdev, base);
+			continue;
+		}
+		if (ret) {
+			pci_iounmap(metadata_view.pdev, base);
+			return ret;
+		}
+
+		ctrl = dw_edma_pcie_ep_dma_readl(&metadata_view,
+						 PCI_EP_DMA_METADATA_CTRL);
+		ctrl |= PCI_EP_DMA_METADATA_CTRL_HOST_REQ;
+		dw_edma_pcie_ep_dma_writel(&metadata_view,
+					   PCI_EP_DMA_METADATA_CTRL, ctrl);
+
+		ret = dw_edma_pcie_ep_dma_wait_ready(&metadata_view);
+		if (ret) {
+			pci_iounmap(metadata_view.pdev, base);
+			return ret;
+		}
+
+		ret = dw_edma_pcie_parse_ep_dma_data(&metadata_view, pdata);
+		pci_iounmap(metadata_view.pdev, base);
+
+		return ret;
+	}
+
+	return -ENODEV;
+}
+
 static int
 dw_edma_pcie_parse_synopsys_caps(struct pci_dev *pdev,
 				 struct dw_edma_pcie_data *pdata, bool *non_ll)
@@ -357,6 +712,14 @@ dw_edma_pcie_parse_xilinx_caps(struct pci_dev *pdev,
 	return 0;
 }
 
+static const struct dw_edma_pcie_match_data ep_dma_match_data = {
+	.data = &ep_dma_data,
+	.parse_caps = dw_edma_pcie_parse_ep_dma_caps,
+	.flags = DW_EDMA_PCIE_F_REG_OFFSET | DW_EDMA_PCIE_F_RAW_SLAVE_ADDR,
+	.chip_flags = DW_EDMA_CHIP_PARTIAL,
+	.default_irq_mode = DW_EDMA_CH_IRQ_REMOTE,
+};
+
 static u64 dw_edma_get_phys_addr(struct pci_dev *pdev,
 				 const struct dw_edma_pcie_match_data *match,
 				 struct dw_edma_pcie_data *pdata,
@@ -384,7 +747,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 			      const struct pci_device_id *pid)
 {
 	const struct dw_edma_pcie_match_data *match = (void *)pid->driver_data;
-	const struct dw_edma_pcie_data *pdata = match->data;
+	const struct dw_edma_pcie_data *pdata;
 	struct device *dev = &pdev->dev;
 	struct dw_edma_chip *chip;
 	int err, nr_irqs;
@@ -398,6 +761,10 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		return err;
 	}
 
+	if (!match)
+		match = &ep_dma_match_data;
+	pdata = match->data;
+
 	struct dw_edma_pcie_data *dma_data __free(kfree) =
 		kmemdup(pdata, sizeof(*dma_data), GFP_KERNEL);
 	if (!dma_data)
-- 
2.51.0


^ permalink raw reply related

* [PATCH 2/3] PCI: endpoint: Add DMA endpoint function
From: Koichiro Den @ 2026-05-21  6:36 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Krzysztof Wilczyński,
	Kishon Vijay Abraham I, Bjorn Helgaas, Jonathan Corbet,
	Shuah Khan, Vinod Koul, Frank Li, Arnd Bergmann, Damien Le Moal,
	Niklas Cassel
  Cc: Marek Vasut, Yoshihiro Shimoda, linux-pci, linux-doc,
	linux-kernel, dmaengine
In-Reply-To: <20260521063638.2843021-1-den@valinux.co.jp>

Add pci-epf-dma, an endpoint function that exposes selected
endpoint-integrated DMA channels as a separate PCI DMA controller
function.

The function consumes EPC auxiliary DMA resources, publishes a stable
metadata BAR for host discovery, and uses a DMA window BAR for DMA
resources that are not already host-visible. After the host-side driver
finds the metadata and requests the final layout, the endpoint function
programs DMA window BAR submaps and marks the metadata ready.

The endpoint function does not bake in a vendor/device ID. As with other
generic endpoint functions, users provide the PCI IDs through the common
EPF configfs header attributes.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 drivers/pci/endpoint/functions/Kconfig       |   14 +
 drivers/pci/endpoint/functions/Makefile      |    1 +
 drivers/pci/endpoint/functions/pci-epf-dma.c | 1361 ++++++++++++++++++
 3 files changed, 1376 insertions(+)
 create mode 100644 drivers/pci/endpoint/functions/pci-epf-dma.c

diff --git a/drivers/pci/endpoint/functions/Kconfig b/drivers/pci/endpoint/functions/Kconfig
index bb5a23994288..078ac19dc772 100644
--- a/drivers/pci/endpoint/functions/Kconfig
+++ b/drivers/pci/endpoint/functions/Kconfig
@@ -39,6 +39,20 @@ config PCI_EPF_VNTB
 
 	  If in doubt, say "N" to disable Endpoint NTB driver.
 
+config PCI_EPF_DMA
+	tristate "PCI Endpoint DMA driver"
+	depends on PCI_ENDPOINT
+	select CONFIGFS_FS
+	select DMA_ENGINE
+	help
+	  Select this configuration option to expose an endpoint-integrated
+	  DMA controller as a PCI endpoint function. The function advertises
+	  the DMA controller layout to the host using BAR-resident metadata
+	  and maps resources that are not already host-visible into the
+	  DMA window BAR.
+
+	  If in doubt, say "N" to disable Endpoint DMA driver.
+
 config PCI_EPF_MHI
 	tristate "PCI Endpoint driver for MHI bus"
 	depends on PCI_ENDPOINT && MHI_BUS_EP
diff --git a/drivers/pci/endpoint/functions/Makefile b/drivers/pci/endpoint/functions/Makefile
index 696473fce50e..de92f6897b8f 100644
--- a/drivers/pci/endpoint/functions/Makefile
+++ b/drivers/pci/endpoint/functions/Makefile
@@ -6,4 +6,5 @@
 obj-$(CONFIG_PCI_EPF_TEST)		+= pci-epf-test.o
 obj-$(CONFIG_PCI_EPF_NTB)		+= pci-epf-ntb.o
 obj-$(CONFIG_PCI_EPF_VNTB) 		+= pci-epf-vntb.o
+obj-$(CONFIG_PCI_EPF_DMA)		+= pci-epf-dma.o
 obj-$(CONFIG_PCI_EPF_MHI)		+= pci-epf-mhi.o
diff --git a/drivers/pci/endpoint/functions/pci-epf-dma.c b/drivers/pci/endpoint/functions/pci-epf-dma.c
new file mode 100644
index 000000000000..d7761966eca2
--- /dev/null
+++ b/drivers/pci/endpoint/functions/pci-epf-dma.c
@@ -0,0 +1,1361 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI endpoint function that exposes an endpoint-integrated DMA controller
+ * to the PCI host.
+ *
+ * The host-side dw-edma-pcie driver consumes the BAR metadata published
+ * by this function.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/configfs.h>
+#include <linux/dma/edma.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/pci-ep-dma.h>
+#include <linux/pci-epc.h>
+#include <linux/pci-epf.h>
+#include <linux/pci_regs.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+/* HOST_REQ is set by the host driver, so poll it at a low idle rate. */
+#define PCI_EPF_DMA_HOST_REQ_POLL_MS	500
+
+struct pci_epf_dma_bar_map {
+	const struct pci_epc_aux_resource *res;
+	enum pci_barno bar;
+	u64 res_offset_in_bar;
+	u64 submap_offset_in_bar;
+	dma_addr_t phys_addr;
+	size_t map_size;
+	bool needs_submap;
+};
+
+struct pci_epf_dma {
+	struct pci_epf *epf;
+	struct config_group group;
+	struct delayed_work map_work;
+
+	enum pci_barno metadata_bar;
+	enum pci_barno dma_window_bar;
+	u16 wr_chans;
+	u16 rd_chans;
+	u8 reg_layout;
+	u8 reg_layout_data;
+
+	/* Backing storage for ctrl and descriptor resource pointers. */
+	struct pci_epc_aux_resource *resources;
+	unsigned int num_resources;
+	const struct pci_epc_aux_resource *ctrl;
+	const struct pci_epc_aux_resource *ep_to_rc_desc[EDMA_MAX_WR_CH];
+	const struct pci_epc_aux_resource *rc_to_ep_desc[EDMA_MAX_RD_CH];
+
+	/* Local DMAengine reservations for channels delegated to the host. */
+	struct dma_chan *ep_to_rc_chan[EDMA_MAX_WR_CH];
+	struct dma_chan *rc_to_ep_chan[EDMA_MAX_RD_CH];
+
+	void *metadata_addr;
+	void *dma_window_addr;
+	size_t msix_table_offset;
+	struct pci_epf_dma_bar_map *bar_maps;
+	unsigned int num_bar_maps;
+	struct pci_epf_bar_submap *submaps;
+	unsigned int num_submaps;
+
+	/* Cleared when a later event should retry programming the submaps. */
+	bool submaps_programmed;
+};
+
+#define to_epf_dma(epf_group) container_of((epf_group), struct pci_epf_dma, group)
+
+static struct pci_epf_header pci_epf_dma_header = {
+	.vendorid	= PCI_ANY_ID,
+	.deviceid	= PCI_ANY_ID,
+	.baseclass_code	= PCI_BASE_CLASS_SYSTEM,
+	.subclass_code	= PCI_CLASS_SYSTEM_DMA & 0xff,
+	.interrupt_pin	= PCI_INTERRUPT_INTA,
+};
+
+static void pci_epf_dma_release_channels(struct pci_epf_dma *epf_dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(epf_dma->ep_to_rc_chan); i++) {
+		if (!epf_dma->ep_to_rc_chan[i])
+			continue;
+
+		dma_release_channel(epf_dma->ep_to_rc_chan[i]);
+		epf_dma->ep_to_rc_chan[i] = NULL;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(epf_dma->rc_to_ep_chan); i++) {
+		if (!epf_dma->rc_to_ep_chan[i])
+			continue;
+
+		dma_release_channel(epf_dma->rc_to_ep_chan[i]);
+		epf_dma->rc_to_ep_chan[i] = NULL;
+	}
+}
+
+static int pci_epf_dma_claim_channel(struct pci_epf_dma *epf_dma,
+				     const struct pci_epc_aux_resource *res,
+				     struct dma_chan **chan)
+{
+	struct device *dev = &epf_dma->epf->dev;
+	struct dma_chan *dma_chan;
+
+	if (!res->u.dma_desc.dma_chan) {
+		dev_err(dev, "DMA channel %u cannot be reserved\n",
+			res->u.dma_desc.hw_ch);
+		return -EOPNOTSUPP;
+	}
+
+	dma_chan = dma_get_slave_channel(res->u.dma_desc.dma_chan);
+	if (!dma_chan) {
+		dev_err(dev, "DMA channel %u is already in use\n",
+			res->u.dma_desc.hw_ch);
+		return -EBUSY;
+	}
+
+	*chan = dma_chan;
+
+	return 0;
+}
+
+static int
+pci_epf_dma_validate_dw_edma_ctrl(struct pci_epf_dma *epf_dma,
+				  const struct pci_epc_aux_resource *ctrl)
+{
+	struct device *dev = &epf_dma->epf->dev;
+	enum dw_edma_map_format map = ctrl->u.dma_ctrl.reg_layout_data;
+	u16 total_wr_chans = ctrl->u.dma_ctrl.ep_to_rc_ch_cnt;
+	u16 total_rd_chans = ctrl->u.dma_ctrl.rc_to_ep_ch_cnt;
+
+	switch (map) {
+	case EDMA_MF_EDMA_LEGACY:
+		dev_err(dev, "legacy DesignWare eDMA layout cannot be delegated\n");
+		return -EOPNOTSUPP;
+	case EDMA_MF_EDMA_UNROLL:
+	case EDMA_MF_HDMA_COMPAT:
+		if ((epf_dma->wr_chans && epf_dma->wr_chans != total_wr_chans) ||
+		    (epf_dma->rd_chans && epf_dma->rd_chans != total_rd_chans)) {
+			dev_err(dev, "DesignWare eDMA v0 delegation must cover the whole direction\n");
+			return -EOPNOTSUPP;
+		}
+		return 0;
+	case EDMA_MF_HDMA_NATIVE:
+		dev_err(dev, "DesignWare HDMA native layout cannot be delegated\n");
+		return -EOPNOTSUPP;
+	default:
+		return -EINVAL;
+	}
+}
+
+static bool pci_epf_dma_bar_usable(const struct pci_epc_features *epc_features,
+				   enum pci_barno bar)
+{
+	if (bar < BAR_0 || bar >= PCI_STD_NUM_BARS)
+		return false;
+
+	return epc_features->bar[bar].type != BAR_RESERVED &&
+	       epc_features->bar[bar].type != BAR_DISABLED;
+}
+
+static bool pci_epf_dma_bar_has_fixed_resource(struct pci_epf_dma *epf_dma,
+					       enum pci_barno bar)
+{
+	unsigned int i;
+
+	for (i = 0; i < epf_dma->num_resources; i++) {
+		if (epf_dma->resources[i].bar == bar)
+			return true;
+	}
+
+	return false;
+}
+
+static enum pci_barno
+pci_epf_dma_first_usable_bar(struct pci_epf_dma *epf_dma,
+			     const struct pci_epc_features *epc_features,
+			     enum pci_barno exclude)
+{
+	enum pci_barno bar;
+
+	for (bar = BAR_0; bar < PCI_STD_NUM_BARS; bar++) {
+		bar = pci_epc_get_next_free_bar(epc_features, bar);
+		if (bar == NO_BAR)
+			return NO_BAR;
+		if (bar != exclude &&
+		    !pci_epf_dma_bar_has_fixed_resource(epf_dma, bar))
+			return bar;
+	}
+
+	return NO_BAR;
+}
+
+static size_t pci_epf_dma_align_size(size_t size, size_t align)
+{
+	if (!align)
+		return size;
+
+	return ALIGN(size, align);
+}
+
+static int pci_epf_dma_reuse_submap(struct pci_epf_dma *epf_dma,
+				    unsigned int map_count,
+				    dma_addr_t phys_addr, size_t map_size,
+				    size_t offset, size_t *next_offset_in_bar,
+				    u64 *res_offset_in_bar)
+{
+	struct pci_epf_dma_bar_map *map;
+	u64 delta;
+	size_t merged_size, next;
+	u64 res_map_end, submap_bar_end, submap_phys_end;
+	unsigned int i;
+
+	if (check_add_overflow(phys_addr, map_size, &res_map_end))
+		return -EOVERFLOW;
+
+	for (i = 0; i < map_count; i++) {
+		map = &epf_dma->bar_maps[i];
+		if (!map->needs_submap || map->bar != epf_dma->dma_window_bar)
+			continue;
+
+		if (check_add_overflow(map->phys_addr, map->map_size,
+				       &submap_phys_end) ||
+		    check_add_overflow(map->submap_offset_in_bar,
+				       map->map_size, &submap_bar_end))
+			return -EOVERFLOW;
+
+		/*
+		 * Reuse a submap that already covers this aligned resource
+		 * window.
+		 */
+		if (phys_addr >= map->phys_addr &&
+		    res_map_end <= submap_phys_end) {
+			if (check_add_overflow(phys_addr - map->phys_addr,
+					       offset, &delta) ||
+			    check_add_overflow(map->submap_offset_in_bar,
+					       delta, res_offset_in_bar))
+				return -EOVERFLOW;
+			return 1;
+		}
+
+		/*
+		 * Extend only the BAR-tail submap when the physical ranges are
+		 * contiguous.
+		 */
+		if (submap_phys_end == phys_addr &&
+		    submap_bar_end == *next_offset_in_bar) {
+			if (check_add_overflow(map->map_size, map_size,
+					       &merged_size) ||
+			    check_add_overflow(*next_offset_in_bar, map_size,
+					       &next) ||
+			    check_add_overflow(*next_offset_in_bar, offset,
+					       res_offset_in_bar))
+				return -EOVERFLOW;
+
+			map->map_size = merged_size;
+			*next_offset_in_bar = next;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int pci_epf_dma_add_map(struct pci_epf_dma *epf_dma,
+			       const struct pci_epc_aux_resource *res,
+			       size_t align, size_t *next_offset_in_bar,
+			       unsigned int *map_idx)
+{
+	dma_addr_t phys_addr;
+	size_t map_size, offset = 0, next;
+	u64 res_offset_in_bar;
+	int ret;
+
+	if (!res || !res->size)
+		return -EINVAL;
+
+	if (res->bar != NO_BAR) {
+		if (res->bar < BAR_0 || res->bar >= PCI_STD_NUM_BARS)
+			return -EINVAL;
+		if (res->bar == epf_dma->metadata_bar ||
+		    res->bar == epf_dma->dma_window_bar)
+			return -EINVAL;
+
+		epf_dma->bar_maps[*map_idx] = (struct pci_epf_dma_bar_map) {
+			.res = res,
+			.bar = res->bar,
+			.res_offset_in_bar = res->bar_offset,
+			.map_size = res->size,
+		};
+		(*map_idx)++;
+
+		return 0;
+	}
+
+	if (epf_dma->dma_window_bar == NO_BAR)
+		return -EOPNOTSUPP;
+
+	phys_addr = res->phys_addr;
+	/* Map the aligned window that contains this resource. */
+	if (align) {
+		phys_addr = ALIGN_DOWN(res->phys_addr, align);
+		offset = res->phys_addr - phys_addr;
+	}
+
+	if (check_add_overflow(res->size, offset, &map_size))
+		return -EOVERFLOW;
+	map_size = pci_epf_dma_align_size(map_size, align);
+
+	ret = pci_epf_dma_reuse_submap(epf_dma, *map_idx, phys_addr, map_size,
+				       offset, next_offset_in_bar,
+				       &res_offset_in_bar);
+	if (ret < 0)
+		return ret;
+	if (ret) {
+		epf_dma->bar_maps[*map_idx] = (struct pci_epf_dma_bar_map) {
+			.res = res,
+			.bar = epf_dma->dma_window_bar,
+			.res_offset_in_bar = res_offset_in_bar,
+			.phys_addr = res->phys_addr,
+			.map_size = res->size,
+		};
+
+		(*map_idx)++;
+
+		return 0;
+	}
+
+	if (check_add_overflow(*next_offset_in_bar, map_size, &next))
+		return -EOVERFLOW;
+	if (check_add_overflow(*next_offset_in_bar, offset, &res_offset_in_bar))
+		return -EOVERFLOW;
+
+	epf_dma->bar_maps[*map_idx] = (struct pci_epf_dma_bar_map) {
+		.res = res,
+		.bar = epf_dma->dma_window_bar,
+		.res_offset_in_bar = res_offset_in_bar,
+		.submap_offset_in_bar = *next_offset_in_bar,
+		.phys_addr = phys_addr,
+		.map_size = map_size,
+		.needs_submap = true,
+	};
+
+	*next_offset_in_bar = next;
+	(*map_idx)++;
+
+	return 0;
+}
+
+static const struct pci_epf_dma_bar_map *
+pci_epf_dma_find_map(struct pci_epf_dma *epf_dma,
+		     const struct pci_epc_aux_resource *res)
+{
+	unsigned int i;
+
+	for (i = 0; i < epf_dma->num_bar_maps; i++) {
+		if (epf_dma->bar_maps[i].res == res)
+			return &epf_dma->bar_maps[i];
+	}
+
+	return NULL;
+}
+
+static bool pci_epf_dma_needs_dma_window(struct pci_epf_dma *epf_dma)
+{
+	unsigned int i;
+
+	if (epf_dma->ctrl && epf_dma->ctrl->bar == NO_BAR)
+		return true;
+
+	for (i = 0; i < epf_dma->wr_chans; i++) {
+		if (epf_dma->ep_to_rc_desc[i] &&
+		    epf_dma->ep_to_rc_desc[i]->bar == NO_BAR)
+			return true;
+	}
+
+	for (i = 0; i < epf_dma->rd_chans; i++) {
+		if (epf_dma->rc_to_ep_desc[i] &&
+		    epf_dma->rc_to_ep_desc[i]->bar == NO_BAR)
+			return true;
+	}
+
+	return false;
+}
+
+static int pci_epf_dma_collect_resources(struct pci_epf_dma *epf_dma)
+{
+	const struct pci_epc_aux_resource *ep_to_rc_desc[EDMA_MAX_WR_CH] = {};
+	const struct pci_epc_aux_resource *rc_to_ep_desc[EDMA_MAX_RD_CH] = {};
+	const struct pci_epc_aux_resource *ctrl = NULL;
+	struct pci_epf *epf = epf_dma->epf;
+	struct pci_epc *epc = epf->epc;
+	struct device *dev = &epf->dev;
+	int count, i, ret;
+
+	count = pci_epc_get_aux_resources_count(epc, epf->func_no,
+						epf->vfunc_no);
+	if (count <= 0)
+		return count ?: -ENODEV;
+
+	struct pci_epc_aux_resource *res __free(kfree) =
+						kzalloc_objs(*res, count);
+	if (!res)
+		return -ENOMEM;
+
+	ret = pci_epc_get_aux_resources(epc, epf->func_no, epf->vfunc_no,
+					res, count);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < count; i++) {
+		switch (res[i].type) {
+		case PCI_EPC_AUX_DMA_CTRL_MMIO:
+			if (ctrl)
+				return -EINVAL;
+			ctrl = &res[i];
+			break;
+		case PCI_EPC_AUX_DMA_DESC_MEM: {
+			u16 hw_ch = res[i].u.dma_desc.hw_ch;
+
+			switch (res[i].u.dma_desc.dir) {
+			case PCI_EPC_AUX_DMA_EP_TO_RC:
+				if (hw_ch >= EDMA_MAX_WR_CH ||
+				    ep_to_rc_desc[hw_ch])
+					return -EINVAL;
+				ep_to_rc_desc[hw_ch] = &res[i];
+				break;
+			case PCI_EPC_AUX_DMA_RC_TO_EP:
+				if (hw_ch >= EDMA_MAX_RD_CH ||
+				    rc_to_ep_desc[hw_ch])
+					return -EINVAL;
+				rc_to_ep_desc[hw_ch] = &res[i];
+				break;
+			default:
+				return -EINVAL;
+			}
+			break;
+		}
+		default:
+			continue;
+		}
+	}
+
+	if (!ctrl)
+		return -ENODEV;
+
+	if (!epf_dma->wr_chans && !epf_dma->rd_chans)
+		return -EINVAL;
+
+	if (epf_dma->wr_chans > ctrl->u.dma_ctrl.ep_to_rc_ch_cnt ||
+	    epf_dma->rd_chans > ctrl->u.dma_ctrl.rc_to_ep_ch_cnt)
+		return -EINVAL;
+
+	switch (ctrl->u.dma_ctrl.reg_layout) {
+	case PCI_EPC_AUX_DMA_REG_LAYOUT_DW_EDMA:
+		ret = pci_epf_dma_validate_dw_edma_ctrl(epf_dma, ctrl);
+		if (ret)
+			return ret;
+		epf_dma->reg_layout = PCI_EP_DMA_METADATA_REG_LAYOUT_DW_EDMA;
+		epf_dma->reg_layout_data = ctrl->u.dma_ctrl.reg_layout_data;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	for (i = 0; i < epf_dma->wr_chans; i++) {
+		if (!ep_to_rc_desc[i]) {
+			dev_err(dev, "missing dense write DMA channel %d\n", i);
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < epf_dma->rd_chans; i++) {
+		if (!rc_to_ep_desc[i]) {
+			dev_err(dev, "missing dense read DMA channel %d\n", i);
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < epf_dma->wr_chans; i++) {
+		ret = pci_epf_dma_claim_channel(epf_dma, ep_to_rc_desc[i],
+						&epf_dma->ep_to_rc_chan[i]);
+		if (ret)
+			goto err_release_channels;
+	}
+
+	for (i = 0; i < epf_dma->rd_chans; i++) {
+		ret = pci_epf_dma_claim_channel(epf_dma, rc_to_ep_desc[i],
+						&epf_dma->rc_to_ep_chan[i]);
+		if (ret)
+			goto err_release_channels;
+	}
+
+	epf_dma->resources = no_free_ptr(res);
+	epf_dma->num_resources = count;
+	epf_dma->ctrl = ctrl;
+	memcpy(epf_dma->ep_to_rc_desc, ep_to_rc_desc, sizeof(ep_to_rc_desc));
+	memcpy(epf_dma->rc_to_ep_desc, rc_to_ep_desc, sizeof(rc_to_ep_desc));
+
+	return 0;
+
+err_release_channels:
+	pci_epf_dma_release_channels(epf_dma);
+
+	return ret;
+}
+
+static void pci_epf_dma_metadata_write(__le32 *metadata, u16 metadata_off,
+				       u32 val)
+{
+	metadata[metadata_off / sizeof(*metadata)] = cpu_to_le32(val);
+}
+
+static void pci_epf_dma_metadata_write64(__le32 *metadata, u16 metadata_off,
+					 u64 val)
+{
+	pci_epf_dma_metadata_write(metadata, metadata_off, lower_32_bits(val));
+	pci_epf_dma_metadata_write(metadata, metadata_off + sizeof(u32),
+				   upper_32_bits(val));
+}
+
+static int pci_epf_dma_build_ch_entry(const struct pci_epf_dma_bar_map *map,
+				      __le32 *metadata, u16 entry)
+{
+	const struct pci_epc_aux_resource *res = map->res;
+	u32 ctrl;
+
+	if (res->size > U32_MAX)
+		return -EOVERFLOW;
+
+	ctrl = FIELD_PREP(PCI_EP_DMA_METADATA_CH_CTRL_HW_CH,
+			  res->u.dma_desc.hw_ch) |
+	       FIELD_PREP(PCI_EP_DMA_METADATA_CH_CTRL_DESC_BAR, map->bar);
+
+	pci_epf_dma_metadata_write(metadata, entry + PCI_EP_DMA_METADATA_CH_CTRL,
+				   ctrl);
+	pci_epf_dma_metadata_write64(metadata,
+				     entry + PCI_EP_DMA_METADATA_CH_DESC_OFF_LO,
+				     map->res_offset_in_bar);
+	pci_epf_dma_metadata_write(metadata,
+				   entry + PCI_EP_DMA_METADATA_CH_DESC_SIZE,
+				   (u32)res->size);
+	pci_epf_dma_metadata_write64(metadata,
+				     entry + PCI_EP_DMA_METADATA_CH_DESC_ADDR_LO,
+				     res->phys_addr);
+
+	return 0;
+}
+
+static void pci_epf_dma_set_metadata_ready(struct pci_epf_dma *epf_dma,
+					   bool ready)
+{
+	__le32 *metadata = epf_dma->metadata_addr;
+	__le32 *ctrl_ptr;
+	u32 ctrl;
+
+	if (!metadata)
+		return;
+
+	ctrl_ptr = &metadata[PCI_EP_DMA_METADATA_CTRL / sizeof(*metadata)];
+	ctrl = le32_to_cpu(READ_ONCE(*ctrl_ptr));
+	if (ready) {
+		dma_wmb();
+		ctrl |= PCI_EP_DMA_METADATA_CTRL_READY;
+	} else {
+		ctrl &= ~PCI_EP_DMA_METADATA_CTRL_READY;
+	}
+	WRITE_ONCE(*ctrl_ptr, cpu_to_le32(ctrl));
+}
+
+static bool pci_epf_dma_metadata_host_requested(struct pci_epf_dma *epf_dma)
+{
+	__le32 *metadata = epf_dma->metadata_addr;
+	u32 ctrl;
+
+	if (!metadata)
+		return false;
+
+	ctrl = le32_to_cpu(READ_ONCE(metadata[PCI_EP_DMA_METADATA_CTRL /
+					    sizeof(*metadata)]));
+
+	return ctrl & PCI_EP_DMA_METADATA_CTRL_HOST_REQ;
+}
+
+static void pci_epf_dma_clear_metadata_status(struct pci_epf_dma *epf_dma)
+{
+	__le32 *metadata = epf_dma->metadata_addr;
+	__le32 *ctrl_ptr;
+	u32 ctrl;
+
+	if (!metadata)
+		return;
+
+	ctrl_ptr = &metadata[PCI_EP_DMA_METADATA_CTRL / sizeof(*metadata)];
+	ctrl = le32_to_cpu(READ_ONCE(*ctrl_ptr));
+	ctrl &= ~(PCI_EP_DMA_METADATA_CTRL_HOST_REQ |
+		  PCI_EP_DMA_METADATA_CTRL_READY);
+	WRITE_ONCE(*ctrl_ptr, cpu_to_le32(ctrl));
+}
+
+static int pci_epf_dma_build_metadata(struct pci_epf_dma *epf_dma)
+{
+	const struct pci_epf_dma_bar_map *ctrl_map;
+	u16 entry_size = PCI_EP_DMA_METADATA_CH_ENTRY_SIZE;
+	u16 wr_table, rd_table, total_len;
+	__le32 *metadata = epf_dma->metadata_addr;
+	unsigned int i;
+	int ret;
+
+	if (!metadata)
+		return -EINVAL;
+
+	ctrl_map = pci_epf_dma_find_map(epf_dma, epf_dma->ctrl);
+	if (!ctrl_map)
+		return -EINVAL;
+	if (epf_dma->wr_chans > FIELD_MAX(PCI_EP_DMA_METADATA_CTRL_WR_CH_COUNT) ||
+	    epf_dma->rd_chans > FIELD_MAX(PCI_EP_DMA_METADATA_CTRL_RD_CH_COUNT) ||
+	    entry_size > FIELD_MAX(PCI_EP_DMA_METADATA_CTRL_CH_ENTRY_SIZE) ||
+	    ctrl_map->res->size > U32_MAX)
+		return -EOVERFLOW;
+
+	wr_table = epf_dma->wr_chans ? PCI_EP_DMA_METADATA_HDR_LEN : 0;
+	rd_table = epf_dma->rd_chans ?
+		   PCI_EP_DMA_METADATA_HDR_LEN + epf_dma->wr_chans * entry_size : 0;
+	total_len = PCI_EP_DMA_METADATA_HDR_LEN +
+		    (epf_dma->wr_chans + epf_dma->rd_chans) * entry_size;
+
+	memset(metadata, 0, total_len);
+
+	pci_epf_dma_metadata_write(metadata, 0, PCI_EP_DMA_METADATA_MAGIC);
+	pci_epf_dma_metadata_write(metadata, PCI_EP_DMA_METADATA_HDR,
+				   FIELD_PREP(PCI_EP_DMA_METADATA_HDR_REV,
+					      PCI_EP_DMA_METADATA_REV) |
+				   FIELD_PREP(PCI_EP_DMA_METADATA_HDR_LEN_FIELD,
+					      total_len));
+	pci_epf_dma_metadata_write(metadata, PCI_EP_DMA_METADATA_CTRL,
+				   FIELD_PREP(PCI_EP_DMA_METADATA_CTRL_REG_BAR,
+					      ctrl_map->bar) |
+				   FIELD_PREP(PCI_EP_DMA_METADATA_CTRL_WR_CH_COUNT,
+					      epf_dma->wr_chans) |
+				   FIELD_PREP(PCI_EP_DMA_METADATA_CTRL_RD_CH_COUNT,
+					      epf_dma->rd_chans) |
+				   FIELD_PREP(PCI_EP_DMA_METADATA_CTRL_CH_ENTRY_SIZE,
+					      entry_size));
+	pci_epf_dma_metadata_write64(metadata,
+				     PCI_EP_DMA_METADATA_REG_OFF_LO,
+				     ctrl_map->res_offset_in_bar);
+	pci_epf_dma_metadata_write(metadata, PCI_EP_DMA_METADATA_REG_LAYOUT,
+				   FIELD_PREP(PCI_EP_DMA_METADATA_REG_LAYOUT_ID,
+					      epf_dma->reg_layout) |
+				   FIELD_PREP(PCI_EP_DMA_METADATA_REG_LAYOUT_DATA,
+					      epf_dma->reg_layout_data));
+	pci_epf_dma_metadata_write(metadata, PCI_EP_DMA_METADATA_REG_SIZE,
+				   (u32)ctrl_map->res->size);
+
+	for (i = 0; i < epf_dma->wr_chans; i++) {
+		const struct pci_epf_dma_bar_map *map;
+
+		map = pci_epf_dma_find_map(epf_dma,
+					   epf_dma->ep_to_rc_desc[i]);
+		if (!map)
+			return -EINVAL;
+		ret = pci_epf_dma_build_ch_entry(map, metadata,
+						 wr_table + i * entry_size);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < epf_dma->rd_chans; i++) {
+		const struct pci_epf_dma_bar_map *map;
+
+		map = pci_epf_dma_find_map(epf_dma,
+					   epf_dma->rc_to_ep_desc[i]);
+		if (!map)
+			return -EINVAL;
+		ret = pci_epf_dma_build_ch_entry(map, metadata,
+						 rd_table + i * entry_size);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int pci_epf_dma_reserve_msix(struct pci_epf_dma *epf_dma,
+				    const struct pci_epc_features *epc_features,
+				    size_t *backing_size)
+{
+	struct pci_epf *epf = epf_dma->epf;
+	size_t msix_table_size, pba_size, next;
+	unsigned int nvec = epf->msix_interrupts;
+
+	epf_dma->msix_table_offset = 0;
+
+	if (!epc_features->msix_capable || !nvec)
+		return 0;
+
+	next = ALIGN(*backing_size, 8);
+	if (next > U32_MAX)
+		return -EOVERFLOW;
+	epf_dma->msix_table_offset = next;
+
+	if (check_mul_overflow(PCI_MSIX_ENTRY_SIZE, nvec, &msix_table_size))
+		return -EOVERFLOW;
+
+	pba_size = ALIGN(DIV_ROUND_UP(nvec, 8), 8);
+	if (check_add_overflow(next, msix_table_size, &next) ||
+	    next > U32_MAX ||
+	    check_add_overflow(next, pba_size, &next))
+		return -EOVERFLOW;
+
+	*backing_size = next;
+
+	return 0;
+}
+
+static int pci_epf_dma_build_layout(struct pci_epf_dma *epf_dma,
+				    const struct pci_epc_features *epc_features)
+{
+	struct pci_epf *epf = epf_dma->epf;
+	struct device *dev = &epf->dev;
+	struct pci_epf_bar *bar;
+	unsigned int max_maps, map_idx = 0, sub_idx = 0;
+	size_t align = epc_features->align;
+	size_t metadata_size, metadata_backing_size, metadata_bar_size;
+	size_t mapped_size = 0, dma_window_bar_size;
+	int i, ret;
+
+	metadata_size = PCI_EP_DMA_METADATA_HDR_LEN;
+	metadata_size += (epf_dma->wr_chans + epf_dma->rd_chans) *
+			 PCI_EP_DMA_METADATA_CH_ENTRY_SIZE;
+	metadata_backing_size = metadata_size;
+	ret = pci_epf_dma_reserve_msix(epf_dma, epc_features,
+				       &metadata_backing_size);
+	if (ret)
+		return ret;
+	metadata_bar_size = pci_epf_dma_align_size(metadata_backing_size,
+						   align);
+
+	epf_dma->metadata_addr = pci_epf_alloc_space(epf, metadata_bar_size,
+						     epf_dma->metadata_bar,
+						     epc_features,
+						     PRIMARY_INTERFACE);
+	if (!epf_dma->metadata_addr) {
+		dev_err(dev, "failed to allocate BAR%d metadata space\n",
+			epf_dma->metadata_bar);
+		return -ENOMEM;
+	}
+	memset(epf_dma->metadata_addr, 0, epf->bar[epf_dma->metadata_bar].size);
+
+	/* One map for DMA controller registers, plus one per channel. */
+	max_maps = 1 + epf_dma->wr_chans + epf_dma->rd_chans;
+	epf_dma->bar_maps = kzalloc_objs(*epf_dma->bar_maps, max_maps);
+	if (!epf_dma->bar_maps)
+		return -ENOMEM;
+
+	ret = pci_epf_dma_add_map(epf_dma, epf_dma->ctrl, align,
+				  &mapped_size, &map_idx);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < epf_dma->wr_chans; i++) {
+		ret = pci_epf_dma_add_map(epf_dma,
+					  epf_dma->ep_to_rc_desc[i], align,
+					  &mapped_size, &map_idx);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < epf_dma->rd_chans; i++) {
+		ret = pci_epf_dma_add_map(epf_dma,
+					  epf_dma->rc_to_ep_desc[i], align,
+					  &mapped_size, &map_idx);
+		if (ret)
+			return ret;
+	}
+
+	epf_dma->num_bar_maps = map_idx;
+
+	ret = pci_epf_dma_build_metadata(epf_dma);
+	if (ret)
+		return ret;
+
+	/* Some DMA resources may already be visible through another map. */
+	for (i = 0; i < epf_dma->num_bar_maps; i++) {
+		if (epf_dma->bar_maps[i].needs_submap)
+			epf_dma->num_submaps++;
+	}
+	if (!epf_dma->num_submaps)
+		return 0;
+
+	dma_window_bar_size = mapped_size;
+	epf_dma->dma_window_addr =
+		pci_epf_alloc_space(epf, dma_window_bar_size,
+				    epf_dma->dma_window_bar, epc_features,
+				    PRIMARY_INTERFACE);
+	if (!epf_dma->dma_window_addr) {
+		dev_err(dev, "failed to allocate BAR%d DMA window space\n",
+			epf_dma->dma_window_bar);
+		return -ENOMEM;
+	}
+	bar = &epf->bar[epf_dma->dma_window_bar];
+	memset(epf_dma->dma_window_addr, 0, bar->size);
+
+	if (bar->size > mapped_size)
+		epf_dma->num_submaps++;
+
+	epf_dma->submaps = kzalloc_objs(*epf_dma->submaps, epf_dma->num_submaps);
+	if (!epf_dma->submaps)
+		return -ENOMEM;
+
+	for (i = 0; i < epf_dma->num_bar_maps; i++) {
+		if (!epf_dma->bar_maps[i].needs_submap)
+			continue;
+
+		epf_dma->submaps[sub_idx++] = (struct pci_epf_bar_submap) {
+			.phys_addr = epf_dma->bar_maps[i].phys_addr,
+			.size = epf_dma->bar_maps[i].map_size,
+		};
+	}
+
+	/* Cover any BAR tail padding with the allocated scratch space. */
+	if (bar->size > mapped_size) {
+		epf_dma->submaps[sub_idx++] = (struct pci_epf_bar_submap) {
+			.phys_addr = bar->phys_addr + mapped_size,
+			.size = bar->size - mapped_size,
+		};
+	}
+
+	return 0;
+}
+
+static void pci_epf_dma_free_layout(struct pci_epf_dma *epf_dma)
+{
+	struct pci_epf *epf = epf_dma->epf;
+	struct pci_epf_bar *bar;
+
+	if (epf_dma->dma_window_addr) {
+		bar = &epf->bar[epf_dma->dma_window_bar];
+		bar->submap = NULL;
+		bar->num_submap = 0;
+	}
+	epf_dma->submaps_programmed = false;
+
+	kfree(epf_dma->submaps);
+	epf_dma->submaps = NULL;
+	epf_dma->num_submaps = 0;
+
+	kfree(epf_dma->bar_maps);
+	epf_dma->bar_maps = NULL;
+	epf_dma->num_bar_maps = 0;
+
+	pci_epf_dma_release_channels(epf_dma);
+
+	kfree(epf_dma->resources);
+	epf_dma->resources = NULL;
+	epf_dma->num_resources = 0;
+	epf_dma->ctrl = NULL;
+	memset(epf_dma->ep_to_rc_desc, 0, sizeof(epf_dma->ep_to_rc_desc));
+	memset(epf_dma->rc_to_ep_desc, 0, sizeof(epf_dma->rc_to_ep_desc));
+
+	if (epf_dma->dma_window_addr) {
+		pci_epf_free_space(epf, epf_dma->dma_window_addr,
+				   epf_dma->dma_window_bar,
+				   PRIMARY_INTERFACE);
+		epf_dma->dma_window_addr = NULL;
+	}
+
+	if (epf_dma->metadata_addr) {
+		pci_epf_free_space(epf, epf_dma->metadata_addr,
+				   epf_dma->metadata_bar,
+				   PRIMARY_INTERFACE);
+		epf_dma->metadata_addr = NULL;
+	}
+	epf_dma->msix_table_offset = 0;
+}
+
+static int pci_epf_dma_program_submaps(struct pci_epf_dma *epf_dma)
+{
+	struct pci_epf *epf = epf_dma->epf;
+	struct pci_epf_bar *bar;
+	int ret;
+
+	if (!epf_dma->dma_window_addr) {
+		pci_epf_dma_set_metadata_ready(epf_dma, true);
+		return 0;
+	}
+
+	if (epf_dma->submaps_programmed)
+		return 0;
+
+	bar = &epf->bar[epf_dma->dma_window_bar];
+	bar->submap = epf_dma->submaps;
+	bar->num_submap = epf_dma->num_submaps;
+
+	ret = pci_epc_set_bar(epf->epc, epf->func_no, epf->vfunc_no, bar);
+	if (ret) {
+		bar->submap = NULL;
+		bar->num_submap = 0;
+		return ret;
+	}
+
+	epf_dma->submaps_programmed = true;
+	pci_epf_dma_set_metadata_ready(epf_dma, true);
+
+	return 0;
+}
+
+static void pci_epf_dma_map_work(struct work_struct *work)
+{
+	struct pci_epf_dma *epf_dma =
+		container_of(to_delayed_work(work), struct pci_epf_dma,
+			     map_work);
+	struct pci_epf *epf = epf_dma->epf;
+	int ret;
+
+	if (!epf->epc)
+		return;
+
+	if (!epf->epc->init_complete) {
+		schedule_delayed_work(&epf_dma->map_work,
+				      msecs_to_jiffies(PCI_EPF_DMA_HOST_REQ_POLL_MS));
+		return;
+	}
+
+	if (!pci_epf_dma_metadata_host_requested(epf_dma)) {
+		schedule_delayed_work(&epf_dma->map_work,
+				      msecs_to_jiffies(PCI_EPF_DMA_HOST_REQ_POLL_MS));
+		return;
+	}
+
+	ret = pci_epf_dma_program_submaps(epf_dma);
+	if (ret)
+		dev_err(&epf->dev, "failed to program DMA window BAR submaps: %d\n",
+			ret);
+}
+
+static int pci_epf_dma_epc_init(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+	const struct pci_epc_features *epc_features;
+	struct pci_epc *epc = epf->epc;
+	struct device *dev = &epf->dev;
+	int ret;
+
+	epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no);
+	if (!epc_features)
+		return -EOPNOTSUPP;
+
+	pci_epf_dma_clear_metadata_status(epf_dma);
+
+	ret = pci_epc_write_header(epc, epf->func_no, epf->vfunc_no,
+				   epf->header);
+	if (ret) {
+		dev_err(dev, "configuration header write failed\n");
+		return ret;
+	}
+
+	ret = pci_epc_set_bar(epc, epf->func_no, epf->vfunc_no,
+			      &epf->bar[epf_dma->metadata_bar]);
+	if (ret) {
+		dev_err(dev, "BAR%d setup failed: %d\n",
+			epf_dma->metadata_bar, ret);
+		return ret;
+	}
+
+	if (epf_dma->dma_window_addr) {
+		ret = pci_epc_set_bar(epc, epf->func_no, epf->vfunc_no,
+				      &epf->bar[epf_dma->dma_window_bar]);
+		if (ret) {
+			dev_err(dev, "BAR%d setup failed: %d\n",
+				epf_dma->dma_window_bar, ret);
+			goto err_clear_metadata_bar;
+		}
+	}
+
+	if (epc_features->msi_capable && epf->msi_interrupts) {
+		ret = pci_epc_set_msi(epc, epf->func_no, epf->vfunc_no,
+				      epf->msi_interrupts);
+		if (ret) {
+			dev_err(dev, "MSI setup failed: %d\n", ret);
+			goto err_clear_dma_window_bar;
+		}
+	}
+
+	if (epc_features->msix_capable && epf->msix_interrupts) {
+		ret = pci_epc_set_msix(epc, epf->func_no, epf->vfunc_no,
+				       epf->msix_interrupts,
+				       epf_dma->metadata_bar,
+				       epf_dma->msix_table_offset);
+		if (ret) {
+			dev_err(dev, "MSI-X setup failed: %d\n", ret);
+			goto err_clear_dma_window_bar;
+		}
+	}
+
+	schedule_delayed_work(&epf_dma->map_work, 0);
+
+	return 0;
+
+err_clear_dma_window_bar:
+	if (epf_dma->dma_window_addr)
+		pci_epc_clear_bar(epc, epf->func_no, epf->vfunc_no,
+				  &epf->bar[epf_dma->dma_window_bar]);
+err_clear_metadata_bar:
+	pci_epc_clear_bar(epc, epf->func_no, epf->vfunc_no,
+			  &epf->bar[epf_dma->metadata_bar]);
+	pci_epf_dma_clear_metadata_status(epf_dma);
+
+	return ret;
+}
+
+static void pci_epf_dma_epc_deinit(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+	struct pci_epf_bar *bar;
+
+	cancel_delayed_work_sync(&epf_dma->map_work);
+
+	if (!epf_dma->metadata_addr)
+		return;
+
+	pci_epf_dma_clear_metadata_status(epf_dma);
+	if (epf_dma->dma_window_addr) {
+		bar = &epf->bar[epf_dma->dma_window_bar];
+		pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no, bar);
+		bar->submap = NULL;
+		bar->num_submap = 0;
+	}
+	pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no,
+			  &epf->bar[epf_dma->metadata_bar]);
+	epf_dma->submaps_programmed = false;
+}
+
+static int pci_epf_dma_link_up(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+
+	schedule_delayed_work(&epf_dma->map_work, 0);
+
+	return 0;
+}
+
+static int pci_epf_dma_link_down(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+
+	cancel_delayed_work_sync(&epf_dma->map_work);
+	pci_epf_dma_clear_metadata_status(epf_dma);
+	/*
+	 * Link down can invalidate non-sticky inbound ATU state without going
+	 * through pci_epc_clear_bar(). Keep the BAR/submap description intact,
+	 * but force the next link-up path to reprogram the subrange mappings.
+	 */
+	epf_dma->submaps_programmed = false;
+
+	return 0;
+}
+
+static const struct pci_epc_event_ops pci_epf_dma_event_ops = {
+	.epc_init = pci_epf_dma_epc_init,
+	.epc_deinit = pci_epf_dma_epc_deinit,
+	.link_up = pci_epf_dma_link_up,
+	.link_down = pci_epf_dma_link_down,
+};
+
+static int pci_epf_dma_bind(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+	const struct pci_epc_features *epc_features;
+	struct pci_epc *epc = epf->epc;
+	bool needs_dma_window;
+	int ret;
+
+	if (WARN_ON_ONCE(!epc))
+		return -EINVAL;
+
+	epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no);
+	if (!epc_features)
+		return -EOPNOTSUPP;
+
+	if (!epc_features->msi_capable && !epc_features->msix_capable)
+		return -EOPNOTSUPP;
+
+	if ((!epc_features->msi_capable || !epf->msi_interrupts) &&
+	    (!epc_features->msix_capable || !epf->msix_interrupts))
+		return -EINVAL;
+
+	ret = pci_epf_dma_collect_resources(epf_dma);
+	if (ret)
+		return ret;
+
+	if (epf_dma->metadata_bar == NO_BAR)
+		epf_dma->metadata_bar =
+			pci_epf_dma_first_usable_bar(epf_dma, epc_features,
+						     NO_BAR);
+
+	if (epf_dma->metadata_bar == NO_BAR ||
+	    !pci_epf_dma_bar_usable(epc_features, epf_dma->metadata_bar) ||
+	    pci_epf_dma_bar_has_fixed_resource(epf_dma, epf_dma->metadata_bar)) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	needs_dma_window = pci_epf_dma_needs_dma_window(epf_dma);
+	if (needs_dma_window) {
+		if (!epc_features->subrange_mapping ||
+		    !epc_features->dynamic_inbound_mapping) {
+			ret = -EOPNOTSUPP;
+			goto err_free;
+		}
+
+		if (epf_dma->dma_window_bar == NO_BAR)
+			epf_dma->dma_window_bar =
+				pci_epf_dma_first_usable_bar(epf_dma, epc_features,
+							     epf_dma->metadata_bar);
+		if (epf_dma->dma_window_bar == NO_BAR) {
+			ret = -EOPNOTSUPP;
+			goto err_free;
+		}
+	}
+
+	if (epf_dma->dma_window_bar != NO_BAR) {
+		if (!pci_epf_dma_bar_usable(epc_features,
+					    epf_dma->dma_window_bar)) {
+			ret = -EINVAL;
+			goto err_free;
+		}
+		if (epf_dma->metadata_bar == epf_dma->dma_window_bar ||
+		    pci_epf_dma_bar_has_fixed_resource(epf_dma,
+						       epf_dma->dma_window_bar)) {
+			ret = -EINVAL;
+			goto err_free;
+		}
+	}
+
+	ret = pci_epf_dma_build_layout(epf_dma, epc_features);
+	if (ret)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	pci_epf_dma_free_layout(epf_dma);
+
+	return ret;
+}
+
+static void pci_epf_dma_unbind(struct pci_epf *epf)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+
+	cancel_delayed_work_sync(&epf_dma->map_work);
+	if (epf->epc && epf->epc->init_complete)
+		pci_epf_dma_epc_deinit(epf);
+	pci_epf_dma_free_layout(epf_dma);
+}
+
+#define PCI_EPF_DMA_SHOW(_name, _fmt, _val)				\
+static ssize_t pci_epf_dma_##_name##_show(struct config_item *item,	\
+					  char *page)			\
+{									\
+	struct config_group *group = to_config_group(item);		\
+	struct pci_epf_dma *epf_dma = to_epf_dma(group);		\
+									\
+	return sysfs_emit(page, _fmt "\n", (_val));			\
+}
+
+PCI_EPF_DMA_SHOW(metadata_bar, "%d", (int)epf_dma->metadata_bar)
+PCI_EPF_DMA_SHOW(dma_window_bar, "%d", (int)epf_dma->dma_window_bar)
+
+static ssize_t pci_epf_dma_metadata_bar_store(struct config_item *item, const char *page,
+					      size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct pci_epf_dma *epf_dma = to_epf_dma(group);
+	int bar, ret;
+
+	if (epf_dma->epf->epc)
+		return -EOPNOTSUPP;
+
+	ret = kstrtoint(page, 0, &bar);
+	if (ret)
+		return ret;
+
+	if (bar != NO_BAR && (bar < BAR_0 || bar >= PCI_STD_NUM_BARS))
+		return -EINVAL;
+	if (bar != NO_BAR && bar == epf_dma->dma_window_bar)
+		return -EINVAL;
+
+	epf_dma->metadata_bar = bar;
+
+	return len;
+}
+
+static ssize_t pci_epf_dma_dma_window_bar_store(struct config_item *item,
+						const char *page, size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct pci_epf_dma *epf_dma = to_epf_dma(group);
+	int bar, ret;
+
+	if (epf_dma->epf->epc)
+		return -EOPNOTSUPP;
+
+	ret = kstrtoint(page, 0, &bar);
+	if (ret)
+		return ret;
+
+	if (bar != NO_BAR && (bar < BAR_0 || bar >= PCI_STD_NUM_BARS))
+		return -EINVAL;
+	if (bar != NO_BAR && bar == epf_dma->metadata_bar)
+		return -EINVAL;
+
+	epf_dma->dma_window_bar = bar;
+
+	return len;
+}
+
+PCI_EPF_DMA_SHOW(wr_chans, "%u", (unsigned int)epf_dma->wr_chans)
+
+static ssize_t pci_epf_dma_wr_chans_store(struct config_item *item,
+					  const char *page, size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct pci_epf_dma *epf_dma = to_epf_dma(group);
+	u16 val;
+	int ret;
+
+	if (epf_dma->epf->epc)
+		return -EOPNOTSUPP;
+
+	ret = kstrtou16(page, 0, &val);
+	if (ret)
+		return ret;
+	if (val > EDMA_MAX_WR_CH)
+		return -EINVAL;
+
+	epf_dma->wr_chans = val;
+
+	return len;
+}
+
+PCI_EPF_DMA_SHOW(rd_chans, "%u", (unsigned int)epf_dma->rd_chans)
+
+static ssize_t pci_epf_dma_rd_chans_store(struct config_item *item,
+					  const char *page, size_t len)
+{
+	struct config_group *group = to_config_group(item);
+	struct pci_epf_dma *epf_dma = to_epf_dma(group);
+	u16 val;
+	int ret;
+
+	if (epf_dma->epf->epc)
+		return -EOPNOTSUPP;
+
+	ret = kstrtou16(page, 0, &val);
+	if (ret)
+		return ret;
+	if (val > EDMA_MAX_RD_CH)
+		return -EINVAL;
+
+	epf_dma->rd_chans = val;
+
+	return len;
+}
+
+CONFIGFS_ATTR(pci_epf_dma_, metadata_bar);
+CONFIGFS_ATTR(pci_epf_dma_, dma_window_bar);
+CONFIGFS_ATTR(pci_epf_dma_, wr_chans);
+CONFIGFS_ATTR(pci_epf_dma_, rd_chans);
+
+static struct configfs_attribute *pci_epf_dma_attrs[] = {
+	&pci_epf_dma_attr_metadata_bar,
+	&pci_epf_dma_attr_dma_window_bar,
+	&pci_epf_dma_attr_wr_chans,
+	&pci_epf_dma_attr_rd_chans,
+	NULL,
+};
+
+static const struct config_item_type pci_epf_dma_group_type = {
+	.ct_attrs	= pci_epf_dma_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct config_group *pci_epf_dma_add_cfs(struct pci_epf *epf,
+						struct config_group *group)
+{
+	struct pci_epf_dma *epf_dma = epf_get_drvdata(epf);
+	struct config_group *epf_group = &epf_dma->group;
+	struct device *dev = &epf->dev;
+
+	config_group_init_type_name(epf_group, dev_name(dev),
+				    &pci_epf_dma_group_type);
+
+	return epf_group;
+}
+
+static const struct pci_epf_device_id pci_epf_dma_ids[] = {
+	{
+		.name = "pci_epf_dma",
+	},
+	{},
+};
+
+static int pci_epf_dma_probe(struct pci_epf *epf,
+			     const struct pci_epf_device_id *id)
+{
+	struct pci_epf_dma *epf_dma;
+
+	epf_dma = devm_kzalloc(&epf->dev, sizeof(*epf_dma), GFP_KERNEL);
+	if (!epf_dma)
+		return -ENOMEM;
+
+	epf->header = &pci_epf_dma_header;
+	epf->event_ops = &pci_epf_dma_event_ops;
+
+	epf_dma->epf = epf;
+	epf_dma->metadata_bar = NO_BAR;
+	epf_dma->dma_window_bar = NO_BAR;
+	INIT_DELAYED_WORK(&epf_dma->map_work, pci_epf_dma_map_work);
+
+	epf_set_drvdata(epf, epf_dma);
+
+	return 0;
+}
+
+static const struct pci_epf_ops pci_epf_dma_ops = {
+	.unbind		= pci_epf_dma_unbind,
+	.bind		= pci_epf_dma_bind,
+	.add_cfs	= pci_epf_dma_add_cfs,
+};
+
+static struct pci_epf_driver pci_epf_dma_driver = {
+	.driver.name	= "pci_epf_dma",
+	.probe		= pci_epf_dma_probe,
+	.id_table	= pci_epf_dma_ids,
+	.ops		= &pci_epf_dma_ops,
+	.owner		= THIS_MODULE,
+};
+
+static int __init pci_epf_dma_init(void)
+{
+	return pci_epf_register_driver(&pci_epf_dma_driver);
+}
+module_init(pci_epf_dma_init);
+
+static void __exit pci_epf_dma_exit(void)
+{
+	pci_epf_unregister_driver(&pci_epf_dma_driver);
+}
+module_exit(pci_epf_dma_exit);
+
+MODULE_DESCRIPTION("PCI EPF DMA DRIVER");
+MODULE_AUTHOR("Koichiro Den <den@valinux.co.jp>");
+MODULE_LICENSE("GPL");
-- 
2.51.0


^ permalink raw reply related

* [PATCH 3/3] Documentation: PCI: Add PCI DMA endpoint function documentation
From: Koichiro Den @ 2026-05-21  6:36 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Krzysztof Wilczyński,
	Kishon Vijay Abraham I, Bjorn Helgaas, Jonathan Corbet,
	Shuah Khan, Vinod Koul, Frank Li, Arnd Bergmann, Damien Le Moal,
	Niklas Cassel
  Cc: Marek Vasut, Yoshihiro Shimoda, linux-pci, linux-doc,
	linux-kernel, dmaengine
In-Reply-To: <20260521063638.2843021-1-den@valinux.co.jp>

Add a function description and a user guide for pci-epf-dma. Describe
the BAR-resident metadata consumed by dw-edma-pcie, the configfs
attributes, endpoint controller requirements and the host-side DMAengine
usage model.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 Documentation/PCI/endpoint/index.rst          |   2 +
 .../PCI/endpoint/pci-dma-function.rst         | 182 ++++++++++++++++
 Documentation/PCI/endpoint/pci-dma-howto.rst  | 200 ++++++++++++++++++
 3 files changed, 384 insertions(+)
 create mode 100644 Documentation/PCI/endpoint/pci-dma-function.rst
 create mode 100644 Documentation/PCI/endpoint/pci-dma-howto.rst

diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
index dd1f62e731c9..cd4107e02ec2 100644
--- a/Documentation/PCI/endpoint/index.rst
+++ b/Documentation/PCI/endpoint/index.rst
@@ -15,6 +15,8 @@ PCI Endpoint Framework
    pci-ntb-howto
    pci-vntb-function
    pci-vntb-howto
+   pci-dma-function
+   pci-dma-howto
    pci-nvme-function
 
    function/binding/pci-test
diff --git a/Documentation/PCI/endpoint/pci-dma-function.rst b/Documentation/PCI/endpoint/pci-dma-function.rst
new file mode 100644
index 000000000000..54caf4fafe00
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-dma-function.rst
@@ -0,0 +1,182 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+PCI DMA Function
+================
+
+:Author: Koichiro Den <den@valinux.co.jp>
+
+The PCI DMA endpoint function exposes an endpoint-integrated DMA controller
+to the PCI host as a PCI DMA controller.  A matching host-side driver
+discovers the endpoint DMA metadata and registers the delegated channels with
+the Linux DMAengine framework, so host DMAengine clients can submit
+transfers.
+
+An endpoint Linux system can already use an endpoint-integrated DMA
+controller locally through the normal DMAengine API, for example to transfer
+data between endpoint memory and host addresses reachable over PCI.  The PCI
+DMA function provides a different ownership model: it delegates selected
+local DMA channels to the host, so a host DMAengine client can request and
+program those endpoint-side channels through the host's DMAengine API.
+
+To make that possible, the endpoint function publishes the DMA controller
+register window and descriptor memory layout to the host, reserves the
+selected local DMA channels on the endpoint side, and lets the host program
+those channels directly.
+
+Constructs Used for Implementing DMA
+====================================
+
+The PCI DMA function uses the following endpoint-side resources and
+configuration:
+
+	1) DMA controller register window
+	2) DMA descriptor memory for endpoint-to-RC channels
+	3) DMA descriptor memory for RC-to-endpoint channels
+	4) MSI or MSI-X interrupt vectors selected through configfs
+	5) One endpoint BAR used to publish metadata
+	6) If needed, one endpoint BAR used for dynamically mapped DMA windows
+
+The endpoint controller reports the DMA controller register and descriptor
+resources through the endpoint auxiliary resource interface.  The PCI DMA
+function uses those descriptions to build the host-visible metadata and to map
+resources that are not already visible to the host.
+
+DMA Controller Register Window:
+-------------------------------
+
+It contains the DMA controller registers programmed by the host-side driver
+to submit transfers, control channels and handle DMA interrupts.
+
+DMA Descriptor Memory:
+----------------------
+
+It contains the descriptor memory used by the DMA controller.  The PCI DMA
+function exposes descriptor memory for the delegated endpoint-to-RC and
+RC-to-endpoint channels.
+
+MSI/MSI-X Interrupt Vectors:
+----------------------------
+
+They are used by the delegated DMA channels to signal completion and error
+conditions to the host-side driver.
+
+Metadata BAR:
+-------------
+
+It is the endpoint BAR used to publish the endpoint DMA metadata and handshake
+bits.  The BAR remains stable while the endpoint function programs the DMA
+windows.
+
+DMA Window BAR:
+---------------
+
+It is the endpoint BAR used for DMA resources that are not already visible
+through a fixed BAR.  The endpoint function may switch this BAR to subrange
+mapping after the host-side driver has found the metadata BAR.
+
+BAR Metadata
+============
+
+The endpoint function places a small metadata block at the beginning of the
+selected metadata BAR.  The format is defined in
+``include/linux/pci-ep-dma.h``.
+
+The host-side driver scans the function's assigned memory BARs, looks for the
+endpoint DMA metadata magic, requests DMA window programming, waits for the
+READY bit, and then parses the metadata to find the DMA register window and
+descriptor windows.
+
+::
+
+	+----------------------+ metadata BAR offset 0
+	| endpoint DMA metadata|
+	+----------------------+
+	| optional padding     |
+	+----------------------+
+
+	+----------------------+ DMA window BAR offset 0
+	| mapped DMA resources |
+	+----------------------+
+	| optional padding     |
+	+----------------------+
+
+The metadata can also reference resources that are already host-visible
+through fixed BARs.  For example, an endpoint controller may expose the DMA
+controller register window at a fixed BAR offset while descriptor memories
+are mapped into the DMA window BAR by the endpoint function.
+
+The metadata is BAR-resident instead of a self-contained PCI Vendor-Specific
+Extended Capability (VSEC).  Some endpoint controllers do not provide writable
+configuration-space backing storage large enough for a new VSEC payload, while
+they can map endpoint memory and controller resources into a BAR.
+
+Channel Ownership
+=================
+
+The ``wr_chans`` attribute exposes endpoint-to-RC DMA write channels.  The
+``rd_chans`` attribute exposes RC-to-endpoint DMA read channels.  The function
+reserves the selected endpoint-side DMAengine channels so that endpoint-side
+DMAengine clients cannot allocate and use the same hardware channels while
+they are delegated to the host.
+
+The current metadata revision describes channels in dense, zero-based order.
+For example, ``wr_chans = 2`` exposes write channels 0 and 1.  Skipping a
+hardware channel in the middle of the exposed range is not supported.
+
+The current DesignWare eDMA unroll and HDMA compatible support also requires
+each exposed direction to be delegated as a whole.  For example, on a controller
+with two write channels, ``wr_chans`` must be either 0 or 2.
+
+Interrupts
+==========
+
+The PCI DMA function exposes DMA interrupts through MSI or MSI-X.  The common
+endpoint function ``msi_interrupts`` and ``msix_interrupts`` configfs attributes
+select the interrupt vector counts programmed into endpoint config space.  At
+least one MSI or MSI-X vector must be configured before the function is bound
+to an endpoint controller.
+
+Transfer Addressing
+===================
+
+The host-side DMAengine client supplies the endpoint memory address as the
+DMA slave address.  For example, the ``dw-edma-pcie`` endpoint DMA metadata
+parser passes that slave address to the DMA controller as a raw endpoint-side
+address instead of translating it through a host PCI BAR resource.
+
+The host memory buffer used as the other side of the transfer is still mapped
+using the normal DMA mapping API on the host.
+
+Endpoint Controller Requirements
+================================
+
+The endpoint controller driver must expose the DMA controller register
+window and per-channel descriptor memories through the endpoint auxiliary
+resource API.  Endpoint controllers with other DMA register layouts also need
+matching metadata and host-side DMAengine driver support.
+
+If any DMA resource is not already host-visible through a fixed BAR, the
+endpoint controller must also support BAR subrange mapping and dynamic inbound
+mapping, because the DMA window BAR is assembled from those resources.
+
+Current Support
+===============
+
+The current host-side support is implemented in ``dw-edma-pcie`` for
+DesignWare eDMA unroll and HDMA compatible layouts.  Other PCIe controller
+DMA implementations need corresponding host-side DMAengine driver support.
+
+The ``dw-edma-pcie`` PCI ID table does not contain a generic endpoint DMA PCI
+ID entry.  Users need to bind the host-side driver explicitly using
+``driver_override``.
+
+The current metadata revision requires the exposed channels to be a dense
+prefix of the hardware channel numbers.
+
+Security Model
+==============
+
+The interface is intended for trusted endpoint/host deployments.  A delegated
+DMA channel can access endpoint memory addresses supplied by a host DMAengine
+client.
diff --git a/Documentation/PCI/endpoint/pci-dma-howto.rst b/Documentation/PCI/endpoint/pci-dma-howto.rst
new file mode 100644
index 000000000000..84f322881aa7
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-dma-howto.rst
@@ -0,0 +1,200 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================================
+PCI DMA Endpoint Function (EPF) User Guide
+==========================================
+
+:Author: Koichiro Den <den@valinux.co.jp>
+
+This guide shows how to configure the ``pci-epf-dma`` endpoint function driver.
+It uses ``dw-edma-pcie`` as the currently available host-side driver.  For the
+hardware model and layout see Documentation/PCI/endpoint/pci-dma-function.rst.
+
+Endpoint Device
+===============
+
+Endpoint Controller Devices
+---------------------------
+
+To find the list of endpoint controller devices in the system::
+
+	# ls /sys/class/pci_epc/
+	e65d0000.pcie-ep
+
+If ``PCI_ENDPOINT_CONFIGFS`` is enabled::
+
+	# ls /sys/kernel/config/pci_ep/controllers
+	e65d0000.pcie-ep
+
+Endpoint Function Drivers
+-------------------------
+
+To find the list of endpoint function drivers in the system::
+
+	# ls /sys/bus/pci-epf/drivers
+	pci_epf_dma  pci_epf_test
+
+If ``PCI_ENDPOINT_CONFIGFS`` is enabled::
+
+	# ls /sys/kernel/config/pci_ep/functions
+	pci_epf_dma  pci_epf_test
+
+Creating pci-epf-dma Device
+---------------------------
+
+Create a ``pci-epf-dma`` device with configfs::
+
+	# mount -t configfs none /sys/kernel/config
+	# cd /sys/kernel/config/pci_ep/
+	# mkdir functions/pci_epf_dma/dma0
+
+The "mkdir dma0" above creates the ``pci-epf-dma`` function device that will
+be probed by the ``pci_epf_dma`` driver.
+
+The PCI endpoint framework populates the directory with the common
+configurable fields::
+
+	# ls functions/pci_epf_dma/dma0
+	baseclass_code   msi_interrupts   progif_code    subsys_id
+	cache_line_size  msix_interrupts  revid          subsys_vendor_id
+	deviceid         pci_epf_dma.0    secondary      vendorid
+	interrupt_pin    primary          subclass_code
+
+The PCI DMA function driver also creates a function-specific sub-directory.
+The numeric suffix depends on the endpoint function instance number::
+
+	# ls functions/pci_epf_dma/dma0/pci_epf_dma.0/
+	dma_window_bar  metadata_bar  rd_chans  wr_chans
+
+Configuring pci-epf-dma Device
+------------------------------
+
+The host-side ``dw-edma-pcie`` PCI ID table does not contain a generic
+endpoint DMA PCI ID entry.  Choose a PCI vendor/device ID for the endpoint
+device::
+
+	# echo <vendor-id> > functions/pci_epf_dma/dma0/vendorid
+	# echo <device-id> > functions/pci_epf_dma/dma0/deviceid
+	# echo 1 > functions/pci_epf_dma/dma0/msi_interrupts
+
+The PCI class defaults to ``PCI_BASE_CLASS_SYSTEM`` and
+``PCI_CLASS_SYSTEM_DMA``.
+
+The function-specific attributes are:
+
+============== ============================================================
+Attribute      Description
+============== ============================================================
+metadata_bar   BAR used to publish the endpoint DMA metadata and handshake
+               bits.  It is kept as a stable BAR while the DMA windows are
+               programmed.  If this is left unset, the first usable BAR that
+               does not already contain a fixed DMA resource is used.
+dma_window_bar BAR used for DMA resources that are not already host-visible,
+               such as the DMA register window or descriptor windows.  This
+               BAR may be switched to subrange mapping after the host driver
+               has found the metadata.  If this is left unset and a DMA
+               window is needed, the first usable BAR different from
+               ``metadata_bar`` and not already occupied by a fixed DMA
+               resource is used.
+wr_chans       Number of endpoint-to-RC DMA write channels to expose.
+rd_chans       Number of RC-to-endpoint DMA read channels to expose.
+============== ============================================================
+
+A sample configuration for a DesignWare eDMA/HDMA compatible controller with
+two write channels and two read channels is given below::
+
+	# echo 0 > functions/pci_epf_dma/dma0/pci_epf_dma.0/metadata_bar
+	# echo 2 > functions/pci_epf_dma/dma0/pci_epf_dma.0/dma_window_bar
+	# echo 2 > functions/pci_epf_dma/dma0/pci_epf_dma.0/wr_chans
+	# echo 2 > functions/pci_epf_dma/dma0/pci_epf_dma.0/rd_chans
+
+``wr_chans`` and ``rd_chans`` default to 0.  At least one channel direction
+must be configured.  The selected channels are exposed in dense, zero-based
+order; for example, ``wr_chans = 2`` exposes write channels 0 and 1.
+Current DesignWare eDMA unroll and HDMA compatible support requires each
+exposed direction to be delegated as a whole, so set a direction to either 0 or
+the number of hardware channels in that direction.  If ``dma_window_bar`` is
+configured, it must be different from ``metadata_bar``.
+
+The common ``msi_interrupts`` and ``msix_interrupts`` attributes select the
+number of MSI and MSI-X vectors exposed to the host.  At least one MSI or
+MSI-X vector must be configured.
+
+The function-specific attributes can only be changed before the endpoint
+function is bound to an endpoint controller.
+
+Binding pci-epf-dma Device to EP Controller
+-------------------------------------------
+
+The DMA function device should be attached to a PCI endpoint controller
+connected to the host::
+
+	# ln -s controllers/e65d0000.pcie-ep \
+		functions/pci_epf_dma/dma0/primary/
+
+Once the above step is completed, the PCI endpoint controller is ready to
+establish a link with the host.
+
+Start the Link
+--------------
+
+Start the endpoint controller by writing 1 to ``start``::
+
+	# echo 1 > controllers/e65d0000.pcie-ep/start
+
+Root Complex Device
+===================
+
+lspci Output
+------------
+
+Note that the device listed here corresponds to the values populated in the
+endpoint configuration above::
+
+	# lspci -nk
+	01:00.1 0801: <vendor-id>:<device-id>
+
+If the host was already running while the endpoint function was configured,
+rescan the PCI bus after the endpoint side has completed the configfs setup
+and started the endpoint controller, if the platform supports it.
+
+Bind the endpoint DMA function to ``dw-edma-pcie`` explicitly with
+``driver_override``::
+
+	# modprobe dw_edma_pcie
+	# echo dw-edma-pcie > /sys/bus/pci/devices/0000:01:00.1/driver_override
+	# echo 0000:01:00.1 > /sys/bus/pci/drivers_probe
+
+The device should then be bound to ``dw-edma-pcie``::
+
+	# lspci -nk -s 01:00.1
+	01:00.1 0801: <vendor-id>:<device-id>
+		Kernel driver in use: dw-edma-pcie
+
+Using pci-epf-dma Device
+------------------------
+
+The host side software uses the standard Linux DMAengine API.  A DMAengine
+client driver running on the host must request one of the channels provided by
+``dw-edma-pcie`` and submit a transfer.
+
+For an endpoint-to-RC write transfer, the DMAengine client uses a host DMA
+buffer as the destination and an endpoint-side address as the slave source
+address.  For an RC-to-endpoint read transfer, the DMAengine client uses a
+host DMA buffer as the source and an endpoint-side address as the slave
+destination address.
+
+Troubleshooting
+===============
+
+``pci-epf-dma`` requires endpoint controller support for DMA auxiliary
+resources and MSI or MSI-X.  If any DMA resource must be mapped dynamically,
+the endpoint controller must also support BAR subrange mapping and dynamic
+inbound mapping.  Binding the function to an endpoint controller fails if the
+required capabilities are not available, or if both ``msi_interrupts`` and
+``msix_interrupts`` are zero.
+
+If ``dw-edma-pcie`` fails to probe on the host, check that the endpoint was
+bound to the host driver, that the endpoint BARs were assigned by PCI
+enumeration, and that the endpoint DMA metadata READY bit was set after any
+DMA window BAR submaps were programmed.
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH bpf-next v11 4/8] bpf: Introduce the bpf_list_del kfunc.
From: Kaitao Cheng @ 2026-05-21  6:59 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: bpf, ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor, linux-kernel, linux-doc, vmalik,
	linux-kselftest, martin.lau, clm, ihor.solodrai
In-Reply-To: <d1fb9e986434b92188b3b37514336794c49a807c0378badceda45c3c14bb9739@mail.kernel.org>



在 2026/5/21 12:08, bot+bpf-ci@kernel.org 写道:
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index 59855b434f0b..804c201c28f3 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
> 
> [ ... ]
> 
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index f3cf8d85bea0..35eebb5e7769 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
> 
> [ ... ]
> 
>> @@ -11671,7 +11674,8 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
>>
>>  	switch (node_field_type) {
>>  	case BPF_LIST_NODE:
>> -		ret = is_bpf_list_push_kfunc(kfunc_btf_id);
>> +		ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
>> +		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del];
> 
> This isn't a bug, but Eduard Zingerman suggested in v10 adding a
> is_bpf_list_del_kfunc() helper for consistency with the existing code style
> (similar to is_bpf_list_push_kfunc()). This would maintain the pattern used
> throughout the codebase for checking kfunc types.
> 
> Reference: https://lore.kernel.org/bpf/ca14a6fd8a1f3ce1e64c70b6d9ea2baf8df913ae.camel@gmail.com/

The reason for using is_bpf_list_push_kfunc() is that it groups multiple parallel
"kfunc_btf_id == special_kfunc_list[*]" checks into one helper, making the code
easier to reuse. If we added is_bpf_list_del_kfunc() here, it would only wrap a
single KF_bpf_list_del check, which is unnecessary. Also, more helpers such as
is_bpf_list_first/last may be added later, and overusing is_bpf_list_* helpers
would make the code more redundant.

> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26204125015

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v6 11/43] KVM: guest_memfd: Ensure pages are not in use before conversion
From: Fuad Tabba @ 2026-05-21  7:09 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-11-91ab5a8b19a4@google.com>

Hi Ackerley,

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When converting memory to private in guest_memfd, it is necessary to ensure
> that the pages are not currently being accessed by any other part of the
> kernel or userspace to avoid any current user writing to guest private
> memory.
>
> guest_memfd checks for unexpected refcounts to determine whether a page is
> still in use. The only expected refcounts after unmapping the range
> requested for conversion are those that are held by guest_memfd itself.
>
> Update the kvm_memory_attributes2 structure to include an error_offset
> field. This allows KVM to report the exact offset where a conversion
> failed to userspace. If the safety check fails, return -EAGAIN and copy
> the error_offset back to userspace so that it can potentially retry the
> operation or handle the failure gracefully.
>
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> ---
>  include/uapi/linux/kvm.h |  3 ++-
>  virt/kvm/guest_memfd.c   | 65 ++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 62 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index e6bbf68a83813..0b55258573d3d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1658,7 +1658,8 @@ struct kvm_memory_attributes2 {
>         __u64 size;
>         __u64 attributes;
>         __u64 flags;
> -       __u64 reserved[12];
> +       __u64 error_offset;
> +       __u64 reserved[11];
>  };
>
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 91e89b188f583..9d82642a025e9 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -572,9 +572,42 @@ static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
>         return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
>  }
>
> +static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
> +                                           size_t nr_pages, pgoff_t *err_index)
> +{
> +       struct address_space *mapping = inode->i_mapping;
> +       const int filemap_get_folios_refcount = 1;
> +       pgoff_t last = start + nr_pages - 1;
> +       struct folio_batch fbatch;
> +       bool safe = true;
> +       int i;
> +
> +       folio_batch_init(&fbatch);
> +       while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
> +
> +               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +                       struct folio *folio = fbatch.folios[i];
> +
> +                       if (folio_ref_count(folio) !=
> +                           folio_nr_pages(folio) + filemap_get_folios_refcount) {
> +                               safe = false;
> +                               *err_index = folio->index;
> +                               break;

https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=11

Sashiko raised a few issues here, but I think this one might be
genuine. Can you look into it please?

If that's right, when huge page support lands, if start falls in the
middle of a large folio, returning folio->index as the err_index will
return an offset strictly less than the requested start. A naive
userspace retry loop resuming from error_offset would step backwards
and corrupt attributes on memory it didn't intend to convert.
err_index should be clamped to max(start, folio->index).

Cheers,
/fuad

> +                       }
> +               }
> +
> +               folio_batch_release(&fbatch);
> +               cond_resched();
> +       }
> +
> +       return safe;
> +}
> +
>  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> -                                    size_t nr_pages, uint64_t attrs)
> +                                    size_t nr_pages, uint64_t attrs,
> +                                    pgoff_t *err_index)
>  {
> +       bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>         struct address_space *mapping = inode->i_mapping;
>         struct gmem_inode *gi = GMEM_I(inode);
>         pgoff_t end = start + nr_pages;
> @@ -588,8 +621,21 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
>
>         mas_init(&mas, mt, start);
>         r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
> -       if (r)
> +       if (r) {
> +               *err_index = start;
>                 goto out;
> +       }
> +
> +       if (to_private) {
> +               unmap_mapping_pages(mapping, start, nr_pages, false);
> +
> +               if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages,
> +                                                    err_index)) {
> +                       mas_destroy(&mas);
> +                       r = -EAGAIN;
> +                       goto out;
> +               }
> +       }
>
>         /*
>          * From this point on guest_memfd has performed necessary
> @@ -609,9 +655,10 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
>         struct gmem_file *f = file->private_data;
>         struct inode *inode = file_inode(file);
>         struct kvm_memory_attributes2 attrs;
> +       pgoff_t err_index;
>         size_t nr_pages;
>         pgoff_t index;
> -       int i;
> +       int i, r;
>
>         if (copy_from_user(&attrs, argp, sizeof(attrs)))
>                 return -EFAULT;
> @@ -635,8 +682,16 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
>
>         nr_pages = attrs.size >> PAGE_SHIFT;
>         index = attrs.offset >> PAGE_SHIFT;
> -       return __kvm_gmem_set_attributes(inode, index, nr_pages,
> -                                        attrs.attributes);
> +       r = __kvm_gmem_set_attributes(inode, index, nr_pages, attrs.attributes,
> +                                     &err_index);
> +       if (r) {
> +               attrs.error_offset = ((uint64_t)err_index) << PAGE_SHIFT;
> +
> +               if (copy_to_user(argp, &attrs, sizeof(attrs)))
> +                       return -EFAULT;
> +       }
> +
> +       return r;
>  }
>
>  static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 15/43] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: Fuad Tabba @ 2026-05-21  7:13 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-15-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When checking if a guest_memfd folio is safe for conversion, its refcount
> is examined. A folio may be present in a per-CPU lru_add fbatch, which
> temporarily increases its refcount. This can lead to a false positive,
> incorrectly indicating that the folio is in use and preventing the
> conversion, even if it is otherwise safe. The conversion process might not
> be on the same CPU that holds the folio in its fbatch, making a simple
> per-CPU check insufficient.
>
> To address this, drain all CPUs' lru_add fbatches if an unexpectedly high
> refcount is encountered during the safety check. This is performed at most
> once per conversion request. Draining only if the folio in question may be
> lru cached.
>
> guest_memfd folios are unevictable, so they can only reside in the lru_add
> fbatch. If the folio's refcount is still unsafe after draining, then the
> conversion is truly deemed unsafe.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Not an area I've worked with that much, but it seems right to me:

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad


> ---
>  mm/swap.c              |  2 ++
>  virt/kvm/guest_memfd.c | 18 ++++++++++++++----
>  2 files changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index 5cc44f0de9877..3134d9d3d7c30 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -37,6 +37,7 @@
>  #include <linux/page_idle.h>
>  #include <linux/local_lock.h>
>  #include <linux/buffer_head.h>
> +#include <linux/kvm_types.h>
>
>  #include "internal.h"
>
> @@ -904,6 +905,7 @@ void lru_add_drain_all(void)
>         lru_add_drain();
>  }
>  #endif /* CONFIG_SMP */
> +EXPORT_SYMBOL_FOR_KVM(lru_add_drain_all);
>
>  atomic_t lru_disable_count = ATOMIC_INIT(0);
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 034b72b4947fb..050a8c092b1a3 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -8,6 +8,7 @@
>  #include <linux/mempolicy.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
> +#include <linux/swap.h>
>
>  #include "kvm_mm.h"
>
> @@ -596,18 +597,27 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
>         const int filemap_get_folios_refcount = 1;
>         pgoff_t last = start + nr_pages - 1;
>         struct folio_batch fbatch;
> +       bool lru_drained = false;
>         bool safe = true;
>         int i;
>
>         folio_batch_init(&fbatch);
>         while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
>
> -               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +               for (i = 0; i < folio_batch_count(&fbatch);) {
>                         struct folio *folio = fbatch.folios[i];
>
> -                       if (folio_ref_count(folio) !=
> -                           folio_nr_pages(folio) + filemap_get_folios_refcount) {
> -                               safe = false;
> +                       safe = (folio_ref_count(folio) ==
> +                               folio_nr_pages(folio) +
> +                               filemap_get_folios_refcount);
> +
> +                       if (safe) {
> +                               ++i;
> +                       } else if (folio_may_be_lru_cached(folio) &&
> +                                  !lru_drained) {
> +                               lru_add_drain_all();
> +                               lru_drained = true;
> +                       } else {
>                                 *err_index = folio->index;
>                                 break;
>                         }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 05/43] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Fuad Tabba @ 2026-05-21  7:19 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CAEvNRgGQvMdDmVfbk42EY_PGN0ybTp-x21Zj+pg_X1mk9iCRtA@mail.gmail.com>

On Wed, 20 May 2026 at 22:44, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> >
> > [...snip...]
> >
> >> +unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> >> +{
> >> +       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
> >> +       struct inode *inode;
> >> +
> >> +       /*
> >> +        * If this gfn has no associated memslot, there's no chance of the gfn
> >> +        * being backed by private memory, since guest_memfd must be used for
> >> +        * private memory, and guest_memfd must be associated with some memslot.
> >> +        */
> >> +       if (!slot)
> >> +               return 0;
> >> +
> >> +       CLASS(gmem_get_file, file)(slot);
> >> +       if (!file)
> >> +               return 0;
> >> +
> >> +       inode = file_inode(file);
> >> +
> >> +       /*
> >> +        * Rely on the maple tree's internal RCU lock to ensure a
> >> +        * stable result. This result can become stale as soon as the
> >> +        * lock is dropped, so the caller _must_ still protect
> >> +        * consumption of private vs. shared by checking
> >> +        * mmu_invalidate_retry_gfn() under mmu_lock to serialize
> >> +        * against ongoing attribute updates.
> >> +        */
> >> +       return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
> >> +}
> >
> > Doesn't this imply that all consumers of kvm_mem_is_private() should
> > validate the result using mmu_lock and the invalidation sequence?
>
> Let me know how I can improve the comment.

Given Sean's context, the comment is good I think. I would quibble
with the the "_must_ still protect" phrasing being a bit too strict.

Maybe just soften it slightly to acknowledge the exception? Something like:

  * lock is dropped, so callers that require a strict result _must_ protect
  * consumption of private vs. shared by checking mmu_invalidate_retry_gfn()
  * under mmu_lock to serialize against ongoing attribute updates. Callers
  * doing lockless reads must be able to tolerate a stale result.

That aligns the comment with how KVM is actually using it today. That
said, this is nitpicking. Feel free to use or ignore.

>
> I think the "consumption" of private vs shared here actually means
> something like "don't commit a page being faulted into page tables based
> on the result of kvm_gmem_get_memory_attributes() without checking
> kvm->mmu_invalidate_in_progress.", since a racing conversion may
> complete before you commit.
>
> kvm_mem_is_private() is used from these places:
>
> 1. Fault handling in KVM, like page_fault_can_be_fast(),
>    kvm_mmu_faultin_pfn(), kvm_mmu_page_fault(): this already handles the
>    entire mmu_lock and invalidation dance. No fault will be committed if
>    a racing conversion happened after kvm_mem_is_private() but before
>    the commit.
>
> 2. kvm_mmu_max_mapping_level() from recovering huge pages after
>    disabling dirty logging: Other than that it can't be used with
>    guest_memfd now since dirty logging can't be used with guest_memfd
>    and guest_memfd memslots are not updatable, this holds mmu_lock
>    throughout until the huge page recovery is done. invalidate_begin
>    also involves zapping the pages in the range, so if the order of
>    events is
>
>    | Thread A                     | Thread B          |
>    |------------------------------|-------------------|
>    | invalidate_begin + zap       |                   |
>    | update attributes maple_tree | recover huge page |
>    | invalidate_end               |                   |
>
>    Then recovering will never see the zapped pages, nothing to
>    recover, no kvm_mem_is_private() lookup.
>
> 3. kvm_arch_vcpu_pre_fault_memory()
>
>    This eventually calls kvm_tdp_mmu_page_fault(), which checks
>    is_page_fault_stale(), so it does check before committing.
>
> Were there any other calls I missed?

The one I was looking at was `sev_handle_rmp_fault()`, which does a lockless
read without the retry loop. But as Sean just pointed out, that path can
tolerate false positives/negatives and relies on the guest faulting again,
so the lack of synchronization there is existing behavior and considered "fine".

>
> > sev_handle_rmp_fault() calls kvm_mem_is_private() without holding
> > mmu_lock and without any retry mechanism. Is that a problem?
> >
>
> Sean already replied on your actual question separately :)
>
> > Cheers,
> > /fuad
> >
> >
> >>
> >> [...snip...]
> >>

^ permalink raw reply

* [PATCH net-next 0/3] devlink: Add boot-time eswitch mode defaults
From: Tariq Toukan @ 2026-05-21  7:24 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Jonathan Corbet, Shuah Khan, Jiri Pirko, Simon Horman,
	Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Borislav Petkov (AMD), Andrew Morton, Randy Dunlap,
	Thomas Gleixner, Petr Mladek, Peter Zijlstra (Intel), Tejun Heo,
	Vlastimil Babka, Feng Tang, Christian Brauner, Dave Hansen,
	Dapeng Mi, Kees Cook, Marco Elver, Li RongQing, Eric Biggers,
	Paul E. McKenney, linux-doc, linux-kernel, netdev, linux-rdma,
	Gal Pressman, Dragos Tatulea, Jiri Pirko

Hi,

See detailed feature description by Mark below.

Regards,
Tariq


This series adds a devlink_eswitch_mode= kernel command line parameter
for applying a default devlink eswitch mode during device
initialization.

Following the discussion with Jakub[1] and the feedback on the RFC
postings, this version implements the direction that was agreed on: keep
the scope limited to a boot-time devlink eswitch mode default only.

The implementation is intended to support the following properties:

- A system may have multiple devlink devices that usually need the same
  configuration. For a configuration such as eswitch mode switchdev, a
  user can specify either all devlink devices or an explicit list of
  devices to which that mode applies.

- Deployments can set the devlink eswitch mode before normal userspace
  orchestration runs, while still using devlink concepts and driver
  callbacks rather than adding driver-specific module parameters.

A default is scoped to either all devlink handles or to a
comma-separated list of devlink handles, for example:

devlink_eswitch_mode=[*]:switchdev
devlink_eswitch_mode=[pci/0000:08:00.0,pci/0000:09:00.1]:switchdev_inactive

The supported modes are legacy, switchdev and switchdev_inactive.

mlx5 wires this into device initialization after the devlink instance is
registered and after mlx5 devlink operations are available, so eswitch
mode defaults can be applied to matching PCI devlink devices.

Patch 1 clears the mlx5 FW reset-in-progress bit before reloading after
a firmware reset.

Patch 2 adds the devlink eswitch mode boot-default parser, storage,
devl_apply_default_esw_mode() API and documentation for the
devlink_eswitch_mode= syntax.

Patch 3 calls devl_apply_default_esw_mode() from mlx5 device
initialization.

Changelog:

Since RFC v2:

- Replaced the generic devlink=[...]:esw:mode:<mode> command line API
  with devlink_eswitch_mode=[...]:<mode>.

- Simplified the parser to handle one eswitch mode parameter instead of
  a generic command/attribute grammar.

- Renamed devl_apply_defaults() to devl_apply_default_esw_mode().

- Added the mlx5 firmware reset cleanup as the first patch after the
  cover letter.

[1] https://lore.kernel.org/all/20260502184153.4fd8d06f@kernel.org/

RFC V1:
https://lore.kernel.org/all/20260506123739.1959770-1-mbloch@nvidia.com/

RFC V2:
https://lore.kernel.org/all/20260510185424.2041415-1-mbloch@nvidia.com/

Mark Bloch (3):
  net/mlx5: Clear FW reset-in-progress bit before reload
  devlink: Add eswitch mode boot defaults
  net/mlx5: Apply devlink default eswitch mode during init

 .../admin-guide/kernel-parameters.txt         |  25 ++
 .../networking/devlink/devlink-defaults.rst   |  80 ++++++
 Documentation/networking/devlink/index.rst    |   1 +
 .../ethernet/mellanox/mlx5/core/fw_reset.c    |  28 +-
 .../net/ethernet/mellanox/mlx5/core/main.c    |  17 ++
 include/net/devlink.h                         |   1 +
 net/devlink/core.c                            | 255 ++++++++++++++++++
 7 files changed, 396 insertions(+), 11 deletions(-)
 create mode 100644 Documentation/networking/devlink/devlink-defaults.rst


base-commit: 9bf93cb2e180a58d5984ba13daee95903ff4fc14
-- 
2.44.0


^ permalink raw reply

* [PATCH net-next 1/3] net/mlx5: Clear FW reset-in-progress bit before reload
From: Tariq Toukan @ 2026-05-21  7:24 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Jonathan Corbet, Shuah Khan, Jiri Pirko, Simon Horman,
	Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Borislav Petkov (AMD), Andrew Morton, Randy Dunlap,
	Thomas Gleixner, Petr Mladek, Peter Zijlstra (Intel), Tejun Heo,
	Vlastimil Babka, Feng Tang, Christian Brauner, Dave Hansen,
	Dapeng Mi, Kees Cook, Marco Elver, Li RongQing, Eric Biggers,
	Paul E. McKenney, linux-doc, linux-kernel, netdev, linux-rdma,
	Gal Pressman, Dragos Tatulea, Jiri Pirko, Shay Drori,
	Moshe Shemesh
In-Reply-To: <20260521072434.362624-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

mlx5 sets MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS when acknowledging a
sync reset request. This bit blocks devlink reload and other devlink
operations while the firmware reset is running, but it was kept set
until after the driver reload finished.

Clear the reset-in-progress bit once the reset unload flow is done and
PCI access is back, before reloading the device. For a reset initiated
through devlink, clear it before completing the reload waiter. For a
reset reported through an asynchronous firmware event, keep the unload
flow outside devl_lock, then take devl_lock before clearing the bit and
reloading through the devl-locked load helper.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/fw_reset.c    | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 07440c58713a..7283e5b49eed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -238,24 +238,30 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
 	struct devlink *devlink = priv_to_devlink(dev);
+	int err;
 
 	/* if this is the driver that initiated the fw reset, devlink completed the reload */
 	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
+		clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS,
+			  &fw_reset->reset_flags);
 		complete(&fw_reset->done);
-	} else {
-		mlx5_sync_reset_unload_flow(dev, false);
-		if (mlx5_health_wait_pci_up(dev))
-			mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
-		else
-			mlx5_load_one(dev, true);
-		devl_lock(devlink);
-		devlink_remote_reload_actions_performed(devlink, 0,
-							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
-							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
-		devl_unlock(devlink);
+		return;
 	}
 
+	mlx5_sync_reset_unload_flow(dev, false);
+	err = mlx5_health_wait_pci_up(dev);
+
+	devl_lock(devlink);
 	clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags);
+	if (err)
+		mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
+	else
+		mlx5_load_one_devl_locked(dev, true);
+
+	devlink_remote_reload_actions_performed(devlink, 0,
+						BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
+						BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
+	devl_unlock(devlink);
 }
 
 static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 3/3] net/mlx5: Apply devlink default eswitch mode during init
From: Tariq Toukan @ 2026-05-21  7:24 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Jonathan Corbet, Shuah Khan, Jiri Pirko, Simon Horman,
	Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Borislav Petkov (AMD), Andrew Morton, Randy Dunlap,
	Thomas Gleixner, Petr Mladek, Peter Zijlstra (Intel), Tejun Heo,
	Vlastimil Babka, Feng Tang, Christian Brauner, Dave Hansen,
	Dapeng Mi, Kees Cook, Marco Elver, Li RongQing, Eric Biggers,
	Paul E. McKenney, linux-doc, linux-kernel, netdev, linux-rdma,
	Gal Pressman, Dragos Tatulea, Jiri Pirko, Shay Drori,
	Moshe Shemesh
In-Reply-To: <20260521072434.362624-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Apply devlink default eswitch mode for mlx5 devices after successful
device initialization while holding the devlink instance lock.

At this point the devlink instance is registered and the mlx5 devlink
operations are available, so the default eswitch mode can be applied to
the matching PCI devlink handle.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0c6e4efe38c8..4528097f3d84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1391,6 +1391,21 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_free_bfreg(dev, &dev->priv.bfreg);
 }
 
+static void mlx5_devl_apply_default_esw_mode(struct mlx5_core_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(dev);
+	int err;
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return;
+
+	devl_assert_locked(devlink);
+	err = devl_apply_default_esw_mode(devlink);
+	if (err)
+		mlx5_core_warn(dev, "Couldn't apply default eswitch mode, err %d\n",
+			       err);
+}
+
 int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
 {
 	bool light_probe = mlx5_dev_is_lightweight(dev);
@@ -1437,6 +1452,7 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
 		mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
 
 	mutex_unlock(&dev->intf_state_mutex);
+	mlx5_devl_apply_default_esw_mode(dev);
 	return 0;
 
 err_register:
@@ -1538,6 +1554,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
 		goto err_attach;
 
 	mutex_unlock(&dev->intf_state_mutex);
+	mlx5_devl_apply_default_esw_mode(dev);
 	return 0;
 
 err_attach:
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 2/3] devlink: Add eswitch mode boot defaults
From: Tariq Toukan @ 2026-05-21  7:24 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Jonathan Corbet, Shuah Khan, Jiri Pirko, Simon Horman,
	Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Borislav Petkov (AMD), Andrew Morton, Randy Dunlap,
	Thomas Gleixner, Petr Mladek, Peter Zijlstra (Intel), Tejun Heo,
	Vlastimil Babka, Feng Tang, Christian Brauner, Dave Hansen,
	Dapeng Mi, Kees Cook, Marco Elver, Li RongQing, Eric Biggers,
	Paul E. McKenney, linux-doc, linux-kernel, netdev, linux-rdma,
	Gal Pressman, Dragos Tatulea, Jiri Pirko
In-Reply-To: <20260521072434.362624-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Add devlink_eswitch_mode= command line support for setting an eswitch
mode during device initialization.

The supported syntax selects either all devlink handles or one explicit
comma-separated handle list:

  devlink_eswitch_mode=[*]:<mode>
  devlink_eswitch_mode=[<handle>[,<handle>...]]:<mode>

where <mode> is one of legacy, switchdev or switchdev_inactive. All
selected handles receive the same mode. Assigning different modes to
different handle lists in the same parameter value is not supported.

The default is applied through the existing eswitch_mode_set() devlink
operation, matching the userspace devlink eswitch set command.

Expose devl_apply_default_esw_mode() so drivers can apply the default at
the point where their devlink instance and eswitch operations are ready.

Document the devlink_eswitch_mode= syntax and duplicate handle handling.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../admin-guide/kernel-parameters.txt         |  25 ++
 .../networking/devlink/devlink-defaults.rst   |  80 ++++++
 Documentation/networking/devlink/index.rst    |   1 +
 include/net/devlink.h                         |   1 +
 net/devlink/core.c                            | 255 ++++++++++++++++++
 5 files changed, 362 insertions(+)
 create mode 100644 Documentation/networking/devlink/devlink-defaults.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7834ee927310..f87ae561c0dc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1278,6 +1278,31 @@ Kernel parameters
 	dell_smm_hwmon.fan_max=
 			[HW] Maximum configurable fan speed.
 
+	devlink_eswitch_mode=
+			[NET]
+			Format:
+			[<selector>]:<mode>
+
+			<selector>:
+			* | <handle>[,<handle>...]
+
+			<handle>:
+			<bus-name>/<dev-name>
+
+			Configure default devlink eswitch mode for matching
+			devlink instances during device initialization.
+
+			<mode>:
+			legacy | switchdev | switchdev_inactive
+
+			Examples:
+			devlink_eswitch_mode=[*]:switchdev
+			devlink_eswitch_mode=[pci/0000:08:00.0]:switchdev
+			devlink_eswitch_mode=[pci/0000:08:00.0,pci/0000:09:00.1]:legacy
+
+			See Documentation/networking/devlink/devlink-defaults.rst
+			for the full syntax.
+
 	dfltcc=		[HW,S390]
 			Format: { on | off | def_only | inf_only | always }
 			on:       s390 zlib hardware support for compression on
diff --git a/Documentation/networking/devlink/devlink-defaults.rst b/Documentation/networking/devlink/devlink-defaults.rst
new file mode 100644
index 000000000000..b554e75eeeea
--- /dev/null
+++ b/Documentation/networking/devlink/devlink-defaults.rst
@@ -0,0 +1,80 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Devlink Eswitch Mode Defaults
+==============================
+
+Devlink eswitch mode defaults allow the eswitch mode to be provided on the
+kernel command line and applied to matching devlink instances during device
+initialization.
+
+The devlink device is selected by its devlink handle. For PCI devices this is
+the same handle shown by ``devlink dev show``, for example
+``pci/0000:08:00.0``.
+
+Kernel command line syntax
+==========================
+
+Defaults are specified with the ``devlink_eswitch_mode=`` kernel command line
+parameter.
+
+The general syntax is::
+
+  devlink_eswitch_mode=[<selector>]:<mode>
+
+``<selector>`` is either ``*`` or one or more devlink handles::
+
+  * | <bus-name>/<dev-name>[,<bus-name>/<dev-name>...]
+
+``*`` applies the mode to every devlink instance. All handles in the same
+``[]`` list receive the same eswitch mode.
+
+``<mode>`` is one of ``legacy``, ``switchdev`` or ``switchdev_inactive``.
+
+Syntax rules
+------------
+
+The following syntax rules apply:
+
+* Specify the default in one ``devlink_eswitch_mode=`` parameter. Repeated
+  ``devlink_eswitch_mode=`` parameters are not accumulated.
+* The ``devlink_eswitch_mode=`` value is limited by the kernel command line
+  size.
+* Whitespace is not allowed within the parameter value.
+* ``<selector>`` must be either ``*`` or a handle list. ``*`` cannot be
+  combined with explicit handles.
+* ``<bus-name>`` and ``<dev-name>`` must not be empty.
+* ``<bus-name>`` must not contain ``:``.
+* ``<dev-name>`` may contain ``:``. This allows PCI names such as
+  ``0000:08:00.0``.
+* Handles must not contain whitespace, ``[``, ``]``, ``*`` or more than one
+  ``/``.
+* A comma inside ``[]`` separates handles.
+* Comma-separated default groups are not supported.
+* Duplicate handles are rejected and the devlink eswitch mode default is
+  ignored.
+
+The eswitch mode default corresponds to the userspace command::
+
+  devlink dev eswitch set <handle> mode <value>
+
+
+Examples
+========
+
+Set all devlink instances to switchdev mode::
+
+  devlink_eswitch_mode=[*]:switchdev
+
+Set one PCI devlink instance to switchdev mode::
+
+  devlink_eswitch_mode=[pci/0000:08:00.0]:switchdev
+
+Set two PCI devlink instances to legacy mode::
+
+  devlink_eswitch_mode=[pci/0000:08:00.0,pci/0000:09:00.1]:legacy
+
+The following is invalid because comma-separated default groups are not
+supported::
+
+  devlink_eswitch_mode=[pci/0000:08:00.0]:switchdev,[pci/0000:09:00.0]:switchdev_inactive
diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst
index f7ba7dcf477d..0d27a7008b14 100644
--- a/Documentation/networking/devlink/index.rst
+++ b/Documentation/networking/devlink/index.rst
@@ -56,6 +56,7 @@ general.
    :maxdepth: 1
 
    devlink-dpipe
+   devlink-defaults
    devlink-eswitch-attr
    devlink-flash
    devlink-health
diff --git a/include/net/devlink.h b/include/net/devlink.h
index bcd31de1f890..98885f7c6c10 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1622,6 +1622,7 @@ int devl_trylock(struct devlink *devlink);
 void devl_unlock(struct devlink *devlink);
 void devl_assert_locked(struct devlink *devlink);
 bool devl_lock_is_held(struct devlink *devlink);
+int devl_apply_default_esw_mode(struct devlink *devlink);
 DEFINE_GUARD(devl, struct devlink *, devl_lock(_T), devl_unlock(_T));
 
 struct ib_device;
diff --git a/net/devlink/core.c b/net/devlink/core.c
index eeb6a71f5f56..4bc1734878d1 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -4,6 +4,10 @@
  * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
  */
 
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 #include <net/genetlink.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/devlink.h>
@@ -16,6 +20,233 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
 
 DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
 
+static char *devlink_default_esw_mode_param;
+static bool devlink_default_esw_mode_match_all;
+static enum devlink_eswitch_mode devlink_default_esw_mode;
+static LIST_HEAD(devlink_default_esw_mode_nodes);
+
+struct devlink_default_esw_mode_node {
+	struct list_head list;
+	char *bus_name;
+	char *dev_name;
+};
+
+static int __init
+devlink_default_esw_mode_to_value(const char *str,
+				  enum devlink_eswitch_mode *mode)
+{
+	if (!strcmp(str, "legacy")) {
+		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
+		return 0;
+	}
+	if (!strcmp(str, "switchdev")) {
+		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
+		return 0;
+	}
+	if (!strcmp(str, "switchdev_inactive")) {
+		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int devlink_default_esw_mode_apply(struct devlink *devlink)
+{
+	const struct devlink_ops *ops = devlink->ops;
+
+	if (!ops->eswitch_mode_set)
+		return -EOPNOTSUPP;
+
+	return ops->eswitch_mode_set(devlink, devlink_default_esw_mode,
+				     NULL);
+}
+
+static int __init
+devlink_default_esw_mode_handle_parse(char *handle, char **bus_name,
+				      char **dev_name)
+{
+	char *slash;
+	char *p;
+
+	if (!handle || !*handle)
+		return -EINVAL;
+
+	for (p = handle; *p; p++) {
+		if (*p == '[' || *p == ']' || *p == '*')
+			return -EINVAL;
+	}
+
+	slash = strchr(handle, '/');
+	if (!slash || slash == handle || !slash[1])
+		return -EINVAL;
+	if (strchr(slash + 1, '/'))
+		return -EINVAL;
+
+	*slash = '\0';
+	if (strchr(handle, ':'))
+		return -EINVAL;
+
+	*bus_name = handle;
+	*dev_name = slash + 1;
+	return 0;
+}
+
+static struct devlink_default_esw_mode_node *
+devlink_default_esw_mode_node_find(const char *bus_name, const char *dev_name)
+{
+	struct devlink_default_esw_mode_node *node;
+
+	list_for_each_entry(node, &devlink_default_esw_mode_nodes, list) {
+		if (!strcmp(node->bus_name, bus_name) &&
+		    !strcmp(node->dev_name, dev_name))
+			return node;
+	}
+
+	return NULL;
+}
+
+static int __init
+devlink_default_esw_mode_node_add(const char *bus_name, const char *dev_name)
+{
+	struct devlink_default_esw_mode_node *node;
+
+	if (devlink_default_esw_mode_node_find(bus_name, dev_name))
+		return -EEXIST;
+
+	node = kzalloc_obj(*node);
+	if (!node)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&node->list);
+	node->bus_name = kstrdup(bus_name, GFP_KERNEL);
+	node->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	if (!node->bus_name || !node->dev_name) {
+		kfree(node->bus_name);
+		kfree(node->dev_name);
+		kfree(node);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&node->list, &devlink_default_esw_mode_nodes);
+	return 0;
+}
+
+static int __init devlink_default_esw_mode_handles_parse(char *handles)
+{
+	char *handle;
+	int err;
+
+	if (!strcmp(handles, "*")) {
+		devlink_default_esw_mode_match_all = true;
+		return 0;
+	}
+
+	while ((handle = strsep(&handles, ",")) != NULL) {
+		char *bus_name;
+		char *dev_name;
+
+		err = devlink_default_esw_mode_handle_parse(handle, &bus_name,
+							    &dev_name);
+		if (err)
+			return err;
+
+		err = devlink_default_esw_mode_node_add(bus_name, dev_name);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void __init
+devlink_default_esw_mode_node_free(struct devlink_default_esw_mode_node *node)
+{
+	kfree(node->bus_name);
+	kfree(node->dev_name);
+	kfree(node);
+}
+
+static void __init devlink_default_esw_mode_nodes_clear(void)
+{
+	struct devlink_default_esw_mode_node *node;
+	struct devlink_default_esw_mode_node *node_tmp;
+
+	list_for_each_entry_safe(node, node_tmp,
+				 &devlink_default_esw_mode_nodes, list) {
+		list_del(&node->list);
+		devlink_default_esw_mode_node_free(node);
+	}
+
+	devlink_default_esw_mode_match_all = false;
+}
+
+static int __init devlink_default_esw_mode_parse(char *str)
+{
+	char *handles_end;
+	char *handles;
+	char *mode;
+	int err;
+
+	if (!str || *str != '[')
+		return -EINVAL;
+
+	handles = str + 1;
+	handles_end = strchr(handles, ']');
+	if (!handles_end || handles_end[1] != ':' || !handles_end[2])
+		return -EINVAL;
+
+	*handles_end = '\0';
+	mode = handles_end + 2;
+	if (!*handles)
+		return -EINVAL;
+
+	err = devlink_default_esw_mode_to_value(mode,
+						&devlink_default_esw_mode);
+	if (err)
+		return err;
+
+	err = devlink_default_esw_mode_handles_parse(handles);
+	if (err)
+		devlink_default_esw_mode_nodes_clear();
+
+	return err;
+}
+
+/**
+ * devl_apply_default_esw_mode - Apply default eswitch mode to devlink instance
+ * @devlink: devlink
+ *
+ * The caller must hold the devlink instance lock.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int devl_apply_default_esw_mode(struct devlink *devlink)
+{
+	const char *bus_name = devlink_bus_name(devlink);
+	const char *dev_name = devlink_dev_name(devlink);
+	struct devlink_default_esw_mode_node *node;
+
+	devl_assert_locked(devlink);
+
+	if (devlink_default_esw_mode_match_all)
+		return devlink_default_esw_mode_apply(devlink);
+
+	node = devlink_default_esw_mode_node_find(bus_name, dev_name);
+	if (node)
+		return devlink_default_esw_mode_apply(devlink);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_apply_default_esw_mode);
+
+static int __init devlink_default_esw_mode_setup(char *str)
+{
+	devlink_default_esw_mode_param = str;
+	return 1;
+}
+__setup("devlink_eswitch_mode=", devlink_default_esw_mode_setup);
+
 static struct devlink *devlinks_xa_get(unsigned long index)
 {
 	struct devlink *devlink;
@@ -578,6 +809,27 @@ static int __init devlink_init(void)
 {
 	int err;
 
+	if (devlink_default_esw_mode_param) {
+		char *def;
+
+		def = kstrdup(devlink_default_esw_mode_param, GFP_KERNEL);
+		if (!def) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = devlink_default_esw_mode_parse(def);
+		kfree(def);
+		if (err == -EEXIST) {
+			devlink_default_esw_mode_param = NULL;
+			pr_warn("devlink: duplicate eswitch mode handles ignored\n");
+		} else if (err == -EINVAL) {
+			devlink_default_esw_mode_param = NULL;
+			pr_warn("devlink: invalid devlink_eswitch_mode parameter ignored\n");
+		} else if (err) {
+			goto out;
+		}
+	}
+
 	err = register_pernet_subsys(&devlink_pernet_ops);
 	if (err)
 		goto out;
@@ -593,7 +845,10 @@ static int __init devlink_init(void)
 out_unreg_pernet_subsys:
 	unregister_pernet_subsys(&devlink_pernet_ops);
 out:
+	if (err)
+		devlink_default_esw_mode_nodes_clear();
 	WARN_ON(err);
+
 	return err;
 }
 
-- 
2.44.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox