Linux Documentation
 help / color / mirror / Atom feed
* [PATCH bpf-next v11 8/8] selftests/bpf: Add test cases for bpf_list_del/add/is_first/is_last/empty
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Extend refcounted_kptr with tests for bpf_list_add (including prev from
bpf_list_front and bpf_refcount_acquire), bpf_list_del (including node
from bpf_list_front, bpf_rbtree_remove and bpf_refcount_acquire),
bpf_list_empty, bpf_list_is_first/last, and push_back on uninit head.

Also add negative tests that expect the verifier to reject calls to
bpf_list_del/add made without holding the spin_lock that protects the
list head.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
---
 .../selftests/bpf/progs/refcounted_kptr.c     | 421 ++++++++++++++++++
 1 file changed, 421 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index c847398837cc..13de169ad68f 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -367,6 +367,427 @@ long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx)		\
 INSERT_STASH_READ(true, "insert_stash_read: remove from tree");
 INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree");
 
+SEC("tc")
+__description("list_empty_test: list empty before add, non-empty after add")
+__success __retval(0)
+int list_empty_test(void *ctx)
+{
+	struct node_data *node_new;
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -1;
+	}
+	bpf_spin_unlock(&lock);
+
+	node_new = bpf_obj_new(typeof(*node_new));
+	if (!node_new)
+		return -2;
+
+	bpf_spin_lock(&lock);
+	bpf_list_push_front(&head, &node_new->l);
+
+	if (bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -3;
+	}
+	bpf_spin_unlock(&lock);
+	return 0;
+}
+
+static struct node_data *__add_in_list(struct bpf_list_head *head,
+				       struct bpf_spin_lock *lock)
+{
+	struct node_data *node_new, *node_ref;
+
+	node_new = bpf_obj_new(typeof(*node_new));
+	if (!node_new)
+		return NULL;
+
+	node_ref = bpf_refcount_acquire(node_new);
+
+	bpf_spin_lock(lock);
+	bpf_list_push_front(head, &node_new->l);
+	bpf_spin_unlock(lock);
+	return node_ref;
+}
+
+SEC("tc")
+__description("list_is_edge_test1: is_first on first node, is_last on last node")
+__success __retval(0)
+int list_is_edge_test1(void *ctx)
+{
+	struct node_data *node_first, *node_last;
+	int err = 0;
+
+	node_last = __add_in_list(&head, &lock);
+	if (!node_last)
+		return -1;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first) {
+		bpf_obj_drop(node_last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_is_first(&head, &node_first->l)) {
+		err = -3;
+		goto fail;
+	}
+	if (!bpf_list_is_last(&head, &node_last->l))
+		err = -4;
+
+fail:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(node_first);
+	bpf_obj_drop(node_last);
+	return err;
+}
+
+SEC("tc")
+__description("list_is_edge_test2: accept list_front/list_back return value")
+__success __retval(0)
+int list_is_edge_test2(void *ctx)
+{
+	struct bpf_list_node *front, *back;
+	struct node_data *a, *b;
+	long err = 0;
+
+	a = __add_in_list(&head, &lock);
+	if (!a)
+		return -1;
+
+	b = __add_in_list(&head, &lock);
+	if (!b) {
+		bpf_obj_drop(a);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	front = bpf_list_front(&head);
+	back = bpf_list_back(&head);
+	if (!front || !back) {
+		err = -3;
+		goto out_unlock;
+	}
+
+	if (!bpf_list_is_first(&head, front) || bpf_list_is_last(&head, front)) {
+		err = -4;
+		goto out_unlock;
+	}
+
+	if (!bpf_list_is_last(&head, back) || bpf_list_is_first(&head, back)) {
+		err = -5;
+		goto out_unlock;
+	}
+
+out_unlock:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(a);
+	bpf_obj_drop(b);
+	return err;
+}
+
+SEC("tc")
+__description("list_is_edge_test3: single node is both first and last")
+__success __retval(0)
+int list_is_edge_test3(void *ctx)
+{
+	struct node_data *tmp;
+	struct bpf_list_node *node;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	node = bpf_list_front(&head);
+	if (!node) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(tmp);
+		return -2;
+	}
+
+	if (!bpf_list_is_first(&head, node) || !bpf_list_is_last(&head, node))
+		err = -3;
+	bpf_spin_unlock(&lock);
+
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+SEC("tc")
+__description("list_del_test1: del returns removed nodes")
+__success __retval(0)
+int list_del_test1(void *ctx)
+{
+	struct node_data *node_first, *node_last;
+	struct bpf_list_node *bpf_node_first, *bpf_node_last;
+	int err = 0;
+
+	node_last = __add_in_list(&head, &lock);
+	if (!node_last)
+		return -1;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first) {
+		bpf_obj_drop(node_last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	bpf_node_last = bpf_list_del(&head, &node_last->l);
+	bpf_node_first = bpf_list_del(&head, &node_first->l);
+	bpf_spin_unlock(&lock);
+
+	if (bpf_node_first)
+		bpf_obj_drop(container_of(bpf_node_first, struct node_data, l));
+	else
+		err = -3;
+
+	if (bpf_node_last)
+		bpf_obj_drop(container_of(bpf_node_last, struct node_data, l));
+	else
+		err = -4;
+
+	bpf_obj_drop(node_first);
+	bpf_obj_drop(node_last);
+	return err;
+}
+
+SEC("tc")
+__description("list_del_test2: remove an arbitrary node from the list")
+__success __retval(0)
+int list_del_test2(void *ctx)
+{
+	struct bpf_rb_node *rb;
+	struct bpf_list_node *l;
+	struct node_data *n;
+	long err;
+
+	err = __insert_in_tree_and_list(&head, &root, &lock);
+	if (err)
+		return err;
+
+	bpf_spin_lock(&lock);
+	rb = bpf_rbtree_first(&root);
+	if (!rb) {
+		bpf_spin_unlock(&lock);
+		return -4;
+	}
+
+	rb = bpf_rbtree_remove(&root, rb);
+	if (!rb) {
+		bpf_spin_unlock(&lock);
+		return -5;
+	}
+
+	n = container_of(rb, struct node_data, r);
+	l = bpf_list_del(&head, &n->l);
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(n);
+	if (!l)
+		return -6;
+
+	bpf_obj_drop(container_of(l, struct node_data, l));
+	return 0;
+}
+
+SEC("tc")
+__description("list_del_test3: list_del accepts list_front return value as node")
+__success __retval(0)
+int list_del_test3(void *ctx)
+{
+	struct node_data *tmp;
+	struct bpf_list_node *bpf_node, *l;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	bpf_node = bpf_list_front(&head);
+	if (!bpf_node) {
+		bpf_spin_unlock(&lock);
+		err = -2;
+		goto fail;
+	}
+
+	l = bpf_list_del(&head, bpf_node);
+	bpf_spin_unlock(&lock);
+	if (!l) {
+		err = -3;
+		goto fail;
+	}
+
+	bpf_obj_drop(container_of(l, struct node_data, l));
+	bpf_obj_drop(tmp);
+	return 0;
+
+fail:
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+SEC("tc")
+__description("list_add_test1: insert new node after prev")
+__success __retval(0)
+int list_add_test1(void *ctx)
+{
+	struct node_data *node_first;
+	struct node_data *new_node;
+	long err = 0;
+
+	node_first = __add_in_list(&head, &lock);
+	if (!node_first)
+		return -1;
+
+	new_node = bpf_obj_new(typeof(*new_node));
+	if (!new_node) {
+		err = -2;
+		goto fail;
+	}
+
+	bpf_spin_lock(&lock);
+	err = bpf_list_add(&head, &new_node->l, &node_first->l);
+	bpf_spin_unlock(&lock);
+	if (err) {
+		err = -3;
+		goto fail;
+	}
+
+fail:
+	bpf_obj_drop(node_first);
+	return err;
+}
+
+SEC("tc")
+__description("list_add_test2: list_add accepts list_front return value as prev")
+__success __retval(0)
+int list_add_test2(void *ctx)
+{
+	struct node_data *new_node, *tmp;
+	struct bpf_list_node *bpf_node;
+	long err = 0;
+
+	tmp = __add_in_list(&head, &lock);
+	if (!tmp)
+		return -1;
+
+	new_node = bpf_obj_new(typeof(*new_node));
+	if (!new_node) {
+		err = -2;
+		goto fail;
+	}
+
+	bpf_spin_lock(&lock);
+	bpf_node = bpf_list_front(&head);
+	if (!bpf_node) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(new_node);
+		err = -3;
+		goto fail;
+	}
+
+	err = bpf_list_add(&head, &new_node->l, bpf_node);
+	bpf_spin_unlock(&lock);
+	if (err) {
+		err = -4;
+		goto fail;
+	}
+
+fail:
+	bpf_obj_drop(tmp);
+	return err;
+}
+
+struct uninit_head_val {
+	struct bpf_spin_lock lock;
+	struct bpf_list_head head __contains(node_data, l);
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct uninit_head_val);
+	__uint(max_entries, 1);
+} uninit_head_map SEC(".maps");
+
+SEC("tc")
+__description("list_push_back_uninit_head: push_back on 0-initialized list head")
+__success __retval(0)
+int list_push_back_uninit_head(void *ctx)
+{
+	struct uninit_head_val *st;
+	struct node_data *node;
+	int ret = -1, key = 0;
+
+	st = bpf_map_lookup_elem(&uninit_head_map, &key);
+	if (!st)
+		return -1;
+
+	node = bpf_obj_new(typeof(*node));
+	if (!node)
+		return -1;
+
+	bpf_spin_lock(&st->lock);
+	ret = bpf_list_push_back(&st->head, &node->l);
+	bpf_spin_unlock(&st->lock);
+
+	return ret;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_del_without_lock_fail(void *ctx)
+{
+	struct node_data *n;
+	struct bpf_list_node *l;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	/* Error case: delete list node without holding lock */
+	l = bpf_list_del(&head, &n->l);
+	bpf_obj_drop(n);
+	if (!l)
+		return -2;
+	bpf_obj_drop(container_of(l, struct node_data, l));
+
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_add_without_lock_fail(void *ctx)
+{
+	struct node_data *n, *prev;
+	long err;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	prev = bpf_obj_new(typeof(*prev));
+	if (!prev) {
+		bpf_obj_drop(n);
+		return -1;
+	}
+
+	/* Error case: add list node without holding lock */
+	err = bpf_list_add(&head, &n->l, &prev->l);
+	bpf_obj_drop(prev);
+	if (err)
+		return -2;
+
+	return 0;
+}
+
 SEC("tc")
 __success
 long rbtree_refcounted_node_ref_escapes(void *ctx)
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 7/8] bpf: add bpf_list_is_first/last/empty kfuncs
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest,
	Emil Tsalapatis
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Add three kfuncs for BPF linked list queries:
- bpf_list_is_first(head, node): true if node is the first in the list.
- bpf_list_is_last(head, node): true if node is the last in the list.
- bpf_list_empty(head): true if the list has no entries.

Without these kfuncs, a program has to call bpf_list_pop_front/back to
retrieve the first or last node, compare it with the node in question,
and then push it back into the list with bpf_list_push_front/back,
which is very inefficient.

Now, with the bpf_list_is_first/last/empty kfuncs, we can directly
check whether a node is the first, last, or whether the list is empty,
without having to first retrieve the node.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
 kernel/bpf/helpers.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 15 +++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 89579165ef4d..b6c3d02d5593 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2656,6 +2656,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
 	return (struct bpf_list_node *)h->prev;
 }
 
+__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head,
+				   struct bpf_list_node *node__nonown_allowed)
+{
+	struct list_head *h = (struct list_head *)head;
+	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
+
+	if (READ_ONCE(kn->owner) != head)
+		return false;
+
+	return list_is_first(&kn->list_head, h);
+}
+
+__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head,
+				  struct bpf_list_node *node__nonown_allowed)
+{
+	struct list_head *h = (struct list_head *)head;
+	struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed;
+
+	if (READ_ONCE(kn->owner) != head)
+		return false;
+
+	return list_is_last(&kn->list_head, h);
+}
+
+__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head)
+{
+	struct list_head *h = (struct list_head *)head;
+
+	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+	 * called on its fields, so init here
+	 */
+	if (unlikely(!h->next))
+		INIT_LIST_HEAD(h);
+
+	return list_empty(h);
+}
+
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 						  struct bpf_rb_node *node)
 {
@@ -4772,6 +4809,9 @@ BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_is_first)
+BTF_ID_FLAGS(func, bpf_list_is_last)
+BTF_ID_FLAGS(func, bpf_list_empty)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 662ad7312697..d9bdc3b32c05 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10965,6 +10965,9 @@ enum special_kfunc_type {
 	KF_bpf_list_del,
 	KF_bpf_list_front,
 	KF_bpf_list_back,
+	KF_bpf_list_is_first,
+	KF_bpf_list_is_last,
+	KF_bpf_list_empty,
 	KF_bpf_cast_to_kern_ctx,
 	KF_bpf_rdonly_cast,
 	KF_bpf_rcu_read_lock,
@@ -11035,6 +11038,9 @@ BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_list_del)
 BTF_ID(func, bpf_list_front)
 BTF_ID(func, bpf_list_back)
+BTF_ID(func, bpf_list_is_first)
+BTF_ID(func, bpf_list_is_last)
+BTF_ID(func, bpf_list_empty)
 BTF_ID(func, bpf_cast_to_kern_ctx)
 BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rcu_read_lock)
@@ -11556,7 +11562,10 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_del] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_front] ||
-	       btf_id == special_kfunc_list[KF_bpf_list_back];
+	       btf_id == special_kfunc_list[KF_bpf_list_back] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_is_last] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_empty];
 }
 
 static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
@@ -11678,7 +11687,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 	switch (node_field_type) {
 	case BPF_LIST_NODE:
 		ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
-		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del];
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last];
 		break;
 	case BPF_RB_NODE:
 		ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 6/8] bpf: Add bpf_list_add to insert node after a given list node
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Add a new kfunc bpf_list_add(head, new, prev, meta, off) that
inserts 'new' after 'prev' in the BPF linked list. 'prev' must
already be in the list identified by 'head', and 'new' must not be
in any list. The new node must be an owning reference (e.g. from
bpf_obj_new); the kfunc consumes that reference and the node becomes
non-owning once inserted.

We have added an additional parameter bpf_list_head *head to
bpf_list_add, as the verifier requires the head parameter to
check whether the lock is being held.

Returns 0 on success, or -EINVAL if 'prev' is not in the list owned by
'head' or 'new' is already in a list (e.g. a duplicate insertion). On
failure, the kernel drops the passed-in 'new' node.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
---
 kernel/bpf/helpers.c  | 11 +++++++++++
 kernel/bpf/verifier.c | 12 +++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1c69476c8a09..89579165ef4d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2577,6 +2577,16 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
 	return bpf_list_push_back(head, node, meta__ign, off);
 }
 
+__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
+			     struct bpf_list_node *prev__nonown_allowed,
+			     struct btf_struct_meta *meta, u64 off)
+{
+	struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed;
+	struct list_head *prev_ptr = &p->list_head;
+
+	return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off);
+}
+
 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
 					    struct list_head *n)
 {
@@ -4756,6 +4766,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
 BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 35eebb5e7769..662ad7312697 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10959,6 +10959,7 @@ enum special_kfunc_type {
 	KF_bpf_list_push_front,
 	KF_bpf_list_push_back_impl,
 	KF_bpf_list_push_back,
+	KF_bpf_list_add,
 	KF_bpf_list_pop_front,
 	KF_bpf_list_pop_back,
 	KF_bpf_list_del,
@@ -11028,6 +11029,7 @@ BTF_ID(func, bpf_list_push_front_impl)
 BTF_ID(func, bpf_list_push_front)
 BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_add)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_list_del)
@@ -11140,7 +11142,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id)
 	return func_id == special_kfunc_list[KF_bpf_list_push_front] ||
 	       func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
 	       func_id == special_kfunc_list[KF_bpf_list_push_back] ||
-	       func_id == special_kfunc_list[KF_bpf_list_push_back_impl];
+	       func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+	       func_id == special_kfunc_list[KF_bpf_list_add];
 }
 
 static bool is_bpf_rbtree_add_kfunc(u32 func_id)
@@ -19524,8 +19527,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		int struct_meta_reg = BPF_REG_3;
 		int node_offset_reg = BPF_REG_4;
 
-		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
-		if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
+		/* list_add/rbtree_add have an extra arg (prev/less),
+		 * so args-to-fixup are in diff regs.
+		 */
+		if (desc->func_id == special_kfunc_list[KF_bpf_list_add] ||
+		    is_bpf_rbtree_add_kfunc(desc->func_id)) {
 			struct_meta_reg = BPF_REG_4;
 			node_offset_reg = BPF_REG_5;
 		}
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 5/8] bpf: refactor __bpf_list_add to take insertion point via **prev_ptr
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Refactor __bpf_list_add to accept (node, head, struct list_head **prev_ptr,
..) instead of (node, head, bool tail, ..). Load prev from *prev_ptr after
INIT_LIST_HEAD(h), so we never dereference an uninitialized h->prev when
head was 0-initialized (e.g. push_back passes &h->prev).

When prev is not the list head, validate that prev is in the list via
its owner.

Prepares for bpf_list_add(head, new, prev, ..) to insert after a given
list node.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
---
 kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 804c201c28f3..1c69476c8a09 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2478,9 +2478,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
 
 static int __bpf_list_add(struct bpf_list_node_kern *node,
 			  struct bpf_list_head *head,
-			  bool tail, struct btf_record *rec, u64 off)
+			  struct list_head **prev_ptr,
+			  struct btf_record *rec, u64 off)
 {
 	struct list_head *n = &node->list_head, *h = (void *)head;
+	struct list_head *prev;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
@@ -2488,19 +2490,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
 
+	prev = *prev_ptr;
+
+	/* When prev is not the list head, it must be a node in this list. */
+	if (prev != h) {
+		struct bpf_list_node_kern *prev_kn =
+			container_of(prev, struct bpf_list_node_kern, list_head);
+
+		if (unlikely(READ_ONCE(prev_kn->owner) != head))
+			goto fail;
+	}
+
 	/* node->owner != NULL implies !list_empty(n), no need to separately
 	 * check the latter
 	 */
-	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
-		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec, false);
-		return -EINVAL;
-	}
+	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
+		goto fail;
 
-	tail ? list_add_tail(n, h) : list_add(n, h);
+	list_add(n, prev);
 	WRITE_ONCE(node->owner, head);
-
 	return 0;
+
+fail:
+	/* Only called from BPF prog, no need to migrate_disable */
+	__bpf_obj_drop_impl((void *)n - off, rec, false);
+	return -EINVAL;
 }
 
 /**
@@ -2521,8 +2535,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
 				    u64 off)
 {
 	struct bpf_list_node_kern *n = (void *)node;
+	struct list_head *h = (void *)head;
 
-	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
@@ -2550,8 +2565,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
 				   u64 off)
 {
 	struct bpf_list_node_kern *n = (void *)node;
+	struct list_head *h = (void *)head;
 
-	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 4/8] bpf: Introduce the bpf_list_del kfunc.
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Allow users to remove any node from a linked list.

We have added an additional parameter bpf_list_head *head to
bpf_list_del, as the verifier requires the head parameter to
check whether the lock is being held.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
---
 kernel/bpf/helpers.c  | 10 ++++++++++
 kernel/bpf/verifier.c |  6 +++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 59855b434f0b..804c201c28f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2601,6 +2601,15 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
 	return __bpf_list_del(head, h->prev);
 }
 
+__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head,
+					       struct bpf_list_node *node__nonown_allowed)
+{
+	struct bpf_list_node_kern *kn = (void *)node__nonown_allowed;
+
+	/* verifier guarantees node is a list node rather than list head */
+	return __bpf_list_del(head, &kn->list_head);
+}
+
 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
 {
 	struct list_head *h = (struct list_head *)head;
@@ -4733,6 +4742,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f3cf8d85bea0..35eebb5e7769 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10961,6 +10961,7 @@ enum special_kfunc_type {
 	KF_bpf_list_push_back,
 	KF_bpf_list_pop_front,
 	KF_bpf_list_pop_back,
+	KF_bpf_list_del,
 	KF_bpf_list_front,
 	KF_bpf_list_back,
 	KF_bpf_cast_to_kern_ctx,
@@ -11029,6 +11030,7 @@ BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_push_back)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_del)
 BTF_ID(func, bpf_list_front)
 BTF_ID(func, bpf_list_back)
 BTF_ID(func, bpf_cast_to_kern_ctx)
@@ -11549,6 +11551,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
 	return is_bpf_list_push_kfunc(btf_id) ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_del] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_front] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_back];
 }
@@ -11671,7 +11674,8 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 
 	switch (node_field_type) {
 	case BPF_LIST_NODE:
-		ret = is_bpf_list_push_kfunc(kfunc_btf_id);
+		ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del];
 		break;
 	case BPF_RB_NODE:
 		ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 3/8] bpf: allow non-owning list-node args via __nonown_allowed
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

KF_ARG_PTR_TO_LIST_NODE normally requires an owning reference
(PTR_TO_BTF_ID | MEM_ALLOC with ref_obj_id). Introduce the
__nonown_allowed annotation on selected list-node arguments so that
non-owning references with ref_obj_id == 0 are accepted as well.

This patch only adds the generic verifier support and documents the
annotation. Later patches in the series apply it to bpf_list_add/del()
and bpf_list_is_first/last(), allowing bpf_list_front/back() results
to be used as the insertion point, deletion target, or query target
for those kfuncs.

The verifier keeps the existing owning-ref checks by default; only a
non-owning reference passed to an argument annotated with
__nonown_allowed bypasses the MEM_ALLOC/ref_obj_id checks, and it then
follows the same list-node validation path.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
---
 Documentation/bpf/kfuncs.rst | 22 ++++++++++++++++++++--
 kernel/bpf/verifier.c        | 13 +++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..3a9db1108b95 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -207,8 +207,26 @@ Here, the buffer may be NULL. If the buffer is not NULL, it must be at least
 buffer__szk bytes in size. The kfunc is responsible for checking if the buffer
 is NULL before using it.
 
-2.3.5 __str Annotation
-----------------------------
+2.3.5 __nonown_allowed Annotation
+---------------------------------
+
+This annotation is used to indicate that the parameter may be a non-owning reference.
+
+An example is given below::
+
+        __bpf_kfunc int bpf_list_add(..., struct bpf_list_node
+                                     *prev__nonown_allowed, ...)
+        {
+                ...
+        }
+
+For the ``prev__nonown_allowed`` parameter (resolved as ``KF_ARG_PTR_TO_LIST_NODE``),
+suffix ``__nonown_allowed`` retains the usual owning-pointer rules and also
+permits a non-owning reference with no ref_obj_id (e.g. the return value of
+bpf_list_front() / bpf_list_back()).
+
+2.3.6 __str Annotation
+----------------------
 This annotation is used to indicate that the argument is a constant string.
 
 An example is given below::
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8dd79b735a69..f3cf8d85bea0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10714,6 +10714,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param
 	return btf_param_match_suffix(btf, arg, "__nullable");
 }
 
+static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg)
+{
+	return btf_param_match_suffix(btf, arg, "__nonown_allowed");
+}
+
 static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
 {
 	return btf_param_match_suffix(btf, arg, "__str");
@@ -12244,6 +12249,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_LIST_NODE:
+			if (is_kfunc_arg_nonown_allowed(btf, &args[i]) &&
+			    type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+				/* Allow bpf_list_front/back return value for
+				 * __nonown_allowed list-node arguments.
+				 */
+				goto check_ok;
+			}
 			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
 				verbose(env, "%s expected pointer to allocated object\n",
 					reg_arg_name(env, argno));
@@ -12253,6 +12265,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
+check_ok:
 			ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 2/8] bpf: clear list node owner and unlink before drop
From: Kaitao Cheng @ 2026-05-21  3:23 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

The issue only becomes exposed once bpf_list_del() is available: callers
can pass an arbitrary bpf_list_head and bpf_list_node pair, including
nodes that are not actually linked to the supplied head, or nodes that
outlive their original head after refcount-based retention.  This was
not practically reachable for callers restricted to pop-style helpers
alone; bpf_list_del() widens the API surface.

A failure mode appears when bpf_list_head_free() runs while a program
still holds an independent refcount on a node (for example via
bpf_refcount_acquire()).  The list head value embedded in map memory can
go away while the node object survives.  If node->owner is left pointing
at the old head address until drop completes, that pointer becomes stale.
If a new bpf_list_head is later allocated at the same address and the
stale node is passed to bpf_list_del(), the owner comparison can succeed
even though the node is not really linked to the new head, and
list_del_init() will follow bogus next/prev pointers with the risk of
memory corruption.

When draining a bpf_list_head, mark each node owner with BPF_PTR_POISON
under the map spinlock while moving it to a private drain list, then
list_del_init() the node and clear owner to NULL before calling
__bpf_obj_drop_impl().  Concurrent readers therefore never observe a
node that appears linked to a head while its list_head is inconsistent,
and surviving refcounted nodes never retain a stale non-NULL owner.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
---
 kernel/bpf/helpers.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 094457c3e6d3..59855b434f0b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
-	struct list_head *head = list_head, *orig_head = list_head;
+	struct list_head *head = list_head, drain, *pos, *n;
 
 	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
 	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+	INIT_LIST_HEAD(&drain);
 
 	/* Do the actual list draining outside the lock to not hold the lock for
 	 * too long, and also prevent deadlocks if tracing programs end up
@@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
 	__bpf_spin_lock_irqsave(spin_lock);
 	if (!head->next || list_empty(head))
 		goto unlock;
-	head = head->next;
+	list_for_each_safe(pos, n, head) {
+		struct bpf_list_node_kern *node;
+
+		node = container_of(pos, struct bpf_list_node_kern, list_head);
+		WRITE_ONCE(node->owner, BPF_PTR_POISON);
+		list_move_tail(pos, &drain);
+	}
 unlock:
-	INIT_LIST_HEAD(orig_head);
+	INIT_LIST_HEAD(head);
 	__bpf_spin_unlock_irqrestore(spin_lock);
 
-	while (head != orig_head) {
-		void *obj = head;
+	while (!list_empty(&drain)) {
+		struct bpf_list_node_kern *node;
 
-		obj -= field->graph_root.node_offset;
-		head = head->next;
+		pos = drain.next;
+		node = container_of(pos, struct bpf_list_node_kern, list_head);
+		list_del_init(pos);
+		/* Ensure __bpf_list_add() sees the node as unlinked. */
+		smp_store_release(&node->owner, NULL);
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+		__bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
+				    field->graph_root.value_rec, false);
 	}
 }
 
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 1/8] bpf: refactor __bpf_list_del to take list node pointer
From: Kaitao Cheng @ 2026-05-21  3:22 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest
In-Reply-To: <20260521032306.97118-1-kaitao.cheng@linux.dev>

From: Kaitao Cheng <chengkaitao@kylinos.cn>

Refactor __bpf_list_del to accept (head, struct list_head *n) instead of
(head, bool tail). The caller now passes the specific node to remove:
bpf_list_pop_front passes h->next, bpf_list_pop_back passes h->prev.

Prepares for introducing bpf_list_del(head, node) kfunc to remove an
arbitrary node when the user holds ownership.

Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
---
 kernel/bpf/helpers.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 07de26e7314c..094457c3e6d3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2550,37 +2550,44 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
 	return bpf_list_push_back(head, node, meta__ign, off);
 }
 
-static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
+					    struct list_head *n)
 {
-	struct list_head *n, *h = (void *)head;
+	struct list_head *h = (void *)head;
 	struct bpf_list_node_kern *node;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
 	 */
-	if (unlikely(!h->next))
+	if (unlikely(!h->next)) {
 		INIT_LIST_HEAD(h);
+		return NULL;
+	}
 	if (list_empty(h))
 		return NULL;
 
-	n = tail ? h->prev : h->next;
 	node = container_of(n, struct bpf_list_node_kern, list_head);
-	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+	if (unlikely(READ_ONCE(node->owner) != head))
 		return NULL;
 
 	list_del_init(n);
-	WRITE_ONCE(node->owner, NULL);
+	/* Ensure __bpf_list_add() sees the node as unlinked. */
+	smp_store_release(&node->owner, NULL);
 	return (struct bpf_list_node *)n;
 }
 
 __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
 {
-	return __bpf_list_del(head, false);
+	struct list_head *h = (void *)head;
+
+	return __bpf_list_del(head, h->next);
 }
 
 __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
 {
-	return __bpf_list_del(head, true);
+	struct list_head *h = (void *)head;
+
+	return __bpf_list_del(head, h->prev);
 }
 
 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH bpf-next v11 0/8] bpf: Extend the bpf_list family of APIs
From: Kaitao Cheng @ 2026-05-21  3:22 UTC (permalink / raw)
  To: ast, corbet, martin.lau, daniel, andrii, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, shuah,
	chengkaitao, skhan, memxor
  Cc: bpf, linux-kernel, linux-doc, vmalik, linux-kselftest,
	Kaitao Cheng

In BPF, a list can currently only be used as a stack or a queue:
because the API set is incomplete, only FIFO or LIFO operations are
supported. These patches enhance the BPF list API, making it more
list-like.

Five new kfuncs have been added:
bpf_list_del: remove a node from the list
bpf_list_add_impl: insert a node after a given list node
bpf_list_is_first: check if a node is the first in the list
bpf_list_is_last: check if a node is the last in the list
bpf_list_empty: check if the list is empty

And add test cases for the aforementioned kfuncs.

Changes in v11:
- Move [PATCH v10 7/8] earlier (Eduard Zingerman)
- Fix the synchronization issue in [PATCH v10 2/8] (Eduard Zingerman,
  Alexei Starovoitov)

Changes in v10:
- Remove the table-driven approach (Ihor Solodrai)
- Use the __nonown_allowed suffix for bpf_list_del/front/back
- Add test cases for __nonown_allowed

Changes in v9:
- Expand table-driven approach coverage (Emil Tsalapatis)
- Clear list node owner and unlink before drop (Emil Tsalapatis)
- Remove warnings caused by WARN_ON_ONCE() (Emil Tsalapatis)
- Introduce the __nonown_allowed suffix (Alexei Starovoitov)

Changes in v8:
- Use [patch v7 5/5] as the start of the patch series (Leon Hwang)
- Introduce double pointer prev_ptr in __bpf_list_del
  (Kumar Kartikeya Dwivedi)
- Extract refactored __bpf_list_del/add into separate patches (Leon Hwang)
- Allow bpf_list_front/back result as the prev argument of bpf_list_add
- Split test cases (Leon Hwang)

Changes in v7:
- Replace bpf_list_node_is_edge with bpf_list_is_first/is_last
- Reimplement __bpf_list_del and __bpf_list_add (Kumar Kartikeya Dwivedi)
- Simplify test cases (Mykyta Yatsenko)

Changes in v6:
- Merge [patch v5 (2,4,6)/6] into [patch v6 4/5] (Leon Hwang)
- If list_head was 0-initialized, init it
- refactor kfunc checks to table-driven approach (Leon Hwang)

Changes in v5:
- Fix bpf_obj leak on bpf_list_add_impl error

Changes in v4:
- [patch v3 1/6] Revert to version v1 (Alexei Starovoitov)
- Change the parameters of bpf_list_add_impl to (head, new, prev, ...)

Changes in v3:
- Add a new lock_rec member to struct bpf_reference_state for lock
  holding detection.
- Add test cases to verify that the verifier correctly restricts calls
  to bpf_list_del when the spin_lock is not held.

Changes in v2:
- Remove the head parameter from bpf_list_del (Alexei Starovoitov)
- Add bpf_list_add/is_first/is_last/empty to API and test cases
  (Alexei Starovoitov)

Link to v10:
https://lore.kernel.org/all/20260512055919.95716-1-kaitao.cheng@linux.dev/

Link to v9:
https://lore.kernel.org/all/20260329140506.9595-1-pilgrimtao@gmail.com/

Link to v8:
https://lore.kernel.org/all/20260316112843.78657-1-pilgrimtao@gmail.com/

Link to v7:
https://lore.kernel.org/all/20260308134614.29711-1-pilgrimtao@gmail.com/

Link to v6:
https://lore.kernel.org/all/20260304143459.78059-1-pilgrimtao@gmail.com/

Link to v5:
https://lore.kernel.org/all/20260304031606.43884-1-pilgrimtao@gmail.com/

Link to v4:
https://lore.kernel.org/all/20260303135219.33726-1-pilgrimtao@gmail.com/

Link to v3:
https://lore.kernel.org/all/20260302124028.82420-1-pilgrimtao@gmail.com/

Link to v2:
https://lore.kernel.org/all/20260225092651.94689-1-pilgrimtao@gmail.com/

Link to v1:
https://lore.kernel.org/all/20260209025250.55750-1-pilgrimtao@gmail.com/

Kaitao Cheng (8):
  bpf: refactor __bpf_list_del to take list node pointer
  bpf: clear list node owner and unlink before drop
  bpf: allow non-owning list-node args via __nonown_allowed
  bpf: Introduce the bpf_list_del kfunc.
  bpf: refactor __bpf_list_add to take insertion point via **prev_ptr
  bpf: Add bpf_list_add to insert node after a given list node
  bpf: add bpf_list_is_first/last/empty kfuncs
  selftests/bpf: Add test cases for
    bpf_list_del/add/is_first/is_last/empty

 Documentation/bpf/kfuncs.rst                  |  22 +-
 kernel/bpf/helpers.c                          | 147 ++++--
 kernel/bpf/verifier.c                         |  44 +-
 .../selftests/bpf/progs/refcounted_kptr.c     | 421 ++++++++++++++++++
 4 files changed, 601 insertions(+), 33 deletions(-)

-- 
2.50.1 (Apple Git-155)


^ permalink raw reply

* [soc:zx/soc 1/1] htmldocs: Documentation/arch/arm/zte/zx297520v3.rst:66: WARNING: Title underline too short.
From: kernel test robot @ 2026-05-21  2:57 UTC (permalink / raw)
  To: Stefan Dösinger 
  Cc: oe-kbuild-all, linux-arm-kernel, arm, Linus Walleij,
	Krzysztof Kozlowski, linux-doc

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git zx/soc
head:   220ae5d36dba278003d265aabd080ffa78553f5a
commit: 220ae5d36dba278003d265aabd080ffa78553f5a [1/1] ARM: zte: Add zx297520v3 platform support
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
docutils: docutils (Docutils 0.21.2, Python 3.13.5, on linux)
reproduce: (https://download.01.org/0day-ci/archive/20260521/202605210401.8D6jRbz8-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605210401.8D6jRbz8-lkp@intel.com/

All warnings (new ones prefixed by >>):

   WARNING: Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:36: abi_sys_class_reboot_mode_driver_reboot_modes doesn't have a description
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/os_mode is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:364; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:234
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/os_mode_index is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:373; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:243
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/touchpad/enabled is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:636; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:252
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/touchpad/enabled_index is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:645; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:261
>> Documentation/arch/arm/zte/zx297520v3.rst:66: WARNING: Title underline too short.
--
   3. Building for built-in U-Boot
   --------------------------- [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:90: WARNING: Enumerated list ends without a blank line; unexpected unindent. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:116: WARNING: Inline literal start-string without end-string. [docutils]
   Documentation/arch/arm/zte/zx297520v3.rst:137: ERROR: Unexpected indentation. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:138: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]
   Documentation/arch/arm/zte/zx297520v3.rst:164: WARNING: Inline literal start-string without end-string. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:164: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:7: WARNING: Document or section may not begin with a transition. [docutils]
   Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils]
   Documentation/core-api/kref:328: ./include/linux/kref.h:72: WARNING: Invalid C declaration: Expected end of definition. [error at 96]
   int kref_put_mutex (struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true# mutex)
   ------------------------------------------------------------------------------------------------^
   Documentation/core-api/kref:328: ./include/linux/kref.h:94: WARNING: Invalid C declaration: Expected end of definition. [error at 92]


vim +66 Documentation/arch/arm/zte/zx297520v3.rst

     6	
   > 7	...............................................................................
     8	
     9	Author:	Stefan Dösinger
    10	
    11	Date  : 27 Jan 2026
    12	
    13	1. Hardware description
    14	---------------------------
    15	Zx297520v3 SoCs use a 64 bit capable Cortex-A53 CPU and GICv3, although they
    16	run in arm32 mode only. The CPU supports EL3, but no hypervisor (EL2) and
    17	it seems to lack VFP and NEON.
    18	
    19	The SoC is used in a number of cheap LTE to WiFi routers, both battery powered
    20	MiFis and stationary CPEs. In addition to the CPU these devices usually have
    21	64 MB Ram (although some is shared with the LTE chip), 128 MB NAND flash, an
    22	SDIO connected RTL8192-type Wifi chip limited to 2.4 ghz operation, USB 2,
    23	and buttons. Devices with as low as 32 MB or as high as 128 MB ram exist, as
    24	do devices with 8 or 16 MB of NOR flash.
    25	
    26	Some devices, especially the stationary ones, have 100 mbit Ethernet and an
    27	Ethernet switch.
    28	
    29	Usually the devices have LEDs for status indication, although some have SPI or
    30	I2C connected displays
    31	
    32	Some have an SD card slot. If it exists, it is a better choice for the root
    33	file system because it easily outperforms the built-in NAND.
    34	
    35	The LTE interface runs on a separate DSP called ZSP880. It is probably derived
    36	from LSI ZSPs and has an undocumented instruction set. The ZSP communicates
    37	with the main CPU via SRAM and DRAM and a mailbox hardware that can generate
    38	IRQs on either ends.
    39	
    40	There is also a Cortex M0 CPU, which is responsible for early HW initialization
    41	and starting the Cortex A53 CPU. It does not have any essential purpose once
    42	U-Boot is started. A SRAM-Based handover protocol exists to run custom code on
    43	this CPU.
    44	
    45	2. Booting via USB
    46	---------------------------
    47	
    48	The Boot ROM has support for booting custom code via USB. This mode can be
    49	entered by connecting a Boot PIN to GND or by modifying the third byte on NAND
    50	(set it to anything other than 0x5A aka 'Z'). A free software tool to start
    51	custom U-Boot and kernels can be found here:
    52	
    53	https://github.com/zx297520v3-mainline/zx297520v3-loader
    54	
    55	If USB download mode is entered but no boot commands are sent through USB, the
    56	device will proceed to boot normally after a few seconds. It is therefore
    57	possible to enable USB boot permanently and still leave the default boot files
    58	in place.
    59	
    60	https://github.com/zx297520v3-mainline/u-boot-mainline
    61	
    62	Contains an U-Boot version that can be used with the USB loader and sets up the
    63	CPU and interrupt controller to comply with Linux's booting requirements.
    64	
    65	3. Building for built-in U-Boot
  > 66	---------------------------
    67	The devices come with an ancient U-Boot that loads legacy uImages from NAND and
    68	boots them without a chance for the user to interrupt. The images are stored in
    69	files ap_cpuap.bin and ap_recovery.bin on a jffs2 partition named imagefs,
    70	usually mtd4. A file named "fotaflag" switches between the two modes.
    71	
    72	In addition to the uImage header, those files have a 384 byte signature header,
    73	which is used for authenticating the images on some devices. Most devices have
    74	this authentication disabled and it is enough to pad the uImage files with 384
    75	zero bytes.
    76	
    77	Builtin U-Boot also poorly sets up the CPU. Read the next section for details
    78	on this. It has no support for loading DTBs, so CONFIG_ARM_APPENDED_DTB is
    79	needed.
    80	
    81	So to build an image that boots from NAND the following steps are necessary:
    82	
    83	1) Patch the assembly code from section 4 into arch/arm/kernel/head.S.
    84	2) make zx29_defconfig
    85	3) make [-j x]
    86	4) cat arch/arm/boot/zImage arch/arm/boot/dts/zte/[device].dtb > kernel+dtb
    87	5) mkimage -A arm -O linux -T kernel -C none -a 0x20008000 -d kernel+dtb uimg
    88	6) dd if=/dev/zero bs=1 count=384 of=ap_recovery.bin
    89	7) cat uimg >> ap_recovery.bin
  > 90	8) Place this file onto imagefs on the device. Delete ap_cpuap.bin if the
    91	free space is not enough.
    92	9) Create the file fotaflag: echo -n FOTA-RECOVERY > fotaflag
    93	
    94	For development, booting ap_recovery.bin is recommended because the normal boot
    95	mode arms the watchdog before starting the kernel.
    96	
    97	4. CPU and GIC Setup
    98	---------------------------
    99	
   100	Generally CPU and GICv3 need to be set up according to the requirements spelled
   101	out in Documentation/arch/arm64/booting.rst. For zx297520v3 this means:
   102	
   103	1. GICD_CTLR.DS=1 to disable GIC security
   104	2. Enable access to ICC_SRE
   105	3. Disable trapping IRQs into monitor mode
   106	4. Configure EL2 and below to run in insecure mode.
   107	5. Configure timer PPIs to active-low.
   108	
   109	The kernel sources provided by ZTE do not boot either (interrupts do not work
   110	at all). They are incomplete in other aspects too, so it is assumed that there
   111	is some workaround similar to the one described in this document somewhere in
   112	the binary blobs.
   113	
   114	The assembly code below is given as an example of how to achieve this:
   115	
 > 116	```
   117	#include <linux/irqchip/arm-gic-v3.h>
   118	#include <asm/assembler.h>
   119	#include <asm/cp15.h>
   120	
   121	@ Detect sane bootloaders and skip the hack
   122	ldr	r3, =0xf2000000
   123	ldr	r3, [r3]
   124	ldr	r4, =(GICD_CTLR_ARE_NS | GICD_CTLR_DS)
   125	cmp	r3, r4
   126	beq	skip_zx_hack
   127	@ This allows EL1 to handle ints that are normally handled by EL2/3.
   128	ldr	r3, =0xf2000000
   129	str     r4, [r3]
   130	
   131	cps     #MON_MODE
   132	
   133	@ Work in non-secure physical address space: SCR_EL3.NS = 1. At least the UART
   134	@ seems to respond only to non-secure addresses. I have taken inspiration from
   135	@ Raspberry pi's armstub7.S here.
   136	mov	r3, #0x131			@ non-secure, Make F, A bits in CPSR writeable
   137						@ Allow hypervisor call.
 > 138	mcr     p15, 0, r3, c1, c1, 0
   139	
   140	@ AP_PPI_MODE_REG: Configure timer PPIs (10, 11, 13, 14) to active-low.
   141	ldr	r3, =0xF22020a8
   142	ldr	r4, =0x50
   143	str	r4, [r3]
   144	ldr	r3, =0xF22020ac
   145	ldr	r4, =0x14
   146	str	r4, [r3]
   147	
   148	@ Enable EL2 access to ICC_SRE (bit 3, ICC_SRE_EL3.Enable). Enable system reg
   149	@ access to GICv3 registers (bit 0, ICC_SRE_EL3.SRE) for EL1 and EL3.
   150	mrc	p15, 6, r3, c12, c12, 5         @ ICC_SRE_EL3
   151	orr	r3, #0x9                        @ FIXME: No defines for SRE_EL3 values?
   152	mcr	p15, 6, r3, c12, c12, 5
   153	mrc	p15, 0, r3, c12, c12, 5         @ ICC_SRE_EL1
   154	orr	r3, #(ICC_SRE_EL1_SRE)
   155	mcr	p15, 0, r3, c12, c12, 5
   156	
   157	@ Like ICC_SRE_EL3, enable EL1 access to ICC_SRE and system register access
   158	@ for EL2.
   159	mrc	p15, 4, r3, c12, c9, 5          @ ICC_SRE_EL2 aka ICC_HSRE
   160	orr	r3, r3, #(ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE)
   161	mcr	p15, 4, r3, c12, c9, 5
   162	isb
   163	
 > 164	@ Back to SVC mode

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Wei Yang @ 2026-05-21  2:46 UTC (permalink / raw)
  To: Vernon Yang
  Cc: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <8f9834db-8981-4eb1-ae46-94908943da3d@gmail.com>

On Thu, May 21, 2026 at 10:36:15AM +0800, Vernon Yang wrote:
>On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
>> Enable khugepaged to collapse to mTHP orders. This patch implements the
>> main scanning logic using a bitmap to track occupied pages and a stack
>> structure that allows us to find optimal collapse sizes.
>>
>> Previous to this patch, PMD collapse had 3 main phases, a light weight
>> scanning phase (mmap_read_lock) that determines a potential PMD
>> collapse, an alloc phase (mmap unlocked), then finally heavier collapse
>> phase (mmap_write_lock).
>>
>> To enable mTHP collapse we make the following changes:
>>
>> During PMD scan phase, track occupied pages in a bitmap. When mTHP
>> orders are enabled, we remove the restriction of max_ptes_none during the
>> scan phase to avoid missing potential mTHP collapse candidates. Once we
>> have scanned the full PMD range and updated the bitmap to track occupied
>> pages, we use the bitmap to find the optimal mTHP size.
>>
>> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
>> and determine the best eligible order for the collapse. A stack structure
>> is used instead of traditional recursion to manage the search. This also
>> prevents a traditional recursive approach when the kernel stack struct is
>> limited. The algorithm recursively splits the bitmap into smaller chunks to
>> find the highest order mTHPs that satisfy the collapse criteria. We start
>> by attempting the PMD order, then move on to consecutively lower orders
>> (mTHP collapse). The stack maintains a pair of variables (offset, order),
>> indicating the number of PTEs from the start of the PMD, and the order of
>> the potential collapse candidate.
>>
>> The algorithm for consuming the bitmap works as such:
>>     1) push (0, HPAGE_PMD_ORDER) onto the stack
>>     2) pop the stack
>>     3) check if the number of set bits in that (offset,order) pair
>>        satisfies the max_ptes_none threshold for that order
>>     4) if yes, attempt collapse
>>     5) if no (or collapse fails), push two new stack items representing
>>        the left and right halves of the current bitmap range, at the
>>        next lower order
>>     6) repeat at step (2) until stack is empty.
>>
>> Below is a diagram representing the algorithm and stack items:
>>
>>                             offset   mid_offset
>>                             |        |
>>                             |        |
>>                             v        v
>>           ____________________________________
>>          |          PTE Page Table            |
>>          --------------------------------------
>> 			    <-------><------->
>>                              order-1  order-1
>>
>> mTHP collapses reject regions containing swapped out or shared pages.
>> This is because adding new entries can lead to new none pages, and these
>> may lead to constant promotion into a higher order mTHP. A similar
>> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
>> introducing at least 2x the number of pages, and on a future scan will
>> satisfy the promotion condition once again. This issue is prevented via
>> the collapse_max_ptes_none() function which imposes the max_ptes_none
>> restrictions above.
>>
>> We currently only support mTHP collapse for max_ptes_none values of 0
>> and HPAGE_PMD_NR - 1, resulting in the following behavior:
>>
>>     - max_ptes_none=0: Never introduce new empty pages during collapse
>>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>>       available mTHP order
>>
>> Any other max_ptes_none value will emit a warning and skip mTHP collapse
>> attempts. There should be no behavior change for PMD collapse.
>>
>> Once we determine which mTHP sizes fit best in that PMD range, a collapse
>> is attempted. A minimum collapse order of 2 is used as this is the lowest
>> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>>
>> Currently madv_collapse is not supported and will only attempt PMD
>> collapse.
>>
>> We can also remove the check for is_khugepaged inside the PMD scan as
>> the collapse_max_ptes_none() function handles this logic now.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>>  mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 174 insertions(+), 8 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 3492b135d667..39bf7ea8a6e8 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>>
>>  static struct kmem_cache *mm_slot_cache __ro_after_init;
>>
>> +#define KHUGEPAGED_MIN_MTHP_ORDER	2
>> +/*
>> + * mthp_collapse() does an iterative DFS over a binary tree, from
>> + * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
>> + * size needed for a DFS on a binary tree is height + 1, where
>> + * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
>> + *
>> + * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
>> + * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
>> + */
>> +#define MTHP_STACK_SIZE	(ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
>> +
>> +/*
>> + * Defines a range of PTE entries in a PTE page table which are being
>> + * considered for mTHP collapse.
>> + *
>> + * @offset: the offset of the first PTE entry in a PMD range.
>> + * @order: the order of the PTE entries being considered for collapse.
>> + */
>> +struct mthp_range {
>> +	u16 offset;
>> +	u8 order;
>> +};
>> +
>>  struct collapse_control {
>>  	bool is_khugepaged;
>>
>> @@ -111,6 +135,12 @@ struct collapse_control {
>>
>>  	/* nodemask for allocation fallback */
>>  	nodemask_t alloc_nmask;
>> +
>> +	/* Each bit represents a single occupied (!none/zero) page. */
>> +	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
>> +	/* A mask of the current range being considered for mTHP collapse. */
>> +	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> +	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
>>  };
>>
>>  /**
>> @@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>>  	return result;
>>  }
>>
>> +static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
>> +				     u16 offset, u8 order)
>> +{
>> +	const int size = *stack_size;
>> +	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
>> +
>> +	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
>> +	stack->order = order;
>> +	stack->offset = offset;
>> +	(*stack_size)++;
>> +}
>> +
>> +static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
>> +						 int *stack_size)
>> +{
>> +	const int size = *stack_size;
>> +
>> +	VM_WARN_ON_ONCE(size <= 0);
>> +	(*stack_size)--;
>> +	return cc->mthp_bitmap_stack[size - 1];
>> +}
>> +
>> +static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
>> +						u16 offset, unsigned int nr_ptes)
>> +{
>> +	bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> +	bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
>> +	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
>> +}
>> +
>> +/*
>> + * mthp_collapse() consumes the bitmap that is generated during
>> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
>> + *
>> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
>> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
>> + * of the bitmap for collapse eligibility. The stack maintains a pair of
>> + * variables (offset, order), indicating the number of PTEs from the start of
>> + * the PMD, and the order of the potential collapse candidate respectively. We
>> + * start at the PMD order and check if it is eligible for collapse; if not, we
>> + * add two entries to the stack at a lower order to represent the left and right
>> + * halves of the PTE page table we are examining.
>> + *
>> + *                         offset       mid_offset
>> + *                         |         |
>> + *                         |         |
>> + *                         v         v
>> + *      --------------------------------------
>> + *      |          cc->mthp_bitmap            |
>> + *      --------------------------------------
>> + *                         <-------><------->
>> + *                          order-1  order-1
>> + *
>> + * For each of these, we determine how many PTE entries are occupied in the
>> + * range of PTE entries we propose to collapse, then we compare this to a
>> + * threshold number of PTE entries which would need to be occupied for a
>> + * collapse to be permitted at that order (accounting for max_ptes_none).
>> + *
>> + * If a collapse is permitted, we attempt to collapse the PTE range into a
>> + * mTHP.
>> + */
>> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>> +		int referenced, int unmapped, struct collapse_control *cc,
>> +		unsigned long enabled_orders)
>> +{
>> +	unsigned int nr_occupied_ptes, nr_ptes;
>> +	int max_ptes_none, collapsed = 0, stack_size = 0;
>> +	unsigned long collapse_address;
>> +	struct mthp_range range;
>> +	u16 offset;
>> +	u8 order;
>> +
>> +	collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
>> +
>> +	while (stack_size) {
>> +		range = collapse_mthp_stack_pop(cc, &stack_size);
>> +		order = range.order;
>> +		offset = range.offset;
>> +		nr_ptes = 1UL << order;
>> +
>> +		if (!test_bit(order, &enabled_orders))
>> +			goto next_order;
>> +
>> +		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
>> +
>> +		if (max_ptes_none < 0)
>> +			return collapsed;
>> +
>> +		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
>> +							       nr_ptes);
>> +
>> +		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
>> +			int ret;
>> +
>> +			collapse_address = address + offset * PAGE_SIZE;
>> +			ret = collapse_huge_page(mm, collapse_address, referenced,
>> +						 unmapped, cc, order);
>> +			if (ret == SCAN_SUCCEED) {
>> +				collapsed += nr_ptes;
>> +				continue;
>> +			}
>> +		}
>> +
>> +next_order:
>> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>
>Hi Nico, thank you very much for your contributions to this series.
>
>I found a minor issue: for MADV_COLLAPSE, if collapse_huge_page() fails
>for some reason (e.g. folio allocation fails), it goes to next_order and
>continues splitting to the next smaller order. However, enabled_orders
>only contains HPAGE_PMD_ORDER, so it keeps running the split operations
>without doing any effective work until KHUGEPAGED_MIN_MTHP_ORDER is
>reached before exiting. For khugepaged, e.g. when only 2MB is set to
>always, the same phenomenon occurs.

Yes, but it does no actual work, since the order is checked right after the pop.

>
>This does not affect the overall functionality of mthp collapse, just
>redundant.
>
>The redundant operations can be easily skipped with the following
>modification. If I missed something, please let me know. Thanks!
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 1a25af3d6d0f..fa407cce525c 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -1574,7 +1574,7 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> 		}
>
> next_order:
>-		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>+		if ((BIT(order) - 1) & enabled_orders) {
> 			const u8 next_order = order - 1;
> 			const u16 mid_offset = offset + (nr_ptes / 2);
>

This would stop the iteration if there are other lower enabled orders, right?

>Cheers,
>Vernon

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Vernon Yang @ 2026-05-21  2:36 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260511185817.686831-12-npache@redhat.com>

On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
> Enable khugepaged to collapse to mTHP orders. This patch implements the
> main scanning logic using a bitmap to track occupied pages and a stack
> structure that allows us to find optimal collapse sizes.
>
> Previous to this patch, PMD collapse had 3 main phases, a light weight
> scanning phase (mmap_read_lock) that determines a potential PMD
> collapse, an alloc phase (mmap unlocked), then finally heavier collapse
> phase (mmap_write_lock).
>
> To enable mTHP collapse we make the following changes:
>
> During PMD scan phase, track occupied pages in a bitmap. When mTHP
> orders are enabled, we remove the restriction of max_ptes_none during the
> scan phase to avoid missing potential mTHP collapse candidates. Once we
> have scanned the full PMD range and updated the bitmap to track occupied
> pages, we use the bitmap to find the optimal mTHP size.
>
> Implement mthp_collapse() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack structure
> is used instead of traditional recursion to manage the search. This also
> avoids a traditional recursive approach, since the kernel stack is
> limited. The algorithm recursively splits the bitmap into smaller chunks to
> find the highest order mTHPs that satisfy the collapse criteria. We start
> by attempting the PMD order, then move on to consecutively lower orders
> (mTHP collapse). The stack maintains a pair of variables (offset, order),
> indicating the number of PTEs from the start of the PMD, and the order of
> the potential collapse candidate.
>
> The algorithm for consuming the bitmap works as such:
>     1) push (0, HPAGE_PMD_ORDER) onto the stack
>     2) pop the stack
>     3) check if the number of set bits in that (offset,order) pair
>        satisfies the max_ptes_none threshold for that order
>     4) if yes, attempt collapse
>     5) if no (or collapse fails), push two new stack items representing
>        the left and right halves of the current bitmap range, at the
>        next lower order
>     6) repeat at step (2) until stack is empty.
>
> Below is a diagram representing the algorithm and stack items:
>
>                             offset   mid_offset
>                             |        |
>                             |        |
>                             v        v
>           ____________________________________
>          |          PTE Page Table            |
>          --------------------------------------
> 			    <-------><------->
>                              order-1  order-1
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order mTHP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_max_ptes_none() function which imposes the max_ptes_none
> restrictions above.
>
> We currently only support mTHP collapse for max_ptes_none values of 0
> and HPAGE_PMD_NR - 1, resulting in the following behavior:
>
>     - max_ptes_none=0: Never introduce new empty pages during collapse
>     - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>       available mTHP order
>
> Any other max_ptes_none value will emit a warning and skip mTHP collapse
> attempts. There should be no behavior change for PMD collapse.
>
> Once we determine which mTHP sizes fit best in that PMD range, a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 174 insertions(+), 8 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 3492b135d667..39bf7ea8a6e8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
>  static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +#define KHUGEPAGED_MIN_MTHP_ORDER	2
> +/*
> + * mthp_collapse() does an iterative DFS over a binary tree, from
> + * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
> + * size needed for a DFS on a binary tree is height + 1, where
> + * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
> + *
> + * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
> + * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
> + */
> +#define MTHP_STACK_SIZE	(ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
> +
> +/*
> + * Defines a range of PTE entries in a PTE page table which are being
> + * considered for mTHP collapse.
> + *
> + * @offset: the offset of the first PTE entry in a PMD range.
> + * @order: the order of the PTE entries being considered for collapse.
> + */
> +struct mthp_range {
> +	u16 offset;
> +	u8 order;
> +};
> +
>  struct collapse_control {
>  	bool is_khugepaged;
>
> @@ -111,6 +135,12 @@ struct collapse_control {
>
>  	/* nodemask for allocation fallback */
>  	nodemask_t alloc_nmask;
> +
> +	/* Each bit represents a single occupied (!none/zero) page. */
> +	DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
> +	/* A mask of the current range being considered for mTHP collapse. */
> +	DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> +	struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
>  };
>
>  /**
> @@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
>  	return result;
>  }
>
> +static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
> +				     u16 offset, u8 order)
> +{
> +	const int size = *stack_size;
> +	struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
> +
> +	VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
> +	stack->order = order;
> +	stack->offset = offset;
> +	(*stack_size)++;
> +}
> +
> +static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
> +						 int *stack_size)
> +{
> +	const int size = *stack_size;
> +
> +	VM_WARN_ON_ONCE(size <= 0);
> +	(*stack_size)--;
> +	return cc->mthp_bitmap_stack[size - 1];
> +}
> +
> +static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
> +						u16 offset, unsigned int nr_ptes)
> +{
> +	bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> +	bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
> +	return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> +}
> +
> +/*
> + * mthp_collapse() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. The stack maintains a pair of
> + * variables (offset, order), indicating the number of PTEs from the start of
> + * the PMD, and the order of the potential collapse candidate respectively. We
> + * start at the PMD order and check if it is eligible for collapse; if not, we
> + * add two entries to the stack at a lower order to represent the left and right
> + * halves of the PTE page table we are examining.
> + *
> + *                         offset       mid_offset
> + *                         |         |
> + *                         |         |
> + *                         v         v
> + *      --------------------------------------
> + *      |          cc->mthp_bitmap            |
> + *      --------------------------------------
> + *                         <-------><------->
> + *                          order-1  order-1
> + *
> + * For each of these, we determine how many PTE entries are occupied in the
> + * range of PTE entries we propose to collapse, then we compare this to a
> + * threshold number of PTE entries which would need to be occupied for a
> + * collapse to be permitted at that order (accounting for max_ptes_none).
> + *
> + * If a collapse is permitted, we attempt to collapse the PTE range into a
> + * mTHP.
> + */
> +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		unsigned long enabled_orders)
> +{
> +	unsigned int nr_occupied_ptes, nr_ptes;
> +	int max_ptes_none, collapsed = 0, stack_size = 0;
> +	unsigned long collapse_address;
> +	struct mthp_range range;
> +	u16 offset;
> +	u8 order;
> +
> +	collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
> +
> +	while (stack_size) {
> +		range = collapse_mthp_stack_pop(cc, &stack_size);
> +		order = range.order;
> +		offset = range.offset;
> +		nr_ptes = 1UL << order;
> +
> +		if (!test_bit(order, &enabled_orders))
> +			goto next_order;
> +
> +		max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
> +
> +		if (max_ptes_none < 0)
> +			return collapsed;
> +
> +		nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
> +							       nr_ptes);
> +
> +		if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
> +			int ret;
> +
> +			collapse_address = address + offset * PAGE_SIZE;
> +			ret = collapse_huge_page(mm, collapse_address, referenced,
> +						 unmapped, cc, order);
> +			if (ret == SCAN_SUCCEED) {
> +				collapsed += nr_ptes;
> +				continue;
> +			}
> +		}
> +
> +next_order:
> +		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {

Hi Nico, thank you very much for your contributions to this series.

I found a minor issue: for MADV_COLLAPSE, if collapse_huge_page() fails
for some reason (e.g. folio allocation fails), it goes to next_order and
continues splitting to the next smaller order. However, enabled_orders
only contains HPAGE_PMD_ORDER, so it keeps running the split operations
without doing any effective work until KHUGEPAGED_MIN_MTHP_ORDER is
reached before exiting. For khugepaged, e.g. when only 2MB is set to
always, the same phenomenon occurs.

This does not affect the overall functionality of mthp collapse, just
redundant.

The redundant operations can be easily skipped with the following
modification. If I missed something, please let me know. Thanks!

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1a25af3d6d0f..fa407cce525c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1574,7 +1574,7 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
 		}

 next_order:
-		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
+		if ((BIT(order) - 1) & enabled_orders) {
 			const u8 next_order = order - 1;
 			const u16 mid_offset = offset + (nr_ptes / 2);

--
Cheers,
Vernon

> +			const u8 next_order = order - 1;
> +			const u16 mid_offset = offset + (nr_ptes / 2);
> +
> +			collapse_mthp_stack_push(cc, &stack_size, mid_offset,
> +						 next_order);
> +			collapse_mthp_stack_push(cc, &stack_size, offset,
> +						 next_order);
> +		}
> +	}
> +	return collapsed;
> +}
> +
>  static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		struct vm_area_struct *vma, unsigned long start_addr,
>  		bool *lock_dropped, struct collapse_control *cc)
>  {
> -	const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> +	int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
>  	const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
>  	const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
> +	enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>  	pmd_t *pmd;
> -	pte_t *pte, *_pte;
> -	int none_or_zero = 0, shared = 0, referenced = 0;
> +	pte_t *pte, *_pte, pteval;
> +	int i;
> +	int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
>  	struct page *page = NULL;
>  	struct folio *folio = NULL;
>  	unsigned long addr;
> +	unsigned long enabled_orders;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
>
> @@ -1429,8 +1579,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> +	bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
> +
> +	/*
> +	 * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> +	 * scan all pages to populate the bitmap for mTHP collapse.
> +	 */
> +	if (enabled_orders != BIT(HPAGE_PMD_ORDER))
> +		max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
> +
>  	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>  	if (!pte) {
>  		cc->progress++;
> @@ -1438,11 +1599,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  		goto out;
>  	}
>
> -	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> -	     _pte++, addr += PAGE_SIZE) {
> +	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +		_pte = pte + i;
> +		addr = start_addr + i * PAGE_SIZE;
> +		pteval = ptep_get(_pte);
> +
>  		cc->progress++;
>
> -		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
>  			if (++none_or_zero > max_ptes_none) {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -1522,6 +1685,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  			}
>  		}
>
> +		/* Set bit for occupied pages */
> +		__set_bit(i, cc->mthp_bitmap);
>  		/*
>  		 * Record which node the original page is from and save this
>  		 * information to cc->node_load[].
> @@ -1580,10 +1745,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  	if (result == SCAN_SUCCEED) {
>  		/* collapse_huge_page expects the lock to be dropped before calling */
>  		mmap_read_unlock(mm);
> -		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc, HPAGE_PMD_ORDER);
> +		nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
> +					      cc, enabled_orders);
>  		/* collapse_huge_page will return with the mmap_lock released */
>  		*lock_dropped = true;
> +		result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
>  	}
>  out:
>  	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.54.0
>
>

^ permalink raw reply related

* Re: [PATCH v8 0/8] KVM: x86: nSVM: Improve PAT virtualization
From: Yosry Ahmed @ 2026-05-21  2:36 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	kvm, linux-doc, linux-kernel, Jim Mattson
In-Reply-To: <ag4ZwD53B7a0ivgT@google.com>

On Wed, May 20, 2026 at 08:30:31PM +0000, Yosry Ahmed wrote:
> On Mon, May 18, 2026 at 05:41:06PM -0700, Sean Christopherson wrote:
> > On Tue, 07 Apr 2026 12:03:23 -0700, Jim Mattson wrote:
> > > Currently, KVM's implementation of nested SVM treats the PAT MSR the same
> > > way whether or not nested NPT is enabled: L1 and L2 share a single
> > > PAT. However, the AMD APM specifies that when nested NPT is enabled, the host
> > > (L1) and the guest (L2) should have independent PATs: hPAT for L1 and gPAT
> > > for L2.
> > > 
> > > This patch series implements independent PATs for L1 and L2 when nested NPT
> > > is enabled, but only when a new quirk, KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT,
> > > is disabled. By default, the quirk is enabled, preserving KVM's legacy
> > > behavior. When the quirk is disabled, KVM correctly virtualizes a separate
> > > PAT register for L2, using the g_pat field in the VMCB.
> > > 
> > > [...]
> > 
> > Applied to kvm-x86 svm.  Yosry and/or Jim, please double check the result, the
> > goof with patch 5 was slightly more annoying than I was expecting.
> 
> The result looks good to me. I also ran the selftest from v7 and it
> passes. I couldn't help myself from reworking it and cleaning it up, I
> will send a patch your way soon.

As promised, if you want something you can directly run against the
current kvm-x86 svm:
https://lore.kernel.org/kvm/20260521023448.3826878-1-yosry@kernel.org/

^ permalink raw reply

* [PATCH v2] kconfig: add optional warnings for changed input values
From: Pengpeng Hou @ 2026-05-21  2:28 UTC (permalink / raw)
  To: Nathan Chancellor, Nicolas Schier, Masahiro Yamada
  Cc: Jonathan Corbet, Shuah Khan, linux-kbuild, linux-doc,
	linux-kernel, Pengpeng Hou

When reading .config input, Kconfig stores user-provided values first and
then resolves the final value after applying dependencies, ranges, and
other constraints.

If the final value differs from the user's input, Kconfig already tracks
that state internally, but it does not provide any focused diagnostic to
show which explicit inputs were adjusted. This is particularly confusing
for requested values that get forced down by unmet dependencies or
clamped by ranges.

Add an opt-in diagnostic controlled by KCONFIG_WARN_CHANGED_INPUT. Emit
the warnings from conf_write() and conf_write_defconfig() after value
resolution and through the existing message callback path so the default
behavior stays unchanged and interactive frontends remain usable.

Avoid the conf_message() formatting buffer for this diagnostic so long
warning lists are not truncated before reaching the callback, and mark
processed symbols as written before the SYMBOL_WRITE check so duplicate
menu nodes cannot emit duplicate warnings.

Document the new environment variable and add tests for both olddefconfig
and savedefconfig.

Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
---
Changes since v1: https://lore.kernel.org/all/20260406233001.1-kconfig-warn-changed-input-pengpeng@iscas.ac.cn/
- rename "found" to "changed_input_found" as suggested by Nicolas
- avoid the conf_message() 4096-byte formatting buffer so long warning
  lists are not truncated before the callback sees them
- mark each processed symbol as SYMBOL_WRITTEN before checking
  SYMBOL_WRITE to avoid duplicate warnings for duplicate menu nodes
- add duplicate-definition selftest coverage
- do not carry the Reviewed-by/Tested-by tags because v2 changes warning
  emission and duplicate suppression

 Documentation/kbuild/kconfig.rst              |   5 +
 scripts/kconfig/confdata.c                    | 107 +++++++++++++++++-
 .../kconfig/tests/warn_changed_input/Kconfig  |  40 +++++++
 .../tests/warn_changed_input/__init__.py      |  27 +++++
 .../kconfig/tests/warn_changed_input/config   |   3 +
 .../tests/warn_changed_input/expected_config  |   6 +
 .../warn_changed_input/expected_defconfig     |   1 +
 .../tests/warn_changed_input/expected_stdout  |   4 +
 8 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 scripts/kconfig/tests/warn_changed_input/Kconfig
 create mode 100644 scripts/kconfig/tests/warn_changed_input/__init__.py
 create mode 100644 scripts/kconfig/tests/warn_changed_input/config
 create mode 100644 scripts/kconfig/tests/warn_changed_input/expected_config
 create mode 100644 scripts/kconfig/tests/warn_changed_input/expected_defconfig
 create mode 100644 scripts/kconfig/tests/warn_changed_input/expected_stdout

diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst
index d213c4f599a4..e35dd1d5f9d3 100644
--- a/Documentation/kbuild/kconfig.rst
+++ b/Documentation/kbuild/kconfig.rst
@@ -59,6 +59,11 @@ Environment variables for ``*config``:
     This environment variable makes Kconfig warn about all unrecognized
     symbols in the config input.
 
+``KCONFIG_WARN_CHANGED_INPUT``
+    If set to a non-blank value, Kconfig prints optional warnings for
+    user-provided values that change after Kconfig resolves dependencies
+    or applies other constraints such as ranges.
+
 ``KCONFIG_WERROR``
     If set, Kconfig treats warnings as errors.
 
diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index 9599a0408862..1fe6d3644e79 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -206,6 +206,79 @@ static void conf_message(const char *fmt, ...)
 	va_end(ap);
 }
 
+static void conf_message_raw(const char *s)
+{
+	if (conf_message_callback)
+		conf_message_callback(s);
+}
+
+static bool conf_warn_changed_input_enabled(void)
+{
+	const char *env = getenv("KCONFIG_WARN_CHANGED_INPUT");
+
+	return env && *env;
+}
+
+static const char *sym_get_user_value_string(struct symbol *sym)
+{
+	switch (sym->type) {
+	case S_BOOLEAN:
+	case S_TRISTATE:
+		switch (sym->def[S_DEF_USER].tri) {
+		case yes:
+			return "y";
+		case mod:
+			return "m";
+		default:
+			return "n";
+		}
+	default:
+		return sym->def[S_DEF_USER].val ?: "";
+	}
+}
+
+static bool sym_user_value_changed(struct symbol *sym)
+{
+	if (!sym_has_value(sym) || sym->type == S_UNKNOWN)
+		return false;
+
+	switch (sym->type) {
+	case S_BOOLEAN:
+	case S_TRISTATE:
+		return sym->def[S_DEF_USER].tri != sym_get_tristate_value(sym);
+	default:
+		return strcmp(sym_get_user_value_string(sym),
+			      sym_get_string_value(sym));
+	}
+}
+
+static void conf_clear_written_flags(void)
+{
+	struct symbol *sym;
+
+	for_all_symbols(sym)
+		sym->flags &= ~SYMBOL_WRITTEN;
+}
+
+static void conf_append_changed_input_warning(struct gstr *gs,
+					      struct symbol *sym,
+					      bool *changed_input_found)
+{
+	if (!sym_user_value_changed(sym))
+		return;
+
+	if (!*changed_input_found) {
+		str_printf(gs,
+			   "warning: user-provided values changed by Kconfig:\n");
+		*changed_input_found = true;
+	}
+
+	str_printf(gs, "  %s%s: %s -> %s\n",
+		   CONFIG_, sym->name,
+		   sym_get_user_value_string(sym),
+		   sym_get_string_value(sym));
+}
+
 const char *conf_get_configname(void)
 {
 	char *name = getenv("KCONFIG_CONFIG");
@@ -759,11 +832,15 @@ int conf_write_defconfig(const char *filename)
 {
 	struct symbol *sym;
 	struct menu *menu;
+	struct gstr gs;
 	FILE *out;
+	bool warn_changed_input = conf_warn_changed_input_enabled();
+	bool changed_input_found = false;
 
 	out = fopen(filename, "w");
 	if (!out)
 		return 1;
+	gs = str_new();
 
 	sym_clear_all_valid();
 
@@ -772,10 +849,14 @@ int conf_write_defconfig(const char *filename)
 
 		sym = menu->sym;
 
-		if (!sym || sym_is_choice(sym))
+		if (!sym || sym_is_choice(sym) || sym->flags & SYMBOL_WRITTEN)
 			continue;
 
 		sym_calc_value(sym);
+		if (warn_changed_input)
+			conf_append_changed_input_warning(&gs, sym,
+							  &changed_input_found);
+		sym->flags |= SYMBOL_WRITTEN;
 		if (!(sym->flags & SYMBOL_WRITE))
 			continue;
 		sym->flags &= ~SYMBOL_WRITE;
@@ -798,6 +879,13 @@ int conf_write_defconfig(const char *filename)
 		print_symbol_for_dotconfig(out, sym);
 	}
 	fclose(out);
+
+	conf_clear_written_flags();
+
+	if (changed_input_found)
+		conf_message_raw(str_get(&gs));
+
+	str_free(&gs);
 	return 0;
 }
 
@@ -809,7 +897,10 @@ int conf_write(const char *name)
 	const char *str;
 	char tmpname[PATH_MAX + 1], oldname[PATH_MAX + 1];
 	char *env;
+	struct gstr gs;
 	bool need_newline = false;
+	bool warn_changed_input = conf_warn_changed_input_enabled();
+	bool changed_input_found = false;
 
 	if (!name)
 		name = conf_get_configname();
@@ -838,6 +929,7 @@ int conf_write(const char *name)
 	}
 	if (!out)
 		return 1;
+	gs = str_new();
 
 	conf_write_heading(out, &comment_style_pound);
 
@@ -859,13 +951,16 @@ int conf_write(const char *name)
 		} else if (!sym_is_choice(sym) &&
 			   !(sym->flags & SYMBOL_WRITTEN)) {
 			sym_calc_value(sym);
+			if (warn_changed_input)
+				conf_append_changed_input_warning(&gs, sym,
+								  &changed_input_found);
+			sym->flags |= SYMBOL_WRITTEN;
 			if (!(sym->flags & SYMBOL_WRITE))
 				goto next;
 			if (need_newline) {
 				fprintf(out, "\n");
 				need_newline = false;
 			}
-			sym->flags |= SYMBOL_WRITTEN;
 			print_symbol_for_dotconfig(out, sym);
 		}
 
@@ -892,8 +987,12 @@ int conf_write(const char *name)
 	}
 	fclose(out);
 
-	for_all_symbols(sym)
-		sym->flags &= ~SYMBOL_WRITTEN;
+	conf_clear_written_flags();
+
+	if (changed_input_found)
+		conf_message_raw(str_get(&gs));
+
+	str_free(&gs);
 
 	if (*tmpname) {
 		if (is_same(name, tmpname)) {
diff --git a/scripts/kconfig/tests/warn_changed_input/Kconfig b/scripts/kconfig/tests/warn_changed_input/Kconfig
new file mode 100644
index 000000000000..69845e2f3fb3
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/Kconfig
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DEP
+	bool "DEP"
+	help
+	  Test dependency symbol for Kconfig warning coverage.
+	  This is used by the warn_changed_input selftest.
+	  It intentionally stays unset in the input fragment.
+	  The test checks how dependent user input is adjusted.
+
+config A
+	bool "A"
+	depends on DEP
+	help
+	  Test bool symbol for changed-input diagnostics.
+	  The input fragment requests this symbol as built-in.
+	  The unmet dependency on DEP forces the final value to n.
+	  The warning should report that downgrade.
+
+config NUM
+	int "NUM"
+	range 10 20
+	help
+	  Test integer symbol for changed-input diagnostics.
+	  The input fragment requests a value outside the allowed range.
+	  Kconfig resolves it to the constrained in-range value.
+	  The warning should report that adjustment.
+
+config DUP
+	bool "DUP"
+	depends on DEP
+	help
+	  Test duplicate-definition handling for changed-input diagnostics.
+	  The input fragment requests this symbol as built-in.
+	  The duplicate definition below must not produce a duplicate warning.
+	  This keeps the warning output stable for repeated menu entries.
+
+config DUP
+	bool
+	depends on DEP
diff --git a/scripts/kconfig/tests/warn_changed_input/__init__.py b/scripts/kconfig/tests/warn_changed_input/__init__.py
new file mode 100644
index 000000000000..5a2b68fb1033
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/__init__.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+"""
+Test optional warnings for user-provided values changed by Kconfig.
+
+Warnings should stay disabled by default, and should only appear when
+KCONFIG_WARN_CHANGED_INPUT is enabled.
+"""
+
+
+def test(conf):
+    assert conf.olddefconfig('config') == 0
+    assert 'user-provided values changed by Kconfig' not in conf.stdout
+
+    assert conf._run_conf('--olddefconfig', dot_config='config',
+                          extra_env={
+                              'KCONFIG_WARN_CHANGED_INPUT': '1',
+                          }) == 0
+    assert conf.stdout_contains('expected_stdout')
+    assert conf.config_matches('expected_config')
+
+    assert conf._run_conf('--savedefconfig=defconfig', dot_config='config',
+                          out_file='defconfig',
+                          extra_env={
+                              'KCONFIG_WARN_CHANGED_INPUT': '1',
+                          }) == 0
+    assert conf.stdout_contains('expected_stdout')
+    assert conf.config_matches('expected_defconfig')
diff --git a/scripts/kconfig/tests/warn_changed_input/config b/scripts/kconfig/tests/warn_changed_input/config
new file mode 100644
index 000000000000..dbe93ff26408
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/config
@@ -0,0 +1,3 @@
+CONFIG_A=y
+CONFIG_NUM=30
+CONFIG_DUP=y
diff --git a/scripts/kconfig/tests/warn_changed_input/expected_config b/scripts/kconfig/tests/warn_changed_input/expected_config
new file mode 100644
index 000000000000..fe8bbec66c53
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/expected_config
@@ -0,0 +1,6 @@
+#
+# Automatically generated file; DO NOT EDIT.
+# Main menu
+#
+# CONFIG_DEP is not set
+CONFIG_NUM=20
diff --git a/scripts/kconfig/tests/warn_changed_input/expected_defconfig b/scripts/kconfig/tests/warn_changed_input/expected_defconfig
new file mode 100644
index 000000000000..af9e34851d2a
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/expected_defconfig
@@ -0,0 +1 @@
+CONFIG_NUM=20
diff --git a/scripts/kconfig/tests/warn_changed_input/expected_stdout b/scripts/kconfig/tests/warn_changed_input/expected_stdout
new file mode 100644
index 000000000000..9ec8446b4ac2
--- /dev/null
+++ b/scripts/kconfig/tests/warn_changed_input/expected_stdout
@@ -0,0 +1,4 @@
+warning: user-provided values changed by Kconfig:
+  CONFIG_A: y -> n
+  CONFIG_NUM: 30 -> 20
+  CONFIG_DUP: y -> n
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* Re: [PATCH bpf-next v2] bpf, docs: add LOAD_ACQUIRE and STORE_RELEASE instructions
From: David Vernet @ 2026-05-21  2:17 UTC (permalink / raw)
  To: Alexis Lothoré (eBPF Foundation)
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Song Liu, Yonghong Song, Jiri Olsa, Jonathan Corbet, Shuah Khan,
	ebpf, Bastien Curutchet, Thomas Petazzoni, bpf, bpf, linux-doc,
	linux-kernel
In-Reply-To: <20260521-bpf-insn-doc-v2-1-8c43c037d599@bootlin.com>

[-- Attachment #1: Type: text/plain, Size: 2920 bytes --]

On Thu, May 21, 2026 at 12:09:11AM +0200, Alexis Lothoré (eBPF Foundation) wrote:

Hi Alexis,

Thanks for working on this.

> Commit 880442305a39 ("bpf: Introduce load-acquire and store-release
> instructions") instroduced the LOAD_ACQUIRE and STORE_RELEASE atomic

introduced

> instructions modifiers. Those are currently not described in the
> documentation, despite being used in the verifier and the various JIT
> compilers supporting them.
> 
> Add the missing entries in the instruction set documentation.
> 
> Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>

Alexei et al -- if you plan to do a subsequent RFC, it will influence
how this document needs to be structured. [0] explains the process for
adding new instructions. To quote:

> Once a conformance group is registered with a set of instructions, no
> further instructions can be added to that conformance group. A
> specification should instead create a new conformance group that
> includes the original conformance group, plus any newly added
> instructions. Inclusion of the original conformance group is done via
> the "includes" column of the BPF Instruction Conformance Groups
> registry, and inclusion of newly added instructions is done via the
> "groups" column of the BPF Instruction Set registry.

So you would have to create a new conformance group for these new
atomics -- you can't just add them to the existing one. In general it
might be easier / advised to snapshot this file to RFC 9669 and create a
new one for the new instructions to make it easier to tease this stuff
apart later. If that's something you want, I'm happy to get us started
with a skeleton file. Again, though, that's only necessary if you plan
to submit a new document to the IETF WG.

[0]: https://www.rfc-editor.org/rfc/rfc9669.html#name-adding-instructions

[...]

> +The ``LOAD_ACQ`` and ``STORE_REL`` operations allow using lighter load and
> +store memory barriers rather than full barriers. The corresponding accesses
> +must be aligned, but are allowed for any access size (8-bit up to 64-bit
> +operations), with 8-bit and 16-bit ``LOAD_ACQ`` loaded values being
> +zero-extended. As atomics are encoded as stores, the meaning of dst and src

Nit:

``dst`` and ``src``

``src`` below as well.

Note though that as mentioned above, these instructions should probably
go into a new conformance group that includes the existing atomics.

> +are different for ``LOAD_ACQ``, effectively using src as memory based
> +pointer and dst as destination register for the fetched value.
> +
>  64-bit immediate instructions
>  -----------------------------
>  
> 
> ---
> base-commit: ceeb3aa37bff895116944acf4347fcded0b7692d
> change-id: 20260520-bpf-insn-doc-756b369ca328
> 
> Best regards,
> --  
> Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v5 08/13] ima: Introduce ima_dump_measurement()
From: Mimi Zohar @ 2026-05-21  2:07 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-9-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Introduce ima_dump_measurement() to simplify the code of
> ima_dump_measurement_list() and to avoid repeating the
> ima_dump_measurement() code block if iteration occurs on multiple lists.
> 
> No functional change: only code moved to a separate function.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>


^ permalink raw reply

* Re: [PATCH v5 07/13] ima: Use snprintf() in create_securityfs_measurement_lists
From: Mimi Zohar @ 2026-05-21  2:07 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-8-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Use the more secure snprintf() function (accepting the buffer size) in
> create_securityfs_measurement_lists().
> 
> No functional change: sprintf() and snprintf() have the same behavior.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>


^ permalink raw reply

* Re: [PATCH v5 06/13] ima: Mediate open/release method of the measurements list
From: Mimi Zohar @ 2026-05-21  2:07 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-7-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Introduce the ima_measure_users counter, to implement a semaphore-like
> locking scheme where the binary and ASCII measurements list interfaces can
> be concurrently open by multiple readers, or alternatively by a single
> writer.
> 
> A semaphore cannot be used because the kernel cannot return to user space
> with a lock held.
> 
> Introduce the ima_measure_lock() and ima_measure_unlock() primitives, to
> respectively lock/unlock the interfaces (safely with the ima_measure_users
> counter, without holding a lock).
> 
> Finally, introduce _ima_measurements_open() to lock the interface before
> seq_open(), and call it from ima_measurements_open() and
> ima_ascii_measurements_open(). And, introduce ima_measurements_release(),
> to unlock the interface.
> 
> Require CAP_SYS_ADMIN if the interface is opened for write (not possible
> for the current measurements interfaces, since they only have read
> permission).
> 
> No functional changes: multiple readers are allowed as before.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
> ---
>  security/integrity/ima/ima_fs.c | 71 +++++++++++++++++++++++++++++++--
>  1 file changed, 67 insertions(+), 4 deletions(-)
> 
> diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
> index 9a8dba14d82a..68edea7139d5 100644
> --- a/security/integrity/ima/ima_fs.c
> +++ b/security/integrity/ima/ima_fs.c
> @@ -25,6 +25,8 @@
>  #include "ima.h"
>  
>  static DEFINE_MUTEX(ima_write_mutex);
> +static DEFINE_MUTEX(ima_measure_mutex);
> +static long ima_measure_users;

long?

>  
>  bool ima_canonical_fmt;
>  static int __init default_canonical_fmt_setup(char *str)
> @@ -209,16 +211,76 @@ static const struct seq_operations ima_measurments_seqops = {
>  	.show = ima_measurements_show
>  };
>  
> +static int ima_measure_lock(bool write)
> +{
> +	mutex_lock(&ima_measure_mutex);
> +	if ((write && ima_measure_users != 0) ||
> +	    (!write && ima_measure_users < 0)) {
> +		mutex_unlock(&ima_measure_mutex);
> +		return -EBUSY;
> +	}

Thanks, Roberto. The code is really clear and well written.  However, it could
use a comment indicating the different ima_measure_users values as a reminder.

ima_measure_users:  > 0 open readers
ima_measure_users: == -1 open writer

> +
> +	if (write)
> +		ima_measure_users--;
> +	else
> +		ima_measure_users++;
> +	mutex_unlock(&ima_measure_mutex);
> +	return 0;
> +}
> +
> +static void ima_measure_unlock(bool write)
> +{
> +	mutex_lock(&ima_measure_mutex);
> +	if (write)
> +		ima_measure_users++;

There should only be one writer at a time. ima_measure_users could be set to
zero.

> +	else
> +		ima_measure_users--;
> +	mutex_unlock(&ima_measure_mutex);
> +}
> +

thanks,

Mimi

^ permalink raw reply

* Re: [PATCH v5 05/13] ima: Introduce _ima_measurements_start() and _ima_measurements_next()
From: Mimi Zohar @ 2026-05-21  2:06 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-6-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Introduce _ima_measurements_start() and _ima_measurements_next(), renamed
> from ima_measurements_start() and ima_measurements_next(), to include the
> list head as an additional parameter, so that iteration on different lists
> can be implemented by calling those functions.
> 
> No functional change: ima_measurements_start() and ima_measurements_next()
> pass the ima_measurements list head, used before.

ima_measurements_start() and ima_measurements_next() become "wrappers" for the
new functions.

> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>

^ permalink raw reply

* Re: [PATCH v5 04/13] ima: Introduce per binary measurements list type binary_runtime_size value
From: Mimi Zohar @ 2026-05-21  2:06 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-5-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Make binary_runtime_size as an array, to have separate counters per binary
> measurements list type. Currently, define the BINARY type for the existing
> binary measurements list.
> 
> Introduce ima_update_binary_runtime_size() to facilitate updating a
> binary_runtime_size value with a given binary measurement list type.
> 
> Also add the binary measurements list type parameter to
> ima_get_binary_runtime_size(), to retrieve the desired value. Retrieving
> the value is now done under the ima_extend_list_mutex, since there can be
> concurrent updates.
> 
> No functional change (except for the mutex usage, that fixes the
> concurrency issue): the BINARY array element is equivalent to the old
> binary_runtime_size.

The patch is really clear and well written, but I don't see a concurrency issue
that requires taking the ima_extend_list_mutex, at least not in this patch.

Mimi

> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>

^ permalink raw reply

* Re: [PATCH v5 03/13] ima: Introduce per binary measurements list type ima_num_entries counter
From: Mimi Zohar @ 2026-05-21  2:05 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-4-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Make ima_num_entries as an array, to have separate counters per binary
> measurements list type. Currently, define the BINARY type for the existing
> binary measurements list.
> 
> No functional change: the BINARY type is equivalent to the value without
> the array.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>


Thanks, Roberto.

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>

^ permalink raw reply

* Re: [PATCH v5 02/13] ima: Replace static htable queue with dynamically allocated array
From: Mimi Zohar @ 2026-05-21  2:05 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-3-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> The IMA hash table is a fixed-size array of hlist_head buckets:
> 
>     struct hlist_head ima_htable[IMA_MEASURE_HTABLE_SIZE];
> 
> IMA_MEASURE_HTABLE_SIZE is (1 << IMA_HASH_BITS) = 1024 buckets, each a
> struct hlist_head (one pointer, 8 bytes on 64-bit). That is 8 KiB allocated
> in BSS for every kernel, regardless of whether IMA is ever used, and
> regardless of how many measurements are actually made.
> 
> Replace the fixed-size array with a RCU-protected pointer to a dynamically
> allocated array that is initialized in ima_init_htable(), which is called
> from ima_init() during early boot. ima_init_htable() calls the static
> function ima_alloc_replace_htable() which, other than initializing the hash
> table the first time, can also hot-swap the existing hash table with a
> blank one.
> 
> The allocation in ima_alloc_replace_htable() uses kcalloc() so the buckets
> are zero-initialised (equivalent to HLIST_HEAD_INIT { .first = NULL }).
> Callers of ima_alloc_replace_htable() must call synchronize_rcu() and free
> the returned hash table.
> 
> Finally, access the hash table with rcu_dereference() in
> ima_lookup_digest_entry() (reader side) and with
> rcu_dereference_protected() in ima_add_digest_entry() (writer side).
> 
> No functional change: bucket count, hash function, and all locking remain
> identical.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>

^ permalink raw reply

* Re: [PATCH v5 01/13] ima: Remove ima_h_table structure
From: Mimi Zohar @ 2026-05-21  2:05 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-2-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 

Instead of jumping straight to "With the upcoming change ...", some context is
needed.  Perhaps something like:

The ima_h_table structure is a collection of IMA measurement list metadata -
number of records in the IMA measurement list, number of integrity violations,
and a hash table containing the IMA template data hash, needed to prevent
measurement list record duplication.

Removing records from the measurement list needs to be reflected in the hash
table.  As a pre-req to removing records from the measurement list, separate ...

> With the upcoming change of dynamically allocating and replacing the hash
> table, the ima_h_table structure would have been replaced with a new one.
> 
> However, since the ima_h_table structure contains also the counters for
> number of measurements entries and violations, we would have needed to
> preserve their values in the new ima_h_table structure.
> 
> Instead, separate those counters from the hash table, remove the
> ima_h_table structure, and just replace the hash table pointer.
> 
> Finally, rename ima_show_htable_value(), ima_show_htable_violations()
> and ima_htable_violations_ops respectively to ima_show_counter(),
> ima_show_num_violations() and ima_num_violations_ops.
> 
> Link: https://github.com/linux-integrity/linux/issues/1
> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>


Other than referring to "entries" in the measurement list, the patch looks good.
I prefer referring to them as "records".

> ---
>  security/integrity/ima/ima.h       |  9 +++------
>  security/integrity/ima/ima_api.c   |  2 +-
>  security/integrity/ima/ima_fs.c    | 19 +++++++++----------
>  security/integrity/ima/ima_kexec.c |  2 +-
>  security/integrity/ima/ima_queue.c | 17 ++++++++++-------
>  5 files changed, 24 insertions(+), 25 deletions(-)
> 
> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
> index 69e9bf0b82c6..51a8a582df56 100644
> --- a/security/integrity/ima/ima.h
> +++ b/security/integrity/ima/ima.h
> @@ -324,12 +324,9 @@ int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event,
>   */
>  extern spinlock_t ima_queue_lock;
>  
> -struct ima_h_table {
> -	atomic_long_t len;	/* number of stored measurements in the list */
> -	atomic_long_t violations;
> -	struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE];
> -};
> -extern struct ima_h_table ima_htable;
> +extern atomic_long_t ima_num_entries;

-> ima_num_records	/* Total number of measurement list records */

Will this be the current or total number of measurement list records since a
hard boot?

> +extern atomic_long_t ima_num_violations;

Similarly, will this be the current number or total number of violations since a
hard boot?  Please add a comment.

> +extern struct hlist_head ima_htable[IMA_MEASURE_HTABLE_SIZE];
>  
>  static inline unsigned int ima_hash_key(u8 *digest)
>  {

thanks,

Mimi

^ permalink raw reply

* Re: [PATCH v5 00/13] ima: Introduce staging mechanism
From: Mimi Zohar @ 2026-05-21  2:02 UTC (permalink / raw)
  To: Roberto Sassu, corbet, skhan, dmitry.kasatkin, eric.snowberg,
	paul, jmorris, serge
  Cc: linux-doc, linux-kernel, linux-integrity, linux-security-module,
	gregorylumen, chenste, nramas, Roberto Sassu
In-Reply-To: <20260429160319.4162918-1-roberto.sassu@huaweicloud.com>

On Wed, 2026-04-29 at 18:03 +0200, Roberto Sassu wrote:
> From: Roberto Sassu <roberto.sassu@huawei.com>
> 
> Introduction
> ============
> 
> The IMA measurements list is currently stored in the kernel memory.
> Memory occupation grows linearly with the number of entries, and can
> become a problem especially in environments with reduced resources.
> 
> While there is an advantage in keeping the IMA measurements list in
> kernel memory, so that it is always available for reading from the
> securityfs interfaces, storing it elsewhere would make it possible to
> free precious memory for other kernel components.

-> for other kernel usage.


Prefix the following paragraph with:
The IMA measurement list needs to be retained and safely stored for new
attestation servers to validate the entire measurement list.  Assuming the IMA
measurement list is properly saved, storing ...

> Storing the IMA measurements list outside the kernel does not introduce
> security issues, since its integrity is anyway protected by the TPM.
> 
> Hence, the new IMA staging mechanism is introduced to allow user space
> to remove the desired portion of the measurements list from the kernel.

"desired portion" could be misconstrued as any subset of the measurement list.

-> to remove the entire or a portion of the measurement list ...

> 
> Usage
> =====

> The IMA staging mechanism can be enabled from the kernel configuration
> with the CONFIG_IMA_STAGING option.

Continue with:
This option prevents inadvertently removing the IMA measurement list on systems
which do not properly save it.

> 
> If it is enabled, IMA duplicates the current measurements interfaces

-> duplicates the current securityfs measurement list interfaces

> (both binary and ASCII), by adding the _staged file suffix. Both the
> original and the staging interfaces gain the write permission for the
> root user and group, but require the process to have CAP_SYS_ADMIN set.
> 
> The staging mechanism supports two flavors.
> 
> Staging with prompt
> ~~~~~~~~~~~~~~~~~~~
> 
> The current measurements list is moved to a temporary staging area, and
> staged measurements are deleted upon confirmation.

-> The current measurement list is moved to a temporary staging area, allowing
it to be saved to external storage, before being deleted upon confirmation.
> 
> This staging process is achieved with the following steps.
> 
>   1.  echo A > <original interface>: the user requests IMA to stage the
>       entire measurements list;
>   2.  cat <_staged interface>: the user reads the staged measurements;
>   3.  echo D > <_staged interface>: the user requests IMA to delete
>       staged measurements.
> 
> Staging and deleting
> ~~~~~~~~~~~~~~~~~~~~
> 
> N measurements are staged to a temporary staging area, and immediately
> deleted without further confirmation.
> 
> This staging process is achieved with the following steps.
> 
>   1.  cat <original interface>: the user reads the current measurements
>       list and determines what the value N for staging should be;
>   2.  echo N > <original interface>: the user requests IMA to delete N
>       measurements from the current measurements list.
> 
> 
> Management of Staged Measurements
> =================================
> 
> Since with the staging mechanism measurement entries are removed from
> the kernel, the user needs to save the staged ones in a storage and
> concatenate them together, so that it can present them to remote
> attestation agents as if staging was never done.

"the user needs to save the staged ones" -> the staged measurements need to be
saved ....

Please mention this could be a system service.

thanks,

Mimi


^ permalink raw reply

* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Wei Yang @ 2026-05-21  1:55 UTC (permalink / raw)
  To: Nico Pache
  Cc: Wei Yang, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <CAA1CXcD2KPKFrwCZd2PatQhf_e1nrvCguPD77GcNOVPFZLvsew@mail.gmail.com>

On Wed, May 20, 2026 at 06:05:31AM -0600, Nico Pache wrote:
>On Tue, May 12, 2026 at 9:44 AM Wei Yang <richard.weiyang@gmail.com> wrote:
>>
>> On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
>> >Enable khugepaged to collapse to mTHP orders. This patch implements the
>> >main scanning logic using a bitmap to track occupied pages and a stack
>> >structure that allows us to find optimal collapse sizes.
>> >
>> >Previous to this patch, PMD collapse had 3 main phases, a light weight
>> >scanning phase (mmap_read_lock) that determines a potential PMD
>> >collapse, an alloc phase (mmap unlocked), then finally heavier collapse
>> >phase (mmap_write_lock).
>> >
>> >To enabled mTHP collapse we make the following changes:
>> >
>> >During PMD scan phase, track occupied pages in a bitmap. When mTHP
>> >orders are enabled, we remove the restriction of max_ptes_none during the
>> >scan phase to avoid missing potential mTHP collapse candidates. Once we
>> >have scanned the full PMD range and updated the bitmap to track occupied
>> >pages, we use the bitmap to find the optimal mTHP size.
>> >
>> >Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
>> >and determine the best eligible order for the collapse. A stack structure
>> >is used instead of traditional recursion to manage the search. This also
>> >prevents a traditional recursive approach when the kernel stack struct is
>> >limited. The algorithm recursively splits the bitmap into smaller chunks to
>> >find the highest order mTHPs that satisfy the collapse criteria. We start
>> >by attempting the PMD order, then moved on the consecutively lower orders
>> >(mTHP collapse). The stack maintains a pair of variables (offset, order),
>> >indicating the number of PTEs from the start of the PMD, and the order of
>> >the potential collapse candidate.
>> >
>> >The algorithm for consuming the bitmap works as such:
>> >    1) push (0, HPAGE_PMD_ORDER) onto the stack
>> >    2) pop the stack
>> >    3) check if the number of set bits in that (offset,order) pair
>> >       statisfy the max_ptes_none threshold for that order
>> >    4) if yes, attempt collapse
>> >    5) if no (or collapse fails), push two new stack items representing
>> >       the left and right halves of the current bitmap range, at the
>> >       next lower order
>> >    6) repeat at step (2) until stack is empty.
>> >
>> >Below is a diagram representing the algorithm and stack items:
>> >
>> >                            offset   mid_offset
>> >                            |        |
>> >                            |        |
>> >                            v        v
>> >          ____________________________________
>> >         |          PTE Page Table            |
>> >         --------------------------------------
>> >                           <-------><------->
>> >                             order-1  order-1
>> >
>> >mTHP collapses reject regions containing swapped out or shared pages.
>> >This is because adding new entries can lead to new none pages, and these
>> >may lead to constant promotion into a higher order mTHP. A similar
>> >issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
>> >introducing at least 2x the number of pages, and on a future scan will
>> >satisfy the promotion condition once again. This issue is prevented via
>> >the collapse_max_ptes_none() function which imposes the max_ptes_none
>> >restrictions above.
>> >
>> >We currently only support mTHP collapse for max_ptes_none values of 0
>> >and HPAGE_PMD_NR - 1. resulting in the following behavior:
>> >
>> >    - max_ptes_none=0: Never introduce new empty pages during collapse
>> >    - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
>> >      available mTHP order
>> >
>> >Any other max_ptes_none value will emit a warning and skip mTHP collapse
>> >attempts. There should be no behavior change for PMD collapse.
>> >
>> >Once we determine what mTHP sizes fits best in that PMD range a collapse
>> >is attempted. A minimum collapse order of 2 is used as this is the lowest
>> >order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
>> >
>> >Currently madv_collapse is not supported and will only attempt PMD
>> >collapse.
>> >
>> >We can also remove the check for is_khugepaged inside the PMD scan as
>> >the collapse_max_ptes_none() function handles this logic now.
>> >
>> >Signed-off-by: Nico Pache <npache@redhat.com>
>>
>> [...]
>>
>> >+static int mthp_collapse(struct mm_struct *mm, unsigned long address,
>> >+              int referenced, int unmapped, struct collapse_control *cc,
>> >+              unsigned long enabled_orders)
>> >+{
>> >+      unsigned int nr_occupied_ptes, nr_ptes;
>> >+      int max_ptes_none, collapsed = 0, stack_size = 0;
>> >+      unsigned long collapse_address;
>> >+      struct mthp_range range;
>> >+      u16 offset;
>> >+      u8 order;
>> >+
>> >+      collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
>> >+
>> >+      while (stack_size) {
>> >+              range = collapse_mthp_stack_pop(cc, &stack_size);
>> >+              order = range.order;
>> >+              offset = range.offset;
>> >+              nr_ptes = 1UL << order;
>> >+
>> >+              if (!test_bit(order, &enabled_orders))
>> >+                      goto next_order;
>> >+
>> >+              max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
>>
>> I am thinking whether there is a behavioral change for userfaultfd_armed(vma).
>>
>> collapse_single_pmd()
>>     collapse_scan_pmd
>>         max_ptes_none = collapse_max_ptes_none(cc, vma)
>>         max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT                --- (1)
>>         mthp_collapse
>>             max_ptes_none = collapse_max_ptes_none(cc, NULL)     --- (2)
>>             collapse_huge_page(mm)
>>                 hugepage_vma_revalidate(&vma)
>>                 __collapse_huge_page_isolate(vma)
>>                     max_ptes_none = collapse_max_ptes_none(cc, vma)
>>
>> Before mthp_collapse() introduced, userfaultfd_armed(vma) is skipped if there
>> is any pte_none_or_zero() in collapse_scan_pmd().
>>
>> But now, max_ptes_none could be set to KHUGEPAGED_MAX_PTES_LIMIT at (1), so
>> that we can scan all the pte to get the bitmap. This means
>> userfaultfd_armed(vma) could continue even with pte_none_or_zero().
>>
>> Then in mthp_collapse(), collapse_max_ptes_none() at (2) ignores
>> userfaultfd_armed(vma), which means it will continue to collapse a
>> userfaultfd_armed(vma) when there is pte_none_or_zero().
>>
>> The good news is we will stop at __collapse_huge_page_isolate(), where we
>> get collapse_max_ptes_none() with vma. But we already did a lot of work.
>
>Good catch!
>
>As you stated we eventually ensure we respect the uffd checks. So
>there are no correctness issues, just the potential for wasted cycles.
>
>At (1) we only do this if mTHPs are enabled. If that is the case, the
>only waste that can arise is at the PMD order, as that order respects
>the max_ptes_none value.
>
>I think one approach is to gate (1) with the uffd check as well. That
>way, if mTHPs are enabled and its uffd-armed, max_ptes_none will stay
>at 0, and we bail early on the scan early if any none_ptes are hit.
>
>But then we lose the ability to collapse to mTHPs that are uffd-armed,
>where the PMD has none/zero-ptes and the mTHP fully has 0
>non-none/zero-ptes.
>
>ie) assume a PMD is 16 x's [xxxxxxxx00000000]
>where x is a populated pte and 0 is not
>If we guard this scan (1), then we will never check if its possible to
>collapse to the smaller orders.
>
>Let me know if you see a flaw in my logic, I think it's best to keep it as is?
>

Yes, gating it at (1) is not the right place.

I am wondering whether we could pass vma to (2) instead, so that we respect
uffd-armed there?

>>
>> Not sure if I missed something.
>>
>> >+
>> >+              if (max_ptes_none < 0)
>> >+                      return collapsed;
>> >+
>> >+              nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
>> >+                                                             nr_ptes);
>> >+
>> >+              if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
>> >+                      int ret;
>> >+
>> >+                      collapse_address = address + offset * PAGE_SIZE;
>> >+                      ret = collapse_huge_page(mm, collapse_address, referenced,
>> >+                                               unmapped, cc, order);
>> >+                      if (ret == SCAN_SUCCEED) {
>> >+                              collapsed += nr_ptes;
>> >+                              continue;
>> >+                      }
>> >+              }
>> >+
>> >+next_order:
>> >+              if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>> >+                      const u8 next_order = order - 1;
>> >+                      const u16 mid_offset = offset + (nr_ptes / 2);
>> >+
>> >+                      collapse_mthp_stack_push(cc, &stack_size, mid_offset,
>> >+                                               next_order);
>> >+                      collapse_mthp_stack_push(cc, &stack_size, offset,
>> >+                                               next_order);
>> >+              }
>> >+      }
>> >+      return collapsed;
>> >+}
>> >+
>> > static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> >               struct vm_area_struct *vma, unsigned long start_addr,
>> >               bool *lock_dropped, struct collapse_control *cc)
>> > {
>> >-      const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
>> >+      int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
>> >       const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
>> >       const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
>> >+      enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>> >       pmd_t *pmd;
>> >-      pte_t *pte, *_pte;
>> >-      int none_or_zero = 0, shared = 0, referenced = 0;
>> >+      pte_t *pte, *_pte, pteval;
>> >+      int i;
>> >+      int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
>> >       enum scan_result result = SCAN_FAIL;
>> >       struct page *page = NULL;
>> >       struct folio *folio = NULL;
>> >       unsigned long addr;
>> >+      unsigned long enabled_orders;
>> >       spinlock_t *ptl;
>> >       int node = NUMA_NO_NODE, unmapped = 0;
>> >
>> >@@ -1429,8 +1579,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> >               goto out;
>> >       }
>> >
>> >+      bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
>> >       memset(cc->node_load, 0, sizeof(cc->node_load));
>> >       nodes_clear(cc->alloc_nmask);
>> >+
>> >+      enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
>>
>> Would it be 0 at this point?
>
>If your question relates to the issue you brought up above, then yes,
>max_ptes_none would be 0 if it's uffd-armed. We must recheck the
>uffd-armed status before modifying it to 511.
>
>>
>> >+
>> >+      /*
>> >+       * If PMD is the only enabled order, enforce max_ptes_none, otherwise
>> >+       * scan all pages to populate the bitmap for mTHP collapse.
>> >+       */
>> >+      if (enabled_orders != BIT(HPAGE_PMD_ORDER))
>> >+              max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
>> >+
>> >       pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>> >       if (!pte) {
>> >               cc->progress++;
>> >@@ -1438,11 +1599,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> >               goto out;
>> >       }
>> >
>> >-      for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>> >-           _pte++, addr += PAGE_SIZE) {
>> >+      for (i = 0; i < HPAGE_PMD_NR; i++) {
>> >+              _pte = pte + i;
>> >+              addr = start_addr + i * PAGE_SIZE;
>> >+              pteval = ptep_get(_pte);
>> >+
>> >               cc->progress++;
>> >
>> >-              pte_t pteval = ptep_get(_pte);
>> >               if (pte_none_or_zero(pteval)) {
>> >                       if (++none_or_zero > max_ptes_none) {
>> >                               result = SCAN_EXCEED_NONE_PTE;
>> >@@ -1522,6 +1685,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> >                       }
>> >               }
>> >
>> >+              /* Set bit for occupied pages */
>> >+              __set_bit(i, cc->mthp_bitmap);
>> >               /*
>> >                * Record which node the original page is from and save this
>> >                * information to cc->node_load[].
>> >@@ -1580,10 +1745,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> >       if (result == SCAN_SUCCEED) {
>> >               /* collapse_huge_page expects the lock to be dropped before calling */
>> >               mmap_read_unlock(mm);
>> >-              result = collapse_huge_page(mm, start_addr, referenced,
>> >-                                          unmapped, cc, HPAGE_PMD_ORDER);
>> >+              nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
>> >+                                            cc, enabled_orders);
>> >               /* collapse_huge_page will return with the mmap_lock released */
>>
>> collapse_huge_page will return with mmap_lock released, but mthp_collapse()
>> may not?
>
>We are now releasing the lock before calling mthp_collapse, which
>subsequently calls collapse_huge_page. Even if `collapse_huge_page` is
>never called-- say, because enabled_orders is 0 (which should not
>happen) and all collapse orders are skipped (never calling
>collapse_huge_page)-- we still return here with the lock dropped.
>
>I think this is sound. Let me know if you think differently.
>

You are right. I missed that the lock is released in the previous patch.

>Cheers :)
>-- Nico
>
>>
>> >               *lock_dropped = true;
>> >+              result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
>> >       }
>> > out:
>> >       trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
>> >--
>> >2.54.0
>>
>> --
>> Wei Yang
>> Help you, Help me
>>

-- 
Wei Yang
Help you, Help me

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox