linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC v11 01/14] mm: page_frag: add a test module for page_frag
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 17:34   ` Alexander Duyck
  2024-07-19  9:33 ` [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Based on lib/objpool.c, change it to something like a
ptrpool, so that we can utilize it to test the correctness
and performance of the page_frag.

The testing is done by ensuring that the fragment allocated
from a page_frag_cache instance is pushed into a ptrpool
instance in a kthread bound to a specified cpu, and a kthread
bound to a specified cpu will pop the fragment from the
ptrpool and free the fragment.

We may refactor out the common part between objpool and ptrpool
if this ptrpool thing turns out to be helpful for other places.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/Kconfig.debug    |   8 +
 mm/Makefile         |   1 +
 mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 402 insertions(+)
 create mode 100644 mm/page_frag_test.c

diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afc72fde0f03..1ebcd45f47d4 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
 	  kernel code.  However the runtime performance overhead is virtually
 	  nil until the tracepoints are actually enabled.
 
+config DEBUG_PAGE_FRAG_TEST
+	tristate "Test module for page_frag"
+	default n
+	depends on m && DEBUG_KERNEL
+	help
+	  This builds the "page_frag_test" module that is used to test the
+	  correctness and performance of page_frag's implementation.
+
 config DEBUG_RODATA_TEST
     bool "Testcase for the marking rodata read-only"
     depends on STRICT_KERNEL_RWX
diff --git a/mm/Makefile b/mm/Makefile
index 8fb85acda1b1..29d9f7618a33 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_PAGE_FRAG_TEST) += page_frag_test.o
 obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
new file mode 100644
index 000000000000..cf2691f60b67
--- /dev/null
+++ b/mm/page_frag_test.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright: linyunsheng@huawei.com
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/irqflags.h>
+#include <linux/cpumask.h>
+#include <linux/log2.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+
+#define OBJPOOL_NR_OBJECT_MAX	BIT(24)
+
+struct objpool_slot {
+	u32 head;
+	u32 tail;
+	u32 last;
+	u32 mask;
+	void *entries[];
+} __packed;
+
+struct objpool_head {
+	int nr_cpus;
+	int capacity;
+	struct objpool_slot **cpu_slots;
+};
+
+/* initialize percpu objpool_slot */
+static void objpool_init_percpu_slot(struct objpool_head *pool,
+				     struct objpool_slot *slot)
+{
+	/* initialize elements of percpu objpool_slot */
+	slot->mask = pool->capacity - 1;
+}
+
+/* allocate and initialize percpu slots */
+static int objpool_init_percpu_slots(struct objpool_head *pool,
+				     int nr_objs, gfp_t gfp)
+{
+	int i;
+
+	for (i = 0; i < pool->nr_cpus; i++) {
+		struct objpool_slot *slot;
+		int size;
+
+		/* skip the cpu node which could never be present */
+		if (!cpu_possible(i))
+			continue;
+
+		size = struct_size(slot, entries, pool->capacity);
+
+		/*
+		 * here we allocate percpu-slot & objs together in a single
+		 * allocation to make it more compact, taking advantage of
+		 * warm caches and TLB hits. in default vmalloc is used to
+		 * reduce the pressure of kernel slab system. as we know,
+		 * minimal size of vmalloc is one page since vmalloc would
+		 * always align the requested size to page size
+		 */
+		if (gfp & GFP_ATOMIC)
+			slot = kmalloc_node(size, gfp, cpu_to_node(i));
+		else
+			slot = __vmalloc_node(size, sizeof(void *), gfp,
+					      cpu_to_node(i),
+					      __builtin_return_address(0));
+		if (!slot)
+			return -ENOMEM;
+
+		memset(slot, 0, size);
+		pool->cpu_slots[i] = slot;
+
+		objpool_init_percpu_slot(pool, slot);
+	}
+
+	return 0;
+}
+
+/* cleanup all percpu slots of the object pool */
+static void objpool_fini_percpu_slots(struct objpool_head *pool)
+{
+	int i;
+
+	if (!pool->cpu_slots)
+		return;
+
+	for (i = 0; i < pool->nr_cpus; i++)
+		kvfree(pool->cpu_slots[i]);
+	kfree(pool->cpu_slots);
+}
+
+/* initialize object pool and pre-allocate objects */
+static int objpool_init(struct objpool_head *pool, int nr_objs, gfp_t gfp)
+{
+	int rc, capacity, slot_size;
+
+	/* check input parameters */
+	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX)
+		return -EINVAL;
+
+	/* calculate capacity of percpu objpool_slot */
+	capacity = roundup_pow_of_two(nr_objs);
+	if (!capacity)
+		return -EINVAL;
+
+	gfp = gfp & ~__GFP_ZERO;
+
+	/* initialize objpool pool */
+	memset(pool, 0, sizeof(struct objpool_head));
+	pool->nr_cpus = nr_cpu_ids;
+	pool->capacity = capacity;
+	slot_size = pool->nr_cpus * sizeof(struct objpool_slot *);
+	pool->cpu_slots = kzalloc(slot_size, gfp);
+	if (!pool->cpu_slots)
+		return -ENOMEM;
+
+	/* initialize per-cpu slots */
+	rc = objpool_init_percpu_slots(pool, nr_objs, gfp);
+	if (rc)
+		objpool_fini_percpu_slots(pool);
+
+	return rc;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static int objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	u32 head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		if (unlikely(tail - head >= pool->capacity))
+			return -ENOSPC;
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
+
+/* reclaim an object to object pool */
+static int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+
+/* try to retrieve object from slot */
+static void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	u32 head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/*
+		 * data visibility of 'last' and 'head' could be out of
+		 * order since memory updating of 'last' and 'head' are
+		 * performed in push() and pop() independently
+		 *
+		 * before any retrieving attempts, pop() must guarantee
+		 * 'last' is behind 'head', that is to say, there must
+		 * be available objects in slot, which could be ensured
+		 * by condition 'last != head && last - head <= nr_objs'
+		 * that is equivalent to 'last - head - 1 < nr_objs' as
+		 * 'last' and 'head' are both unsigned int32
+		 */
+		if (READ_ONCE(slot->last) - head - 1 >= pool->capacity) {
+			head = READ_ONCE(slot->head);
+			continue;
+		}
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/* allocate an object from object pool */
+static void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+
+/* release whole objpool forcely */
+static void objpool_free(struct objpool_head *pool)
+{
+	if (!pool->cpu_slots)
+		return;
+
+	/* release percpu slots */
+	objpool_fini_percpu_slots(pool);
+}
+
+static struct objpool_head ptr_pool;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_frag;
+
+static int nr_test = 5120000;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag pop test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *obj = objpool_pop(pool);
+
+		if (obj) {
+			nr--;
+			page_frag_free(obj);
+		} else {
+			cond_resched();
+		}
+	}
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	pr_info("page_frag pop test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag push test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *va;
+		int ret;
+
+		if (test_align) {
+			va = page_frag_alloc_align(&test_frag, test_alloc_len,
+						   GFP_KERNEL, SMP_CACHE_BYTES);
+
+			WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
+				  "unaligned va returned\n");
+		} else {
+			va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
+		}
+
+		if (!va)
+			continue;
+
+		ret = objpool_push(va, pool);
+		if (ret) {
+			page_frag_free(va);
+			cond_resched();
+		} else {
+			nr--;
+		}
+	}
+
+	pr_info("page_frag push test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	return 0;
+}
+
+static int __init page_frag_test_init(void)
+{
+	struct task_struct *tsk_push, *tsk_pop;
+	ktime_t start;
+	u64 duration;
+	int ret;
+
+	test_frag.va = NULL;
+	atomic_set(&nthreads, 2);
+	init_completion(&wait);
+
+	if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0)
+		return -EINVAL;
+
+	ret = objpool_init(&ptr_pool, nr_objs, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_pool,
+					 test_push_cpu, "page_frag_push");
+	if (IS_ERR(tsk_push))
+		return PTR_ERR(tsk_push);
+
+	tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_pool,
+					test_pop_cpu, "page_frag_pop");
+	if (IS_ERR(tsk_pop)) {
+		kthread_stop(tsk_push);
+		return PTR_ERR(tsk_pop);
+	}
+
+	start = ktime_get();
+	wake_up_process(tsk_push);
+	wake_up_process(tsk_pop);
+
+	pr_info("waiting for test to complete\n");
+	wait_for_completion(&wait);
+
+	duration = (u64)ktime_us_delta(ktime_get(), start);
+	pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
+		test_align ? "aligned" : "non-aligned", duration);
+
+	objpool_free(&ptr_pool);
+	page_frag_cache_drain(&test_frag);
+
+	return -EAGAIN;
+}
+
+static void __exit page_frag_test_exit(void)
+{
+}
+
+module_init(page_frag_test_init);
+module_exit(page_frag_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
+MODULE_DESCRIPTION("Test module for page_frag");
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
  2024-07-19  9:33 ` [RFC v11 01/14] mm: page_frag: add a test module for page_frag Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 17:58   ` Alexander Duyck
  2024-07-19  9:33 ` [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, David Howells,
	Alexander Duyck, Andrew Morton, linux-mm

Inspired by [1], move the page fragment allocator from page_alloc
into its own c file and header file, as we are about to make more
changes to it to replace another page_frag implementation in
sock.c.

As this patchset is going to replace 'struct page_frag' with
'struct page_frag_cache' in sched.h, including page_frag_cache.h
in sched.h has a compiler error caused by interdependence between
mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler
error by moving 'struct page_frag_cache' to mm_types_task.h as
suggested by Alexander, see [3].

1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956920@gmail.com/
3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=YM_aiA4OomD-2wXykO42bizaWMt_HOA@mail.gmail.com/
CC: David Howells <dhowells@redhat.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/gfp.h             |  22 -----
 include/linux/mm_types.h        |  18 ----
 include/linux/mm_types_task.h   |  18 ++++
 include/linux/page_frag_cache.h |  32 +++++++
 include/linux/skbuff.h          |   1 +
 mm/Makefile                     |   1 +
 mm/page_alloc.c                 | 136 ------------------------------
 mm/page_frag_cache.c            | 145 ++++++++++++++++++++++++++++++++
 mm/page_frag_test.c             |   2 +-
 9 files changed, 198 insertions(+), 177 deletions(-)
 create mode 100644 include/linux/page_frag_cache.h
 create mode 100644 mm/page_frag_cache.c

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7f9691d375f0..3d8f9dc6c6ee 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -363,28 +363,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
-struct page_frag_cache;
-void page_frag_cache_drain(struct page_frag_cache *nc);
-extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
-
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask)
-{
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
-}
-
-extern void page_frag_free(void *addr);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index af3a0256fa93..7a4e695a7a1e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -505,9 +505,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
  */
 #define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 /*
  * page_private can be used on tail pages.  However, PagePrivate is only
  * checked by the VM on the head page.  So page_private on the tail pages
@@ -526,21 +523,6 @@ static inline void *folio_get_private(struct folio *folio)
 	return folio->private;
 }
 
-struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
-};
-
 typedef unsigned long vm_flags_t;
 
 /*
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a2f6179b672b..cdc1e3696439 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -8,6 +8,7 @@
  * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
  */
 
+#include <linux/align.h>
 #include <linux/types.h>
 
 #include <asm/page.h>
@@ -46,6 +47,23 @@ struct page_frag {
 #endif
 };
 
+#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+struct page_frag_cache {
+	void *va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_refcount every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};
+
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
new file mode 100644
index 000000000000..43afb1bbcac9
--- /dev/null
+++ b/include/linux/page_frag_cache.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_PAGE_FRAG_CACHE_H
+#define _LINUX_PAGE_FRAG_CACHE_H
+
+#include <linux/log2.h>
+#include <linux/types.h>
+#include <linux/mm_types_task.h>
+#include <asm/page.h>
+
+void page_frag_cache_drain(struct page_frag_cache *nc);
+void __page_frag_cache_drain(struct page *page, unsigned int count);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
+
+static inline void *page_frag_alloc(struct page_frag_cache *nc,
+				    unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+}
+
+void page_frag_free(void *addr);
+
+#endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c29bdd5596d..e0e2be5194fb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/if_packet.h>
 #include <linux/llist.h>
+#include <linux/page_frag_cache.h>
 #include <net/flow.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_common.h>
diff --git a/mm/Makefile b/mm/Makefile
index 29d9f7618a33..3080257a0a75 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -64,6 +64,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 
 obj-y += page-alloc.o
+obj-y += page_frag_cache.o
 obj-y += init-mm.o
 obj-y += memblock.o
 obj-y += $(memory-hotplug-y)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9ecf99190ea2..edbb5a43f47b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4786,142 +4786,6 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
-/*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
- */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
-{
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
-		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
-	if (!nc->va)
-		return;
-
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
-{
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
-	if (page_ref_sub_and_test(page, count))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
-{
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
-
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			return NULL;
-		}
-	}
-
-	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
-
-	return nc->va + offset;
-}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
-	struct page *page = virt_to_head_page(addr);
-
-	if (unlikely(put_page_testzero(page)))
-		free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
-
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		size_t size)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
new file mode 100644
index 000000000000..609a485cd02a
--- /dev/null
+++ b/mm/page_frag_cache.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Page fragment allocator
+ *
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides within a
+ *  0 or higher order page.  Multiple fragments within that page are
+ *  individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions provide a simple allocation framework for page
+ * fragments.  This is used by the network stack and network device drivers to
+ * provide a backing region of memory for use as either an sk_buff->head, or to
+ * be used in the "frags" portion of skb_shared_info.
+ */
+
+#include <linux/export.h>
+#include <linux/gfp_types.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/page_frag_cache.h>
+#include "internal.h"
+
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
+{
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				PAGE_FRAG_CACHE_MAX_ORDER);
+	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->va = page ? page_address(page) : NULL;
+
+	return page;
+}
+
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+	if (page_ref_sub_and_test(page, count))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
+{
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
+	int offset;
+
+	if (unlikely(!nc->va)) {
+refill:
+		page = __page_frag_cache_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pfmemalloc = page_is_pfmemalloc(page);
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->offset = size;
+	}
+
+	offset = nc->offset - fragsz;
+	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		offset = size - fragsz;
+		if (unlikely(offset < 0)) {
+			/*
+			 * The caller is trying to allocate a fragment
+			 * with fragsz > PAGE_SIZE but the cache isn't big
+			 * enough to satisfy the request, this may
+			 * happen in low memory conditions.
+			 * We don't release the cache page because
+			 * it could make memory pressure worse
+			 * so we simply return NULL here.
+			 */
+			return NULL;
+		}
+	}
+
+	nc->pagecnt_bias--;
+	offset &= align_mask;
+	nc->offset = offset;
+
+	return nc->va + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_align);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+	struct page *page = virt_to_head_page(addr);
+
+	if (unlikely(put_page_testzero(page)))
+		free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index cf2691f60b67..b7a5affb92f2 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -6,7 +6,6 @@
  * Copyright: linyunsheng@huawei.com
  */
 
-#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -16,6 +15,7 @@
 #include <linux/log2.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
+#include <linux/page_frag_cache.h>
 
 #define OBJPOOL_NR_OBJECT_MAX	BIT(24)
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align()
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
  2024-07-19  9:33 ` [RFC v11 01/14] mm: page_frag: add a test module for page_frag Yunsheng Lin
  2024-07-19  9:33 ` [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 18:34   ` Alexander Duyck
  2024-07-19  9:33 ` [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

We are about to use page_frag_alloc_*() API to not just
allocate memory for skb->data, but also use them to do
the memory allocation for skb frag too. Currently the
implementation of page_frag in mm subsystem is running
the offset as a countdown rather than count-up value,
there may be several advantages to that as mentioned
in [1], but it may have some disadvantages, for example,
it may disable skb frag coalescing and more correct cache
prefetching.

We have a trade-off to make in order to have a unified
implementation and API for page_frag, so use an initial zero
offset in this patch, and the following patch will try to
make some optimization to avoid the disadvantages as much
as possible.

Rename 'offset' to 'remaining' to retain the 'countdown'
behavior as 'remaining countdown' instead of 'offset
countdown'. Also, renaming enables us to do a single
'fragsz > remaining' check for the case of the cache not
being enough, which should be the fast path if we ensure
'remaining' is zero when 'va' == NULL by memset'ing
'struct page_frag_cache' in page_frag_cache_init() and
page_frag_cache_drain().

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/mm_types_task.h |  4 +-
 mm/page_frag_cache.c          | 71 +++++++++++++++++++++--------------
 2 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index cdc1e3696439..b1c54b2b9308 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -52,10 +52,10 @@ struct page_frag {
 struct page_frag_cache {
 	void *va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
+	__u16 remaining;
 	__u16 size;
 #else
-	__u32 offset;
+	__u32 remaining;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
 	 * containing page->_refcount every time we allocate a fragment.
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 609a485cd02a..2958fe006fe7 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -22,6 +22,7 @@
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
+	unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
@@ -30,12 +31,21 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		if (unlikely(!page)) {
+			nc->va = NULL;
+			return NULL;
+		}
 
-	nc->va = page ? page_address(page) : NULL;
+		page_size = PAGE_SIZE;
+	}
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	nc->size = page_size;
+#endif
+	nc->va = page_address(page);
 
 	return page;
 }
@@ -64,8 +74,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
+	unsigned int remaining;
 	struct page *page;
-	int offset;
 
 	if (unlikely(!nc->va)) {
 refill:
@@ -82,35 +92,20 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		 */
 		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
-		/* reset page count bias and offset to start of new frag */
+		/* reset page count bias and remaining to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->remaining = size;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
-			goto refill;
-		}
-
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
 #endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+	remaining = nc->remaining & align_mask;
+	if (unlikely(remaining < fragsz)) {
+		if (unlikely(fragsz > PAGE_SIZE)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -122,13 +117,31 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			 */
 			return NULL;
 		}
+
+		page = virt_to_page(nc->va);
+
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+			goto refill;
+
+		if (unlikely(nc->pfmemalloc)) {
+			free_unref_page(page, compound_order(page));
+			goto refill;
+		}
+
+		/* OK, page count is 0, we can safely set it */
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+		/* reset page count bias and remaining to start of new frag */
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->remaining = size;
+
+		remaining = size;
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->remaining = remaining - fragsz;
 
-	return nc->va + offset;
+	return nc->va + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_align);
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (2 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
       [not found]   ` <CAKgT0UcqELiXntRA_uD8eJGjt-OCLO64ax=YFXrCHNnaj9kD8g@mail.gmail.com>
  2024-07-19  9:33 ` [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Subbaraya Sundeep, Jeroen de Borst, Praveen Kaligineedi,
	Shailend Chand, Eric Dumazet, Tony Nguyen, Przemek Kitszel,
	Sunil Goutham, Geetha sowjanya, hariprasad, Felix Fietkau,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Michael S. Tsirkin, Jason Wang, Eugenio Pérez, Andrew Morton,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, David Howells,
	Marc Dionne, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	intel-wired-lan, linux-arm-kernel, linux-mediatek, linux-nvme,
	kvm, virtualization, linux-mm, bpf, linux-afs, linux-nfs

Currently the page_frag API is returning a 'virtual address'
or 'va' when allocating, and expecting a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases in which the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API in
order to differentiate the API handling between 'va' and
'struct page', mirroring the page_pool_alloc_va() API of the
page_pool. That way, callers expecting to deal with va, page,
or both va and page may call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
---
 drivers/net/ethernet/google/gve/gve_rx.c      |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c     |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h     |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c       |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c    |  4 ++--
 drivers/nvme/host/tcp.c                       |  8 +++----
 drivers/nvme/target/tcp.c                     | 22 +++++++++----------
 drivers/vhost/net.c                           |  6 ++---
 include/linux/page_frag_cache.h               | 21 +++++++++---------
 include/linux/skbuff.h                        |  2 +-
 kernel/bpf/cpumap.c                           |  2 +-
 mm/page_frag_cache.c                          | 12 +++++-----
 mm/page_frag_test.c                           | 13 ++++++-----
 net/core/skbuff.c                             | 14 ++++++------
 net/core/xdp.c                                |  2 +-
 net/rxrpc/txbuf.c                             | 15 +++++++------
 net/sunrpc/svcsock.c                          |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
 	total_len = headroom + SKB_DATA_ALIGN(len) +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+	frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
 	if (!frame) {
 		u64_stats_update_begin(&rx->statss);
 		rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
 	err = xdp_do_redirect(dev, &new, xdp_prog);
 	if (err)
-		page_frag_free(frame);
+		page_frag_free_va(frame);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
 		dev_kfree_skb_any(tx_buf->skb);
 		break;
 	case ICE_TX_BUF_XDP_TX:
-		page_frag_free(tx_buf->raw_buf);
+		page_frag_free_va(tx_buf->raw_buf);
 		break;
 	case ICE_TX_BUF_XDP_XMIT:
 		xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf,
 
 	switch (tx_buf->type) {
 	case ICE_TX_BUF_XDP_TX:
-		page_frag_free(tx_buf->raw_buf);
+		page_frag_free_va(tx_buf->raw_buf);
 		break;
 	case ICE_TX_BUF_XDP_XMIT:
 		xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 149911e3002a..eef16a909f85 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -302,7 +302,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			page_frag_free_va(tx_buffer->data);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2412,7 +2412,7 @@ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			page_frag_free_va(tx_buffer->data);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index 87d5776e3b88..a485e988fa1d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -553,7 +553,7 @@ static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
 	*dma = dma_map_single_attrs(pfvf->dev, buf, pool->rbsize,
 				    DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 	if (unlikely(dma_mapping_error(pfvf->dev, *dma))) {
-		page_frag_free(buf);
+		page_frag_free_va(buf);
 		return -ENOMEM;
 	}
 
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7063c78bd35f..c4228719f8a4 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -142,8 +142,8 @@ mtk_wed_wo_queue_refill(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q,
 		dma_addr_t addr;
 		void *buf;
 
-		buf = page_frag_alloc(&q->cache, q->buf_size,
-				      GFP_ATOMIC | GFP_DMA32);
+		buf = page_frag_alloc_va(&q->cache, q->buf_size,
+					 GFP_ATOMIC | GFP_DMA32);
 		if (!buf)
 			break;
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a2a47d3ab99f..86906bc505de 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -506,7 +506,7 @@ static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 {
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 
-	page_frag_free(req->pdu);
+	page_frag_free_va(req->pdu);
 }
 
 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
@@ -520,7 +520,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
+	req->pdu = page_frag_alloc_va(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
@@ -1337,7 +1337,7 @@ static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
 {
 	struct nvme_tcp_request *async = &ctrl->async_req;
 
-	page_frag_free(async->pdu);
+	page_frag_free_va(async->pdu);
 }
 
 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
@@ -1346,7 +1346,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
+	async->pdu = page_frag_alloc_va(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 5bff0d5464d1..560df3db2f82 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1463,24 +1463,24 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
 	c->queue = queue;
 	c->req.port = queue->port->nport;
 
-	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
+	c->cmd_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->cmd_pdu)
 		return -ENOMEM;
 	c->req.cmd = &c->cmd_pdu->cmd;
 
-	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
+	c->rsp_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->rsp_pdu)
 		goto out_free_cmd;
 	c->req.cqe = &c->rsp_pdu->cqe;
 
-	c->data_pdu = page_frag_alloc(&queue->pf_cache,
+	c->data_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->data_pdu)
 		goto out_free_rsp;
 
-	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
+	c->r2t_pdu = page_frag_alloc_va(&queue->pf_cache,
 			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
 	if (!c->r2t_pdu)
 		goto out_free_data;
@@ -1495,20 +1495,20 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
 
 	return 0;
 out_free_data:
-	page_frag_free(c->data_pdu);
+	page_frag_free_va(c->data_pdu);
 out_free_rsp:
-	page_frag_free(c->rsp_pdu);
+	page_frag_free_va(c->rsp_pdu);
 out_free_cmd:
-	page_frag_free(c->cmd_pdu);
+	page_frag_free_va(c->cmd_pdu);
 	return -ENOMEM;
 }
 
 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
 {
-	page_frag_free(c->r2t_pdu);
-	page_frag_free(c->data_pdu);
-	page_frag_free(c->rsp_pdu);
-	page_frag_free(c->cmd_pdu);
+	page_frag_free_va(c->r2t_pdu);
+	page_frag_free_va(c->data_pdu);
+	page_frag_free_va(c->rsp_pdu);
+	page_frag_free_va(c->cmd_pdu);
 }
 
 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..6691fac01e0d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -686,8 +686,8 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 		return -ENOSPC;
 
 	buflen += SKB_DATA_ALIGN(len + pad);
-	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
-				    SMP_CACHE_BYTES);
+	buf = page_frag_alloc_va_align(&net->pf_cache, buflen, GFP_KERNEL,
+				       SMP_CACHE_BYTES);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
@@ -734,7 +734,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	return 0;
 
 err:
-	page_frag_free(buf);
+	page_frag_free_va(buf);
 	return ret;
 }
 
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 43afb1bbcac9..4c5079f232b5 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -10,23 +10,24 @@
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
+void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
+				 unsigned int fragsz, gfp_t gfp_mask,
+				 unsigned int align_mask);
 
-static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
-					  unsigned int fragsz, gfp_t gfp_mask,
-					  unsigned int align)
+static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
+					     unsigned int fragsz,
+					     gfp_t gfp_mask, unsigned int align)
 {
 	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-				    unsigned int fragsz, gfp_t gfp_mask)
+static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
+				       unsigned int fragsz, gfp_t gfp_mask)
 {
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
 }
 
-void page_frag_free(void *addr);
+void page_frag_free_va(void *addr);
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e0e2be5194fb..fb74725d1af8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3381,7 +3381,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 
 static inline void skb_free_frag(void *addr)
 {
-	page_frag_free(addr);
+	page_frag_free_va(addr);
 }
 
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index fbdf5a1aabfe..3b70b6b071b9 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -323,7 +323,7 @@ static int cpu_map_kthread_run(void *data)
 
 			/* Bring struct page memory area to curr CPU. Read by
 			 * build_skb_around via page_is_pfmemalloc(), and when
-			 * freed written by page_frag_free call.
+			 * freed written by page_frag_free_va call.
 			 */
 			prefetchw(page);
 		}
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 2958fe006fe7..b12496f05c4a 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -69,9 +69,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
+void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
+				 unsigned int fragsz, gfp_t gfp_mask,
+				 unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
 	unsigned int remaining;
@@ -143,16 +143,16 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 
 	return nc->va + (size - remaining);
 }
-EXPORT_SYMBOL(__page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void page_frag_free(void *addr)
+void page_frag_free_va(void *addr)
 {
 	struct page *page = virt_to_head_page(addr);
 
 	if (unlikely(put_page_testzero(page)))
 		free_unref_page(page, compound_order(page));
 }
-EXPORT_SYMBOL(page_frag_free);
+EXPORT_SYMBOL(page_frag_free_va);
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index b7a5affb92f2..9eaa3ab74b29 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -276,7 +276,7 @@ static int page_frag_pop_thread(void *arg)
 
 		if (obj) {
 			nr--;
-			page_frag_free(obj);
+			page_frag_free_va(obj);
 		} else {
 			cond_resched();
 		}
@@ -304,13 +304,16 @@ static int page_frag_push_thread(void *arg)
 		int ret;
 
 		if (test_align) {
-			va = page_frag_alloc_align(&test_frag, test_alloc_len,
-						   GFP_KERNEL, SMP_CACHE_BYTES);
+			va = page_frag_alloc_va_align(&test_frag,
+						      test_alloc_len,
+						      GFP_KERNEL,
+						      SMP_CACHE_BYTES);
 
 			WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
 				  "unaligned va returned\n");
 		} else {
-			va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
+			va = page_frag_alloc_va(&test_frag, test_alloc_len,
+						GFP_KERNEL);
 		}
 
 		if (!va)
@@ -318,7 +321,7 @@ static int page_frag_push_thread(void *arg)
 
 		ret = objpool_push(va, pool);
 		if (ret) {
-			page_frag_free(va);
+			page_frag_free_va(va);
 			cond_resched();
 		} else {
 			nr--;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83f8cd8aa2d1..4b8acd967793 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -314,8 +314,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
-	data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
-				       align_mask);
+	data = __page_frag_alloc_va_align(&nc->page, fragsz, GFP_ATOMIC,
+					  align_mask);
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 	return data;
 
@@ -330,8 +330,8 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
 
 		fragsz = SKB_DATA_ALIGN(fragsz);
-		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
-					       align_mask);
+		data = __page_frag_alloc_va_align(nc, fragsz, GFP_ATOMIC,
+						  align_mask);
 	} else {
 		local_bh_disable();
 		data = __napi_alloc_frag_align(fragsz, align_mask);
@@ -748,14 +748,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
-		data = page_frag_alloc(nc, len, gfp_mask);
+		data = page_frag_alloc_va(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
-		data = page_frag_alloc(nc, len, gfp_mask);
+		data = page_frag_alloc_va(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
@@ -845,7 +845,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 	} else {
 		len = SKB_HEAD_ALIGN(len);
 
-		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
 		pfmemalloc = nc->page.pfmemalloc;
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bcc5551c6424..7d4e09fb478f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -387,7 +387,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		page_pool_put_full_page(page->pp, page, napi_direct);
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-		page_frag_free(data);
+		page_frag_free_va(data);
 		break;
 	case MEM_TYPE_PAGE_ORDER0:
 		page = virt_to_page(data); /* Assumes order0 page*/
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c3913d8a50d3..dccb0353ee84 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -33,8 +33,8 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
 
 	data_align = umax(data_align, L1_CACHE_BYTES);
 	mutex_lock(&call->conn->tx_data_alloc_lock);
-	buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp,
-				    data_align);
+	buf = page_frag_alloc_va_align(&call->conn->tx_data_alloc, total, gfp,
+				       data_align);
 	mutex_unlock(&call->conn->tx_data_alloc_lock);
 	if (!buf) {
 		kfree(txb);
@@ -96,17 +96,18 @@ struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_s
 	if (!txb)
 		return NULL;
 
-	buf = page_frag_alloc(&call->local->tx_alloc,
-			      sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
+	buf = page_frag_alloc_va(&call->local->tx_alloc,
+				 sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
 	if (!buf) {
 		kfree(txb);
 		return NULL;
 	}
 
 	if (sack_size) {
-		buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
+		buf2 = page_frag_alloc_va(&call->local->tx_alloc, sack_size,
+					  gfp);
 		if (!buf2) {
-			page_frag_free(buf);
+			page_frag_free_va(buf);
 			kfree(txb);
 			return NULL;
 		}
@@ -180,7 +181,7 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
 			  rxrpc_txbuf_free);
 	for (i = 0; i < txb->nr_kvec; i++)
 		if (txb->kvec[i].iov_base)
-			page_frag_free(txb->kvec[i].iov_base);
+			page_frag_free_va(txb->kvec[i].iov_base);
 	kfree(txb);
 	atomic_dec(&rxrpc_nr_txbuf);
 }
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 6b3f01beb294..42d20412c1c3 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1222,8 +1222,8 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
 	/* The stream record marker is copied into a temporary page
 	 * fragment buffer so that it can be included in rq_bvec.
 	 */
-	buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
-			      GFP_KERNEL);
+	buf = page_frag_alloc_va(&svsk->sk_frag_cache, sizeof(marker),
+				 GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 	memcpy(buf, &marker, sizeof(marker));
@@ -1235,7 +1235,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
 	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
 		      1 + count, sizeof(marker) + rqstp->rq_res.len);
 	ret = sock_sendmsg(svsk->sk_sock, &msg);
-	page_frag_free(buf);
+	page_frag_free_va(buf);
 	if (ret < 0)
 		return ret;
 	*sentp += ret;
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (3 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 23:01   ` Alexander H Duyck
  2024-07-19  9:33 ` [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Michael S. Tsirkin, Jason Wang, Eugenio Pérez, Andrew Morton,
	Eric Dumazet, David Howells, Marc Dionne, Chuck Lever,
	Jeff Layton, Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, kvm, virtualization, linux-mm,
	linux-afs, linux-nfs

Use the appropriate page_frag API instead of the caller
accessing 'page_frag_cache' directly.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/vhost/net.c             |  2 +-
 include/linux/page_frag_cache.h | 10 ++++++++++
 mm/page_frag_test.c             |  2 +-
 net/core/skbuff.c               |  6 +++---
 net/rxrpc/conn_object.c         |  4 +---
 net/rxrpc/local_object.c        |  4 +---
 net/sunrpc/svcsock.c            |  6 ++----
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 			vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
-	n->pf_cache.va = NULL;
+	page_frag_cache_init(&n->pf_cache);
 
 	return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 4c5079f232b5..ef1572f11248 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -8,6 +8,16 @@
 #include <linux/mm_types_task.h>
 #include <asm/page.h>
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+	nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+	return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 9eaa3ab74b29..6df8d8865afe 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -344,7 +344,7 @@ static int __init page_frag_test_init(void)
 	u64 duration;
 	int ret;
 
-	test_frag.va = NULL;
+	page_frag_cache_init(&test_frag);
 	atomic_set(&nthreads, 2);
 	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4b8acd967793..76a473b1072d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (in_hardirq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc_va(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 	} else {
 		local_bh_disable();
 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
 		nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc_va(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 		local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		len = SKB_HEAD_ALIGN(len);
 
 		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-		pfmemalloc = nc->page.pfmemalloc;
+		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
 	}
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 	 */
 	rxrpc_purge_queue(&conn->rx_queue);
 
-	if (conn->tx_data_alloc.va)
-		__page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-					conn->tx_data_alloc.pagecnt_bias);
+	page_frag_cache_drain(&conn->tx_data_alloc);
 	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
 	rxrpc_purge_queue(&local->rx_queue);
 	rxrpc_purge_client_connections(local);
-	if (local->tx_alloc.va)
-		__page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-					local->tx_alloc.pagecnt_bias);
+	page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
 	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-	struct page_frag_cache *pfc = &svsk->sk_frag_cache;
 	struct socket *sock = svsk->sk_sock;
 
 	trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
 		sockfd_put(sock);
 	else
 		sock_release(sock);
-	if (pfc->va)
-		__page_frag_cache_drain(virt_to_head_page(pfc->va),
-					pfc->pagecnt_bias);
+
+	page_frag_cache_drain(&svsk->sk_frag_cache);
 	kfree(svsk);
 }
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (4 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 22:59   ` Alexander H Duyck
  2024-07-19  9:33 ` [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Currently there is one 'struct page_frag' for every 'struct
sock' and 'struct task_struct'; we are about to replace the
'struct page_frag' with 'struct page_frag_cache' for them.
Before beginning the replacement, we need to ensure the size
of 'struct page_frag_cache' is not bigger than the size of
'struct page_frag', as there may be tens of thousands of
'struct sock' and 'struct task_struct' instances in the
system.

By or'ing the page order & pfmemalloc with the lower bits of
'va' instead of using 'u16' or 'u32' for the page size and 'u8'
for pfmemalloc, we are able to avoid wasting 3 or 5 bytes of
space. And since the page address, pfmemalloc flag and order
are unchanged for the same page in the same 'page_frag_cache'
instance, it makes sense to fit them together.

After this patch, the size of 'struct page_frag_cache' should be
the same as the size of 'struct page_frag'.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/mm_types_task.h   | 16 +++++------
 include/linux/page_frag_cache.h | 49 +++++++++++++++++++++++++++++++--
 mm/page_frag_cache.c            | 49 +++++++++++++++------------------
 3 files changed, 77 insertions(+), 37 deletions(-)

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index b1c54b2b9308..f2610112a642 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -50,18 +50,18 @@ struct page_frag {
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 struct page_frag_cache {
-	void *va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* encoded_va consists of the virtual address, pfmemalloc bit and order
+	 * of a page.
+	 */
+	unsigned long encoded_va;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
 	__u16 remaining;
-	__u16 size;
+	__u16 pagecnt_bias;
 #else
 	__u32 remaining;
+	__u32 pagecnt_bias;
 #endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
 };
 
 /* Track pages that require TLB flushes */
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index ef1572f11248..12a16f8e8ad0 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -3,19 +3,64 @@
 #ifndef _LINUX_PAGE_FRAG_CACHE_H
 #define _LINUX_PAGE_FRAG_CACHE_H
 
+#include <linux/bits.h>
+#include <linux/build_bug.h>
 #include <linux/log2.h>
 #include <linux/types.h>
 #include <linux/mm_types_task.h>
 #include <asm/page.h>
 
+#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
+
+static inline unsigned long encode_aligned_va(void *va, unsigned int order,
+					      bool pfmemalloc)
+{
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT >= PAGE_SHIFT);
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	return (unsigned long)va | order |
+		(pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
+#else
+	return (unsigned long)va |
+		(pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
+#endif
+}
+
+static inline unsigned long encoded_page_order(unsigned long encoded_va)
+{
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	return encoded_va & PAGE_FRAG_CACHE_ORDER_MASK;
+#else
+	return 0;
+#endif
+}
+
+static inline bool encoded_page_pfmemalloc(unsigned long encoded_va)
+{
+	return encoded_va & PAGE_FRAG_CACHE_PFMEMALLOC_BIT;
+}
+
+static inline void *encoded_page_address(unsigned long encoded_va)
+{
+	return (void *)(encoded_va & PAGE_MASK);
+}
+
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->va = NULL;
+	nc->encoded_va = 0;
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
-	return !!nc->pfmemalloc;
+	return encoded_page_pfmemalloc(nc->encoded_va);
+}
+
+static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
+{
+	return PAGE_SIZE << encoded_page_order(encoded_va);
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index b12496f05c4a..7928e5d50711 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -22,7 +22,7 @@
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
-	unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
+	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
@@ -35,28 +35,27 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 		if (unlikely(!page)) {
-			nc->va = NULL;
+			nc->encoded_va = 0;
 			return NULL;
 		}
 
-		page_size = PAGE_SIZE;
+		order = 0;
 	}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	nc->size = page_size;
-#endif
-	nc->va = page_address(page);
+	nc->encoded_va = encode_aligned_va(page_address(page), order,
+					   page_is_pfmemalloc(page));
 
 	return page;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
-	if (!nc->va)
+	if (!nc->encoded_va)
 		return;
 
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
+	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
+				nc->pagecnt_bias);
+	nc->encoded_va = 0;
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -73,36 +72,30 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
-	unsigned int remaining;
+	unsigned long encoded_va = nc->encoded_va;
+	unsigned int size, remaining;
 	struct page *page;
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!encoded_va)) {
 refill:
 		page = __page_frag_cache_refill(nc, gfp_mask);
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
+		encoded_va = nc->encoded_va;
+		size = page_frag_cache_page_size(encoded_va);
+
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
 		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and remaining to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->remaining = size;
 	}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	/* if size can vary use size else just use PAGE_SIZE */
-	size = nc->size;
-#endif
-
+	size = page_frag_cache_page_size(encoded_va);
 	remaining = nc->remaining & align_mask;
 	if (unlikely(remaining < fragsz)) {
 		if (unlikely(fragsz > PAGE_SIZE)) {
@@ -118,13 +111,15 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 			return NULL;
 		}
 
-		page = virt_to_page(nc->va);
+		page = virt_to_page((void *)encoded_va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
+		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+			VM_BUG_ON(compound_order(page) !=
+				  encoded_page_order(encoded_va));
+			free_unref_page(page, encoded_page_order(encoded_va));
 			goto refill;
 		}
 
@@ -141,7 +136,7 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 	nc->pagecnt_bias--;
 	nc->remaining = remaining - fragsz;
 
-	return nc->va + (size - remaining);
+	return encoded_page_address(encoded_va) + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (5 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 23:40   ` Alexander H Duyck
  2024-07-19  9:33 ` [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

Refactor common codes from __page_frag_alloc_va_align()
to __page_frag_cache_refill(), so that the new API can
make use of them.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  2 +-
 mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
 2 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 12a16f8e8ad0..5aa45de7a9a5 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 7928e5d50711..d9c9cad17af7 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -19,6 +19,28 @@
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
+static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
+{
+	unsigned long encoded_va = nc->encoded_va;
+	struct page *page;
+
+	page = virt_to_page((void *)encoded_va);
+	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+		return NULL;
+
+	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+		VM_BUG_ON(compound_order(page) !=
+			  encoded_page_order(encoded_va));
+		free_unref_page(page, encoded_page_order(encoded_va));
+		return NULL;
+	}
+
+	/* OK, page count is 0, we can safely set it */
+	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+	return page;
+}
+
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
@@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
+	if (likely(nc->encoded_va)) {
+		page = __page_frag_cache_recharge(nc);
+		if (page) {
+			order = encoded_page_order(nc->encoded_va);
+			goto out;
+		}
+	}
+
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
@@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 		if (unlikely(!page)) {
-			nc->encoded_va = 0;
+			memset(nc, 0, sizeof(*nc));
 			return NULL;
 		}
 
@@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	nc->encoded_va = encode_aligned_va(page_address(page), order,
 					   page_is_pfmemalloc(page));
 
+	/* Even if we own the page, we do not use atomic_set().
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+out:
+	/* reset page count bias and remaining to start of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->remaining = PAGE_SIZE << order;
+
 	return page;
 }
 
@@ -55,7 +95,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
 
 	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
 				nc->pagecnt_bias);
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -72,31 +112,9 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	unsigned long encoded_va = nc->encoded_va;
-	unsigned int size, remaining;
-	struct page *page;
-
-	if (unlikely(!encoded_va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-		encoded_va = nc->encoded_va;
-		size = page_frag_cache_page_size(encoded_va);
+	unsigned int size = page_frag_cache_page_size(nc->encoded_va);
+	unsigned int remaining = nc->remaining & align_mask;
 
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = size;
-	}
-
-	size = page_frag_cache_page_size(encoded_va);
-	remaining = nc->remaining & align_mask;
 	if (unlikely(remaining < fragsz)) {
 		if (unlikely(fragsz > PAGE_SIZE)) {
 			/*
@@ -111,32 +129,17 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 			return NULL;
 		}
 
-		page = virt_to_page((void *)encoded_va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
-			VM_BUG_ON(compound_order(page) !=
-				  encoded_page_order(encoded_va));
-			free_unref_page(page, encoded_page_order(encoded_va));
-			goto refill;
-		}
-
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = size;
+		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
+			return NULL;
 
+		size = page_frag_cache_page_size(nc->encoded_va);
 		remaining = size;
 	}
 
 	nc->pagecnt_bias--;
 	nc->remaining = remaining - fragsz;
 
-	return encoded_page_address(encoded_va) + (size - remaining);
+	return encoded_page_address(nc->encoded_va) + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (6 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-21 21:41   ` Alexander H Duyck
  2024-07-19  9:33 ` [RFC v11 11/14] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

There are more new APIs calling __page_frag_cache_refill() in
this patchset, which may cause the compiler to no longer be able
to inline __page_frag_cache_refill() into
__page_frag_alloc_va_align().

Not being able to do the inlining seems to cause some noticeable
performance degradation on an arm64 system with 64K PAGE_SIZE
after adding the new API calling __page_frag_cache_refill().

It seems there is about a 24-byte binary size increase for
__page_frag_cache_refill() and __page_frag_alloc_va_align() on an
arm64 system with 64K PAGE_SIZE. By doing the gdb disassembling,
it seems we can have more than a 100-byte decrease in the binary
size by using __alloc_pages() to replace alloc_pages_node(), as
there seems to be some unnecessary checking for nid being
NUMA_NO_NODE, especially when page_frag is still part of the mm
system.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index d9c9cad17af7..3f162e9d23ba 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -59,11 +59,11 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
+	page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER,
+			     numa_mem_id(), NULL);
 #endif
 	if (unlikely(!page)) {
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
 		if (unlikely(!page)) {
 			memset(nc, 0, sizeof(*nc));
 			return NULL;
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 11/14] mm: page_frag: introduce prepare/probe/commit API
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (7 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
  2024-07-19  9:33 ` [RFC v11 13/14] mm: page_frag: update documentation for page_frag Yunsheng Lin
       [not found] ` <CAKgT0UcGvrS7=r0OCGZipzBv8RuwYtRwb2QDXqiF4qW5CNws4g@mail.gmail.com>
  10 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Andrew Morton, linux-mm

There are many use cases that need a minimum amount of memory in
order to make forward progress, but are more performant if more
memory is available, or that need to probe the cache info so that
any available memory can be used for frag coalescing.

Currently the skb_page_frag_refill() API is used to solve the
above use cases, but the caller needs to know about the internal
details and access the data fields of 'struct page_frag' to meet
the requirements of the above use cases, and its implementation
is similar to the one in the mm subsystem.

To unify those two page_frag implementations, introduce a
prepare API to ensure a minimum amount of memory is available,
returning how much memory is actually available to the caller,
and a probe API to report the currently available memory to the
caller without doing cache refilling. The caller needs to either
call the commit API to report how much memory it actually used,
or not do so if it decides not to use any memory.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  76 +++++++++++++++++++++
 mm/page_frag_cache.c            | 114 ++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+)

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 5aa45de7a9a5..1242bfb53ccc 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,8 @@
 #include <linux/build_bug.h>
 #include <linux/log2.h>
 #include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmdebug.h>
 #include <linux/mm_types_task.h>
 #include <asm/page.h>
 
@@ -65,6 +67,9 @@ static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask);
@@ -77,12 +82,83 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
+static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
+{
+	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
+}
+
 static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 				       unsigned int fragsz, gfp_t gfp_mask)
 {
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
 }
 
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
+				 gfp_t gfp);
+
+static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
+						     unsigned int *fragsz,
+						     gfp_t gfp,
+						     unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	nc->remaining = nc->remaining & -align;
+	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
+}
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp);
+
+static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
+						 unsigned int *offset,
+						 unsigned int *fragsz,
+						 void **va)
+{
+	unsigned long encoded_va;
+	struct page *page;
+
+	VM_BUG_ON(!*fragsz);
+	if (unlikely(nc->remaining < *fragsz))
+		return NULL;
+
+	*fragsz = nc->remaining;
+	encoded_va = nc->encoded_va;
+	*va = encoded_page_address(encoded_va);
+	page = virt_to_page(*va);
+	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
+	*va += *offset;
+
+	return page;
+}
+
+static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
+					  unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
+	nc->pagecnt_bias--;
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
+						unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining);
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
+					 unsigned int fragsz)
+{
+	nc->pagecnt_bias++;
+	nc->remaining += fragsz;
+}
+
 void page_frag_free_va(void *addr);
 
 #endif
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 3f162e9d23ba..74d2a13d12a8 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -88,6 +88,120 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	return page;
 }
 
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
+				 unsigned int *fragsz, gfp_t gfp)
+{
+	unsigned long encoded_va;
+	unsigned int remaining;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(!__page_frag_cache_refill(nc, gfp) ||
+			     *fragsz > PAGE_SIZE))
+			return NULL;
+
+		remaining = nc->remaining;
+	}
+
+	encoded_va = nc->encoded_va;
+	*fragsz = remaining;
+	return encoded_page_address(encoded_va) +
+			page_frag_cache_page_size(encoded_va) - remaining;
+}
+EXPORT_SYMBOL(page_frag_alloc_va_prepare);
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp)
+{
+	unsigned long encoded_va;
+	unsigned int remaining;
+	struct page *page;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(*fragsz > PAGE_SIZE)) {
+			*fragsz = 0;
+			return NULL;
+		}
+
+		page = __page_frag_cache_refill(nc, gfp);
+		remaining = nc->remaining;
+		encoded_va = nc->encoded_va;
+	} else {
+		encoded_va = nc->encoded_va;
+		page = virt_to_page((void *)encoded_va);
+	}
+
+	*offset = page_frag_cache_page_size(encoded_va) - remaining;
+	*fragsz = remaining;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp)
+{
+	unsigned long encoded_va;
+	unsigned int remaining;
+	struct page *page;
+
+	remaining = nc->remaining;
+	if (unlikely(*fragsz > remaining)) {
+		if (unlikely(*fragsz > PAGE_SIZE)) {
+			*fragsz = 0;
+			return NULL;
+		}
+
+		page = __page_frag_cache_refill(nc, gfp);
+		remaining = nc->remaining;
+		encoded_va = nc->encoded_va;
+	} else {
+		encoded_va = nc->encoded_va;
+		page = virt_to_page((void *)encoded_va);
+	}
+
+	*offset = page_frag_cache_page_size(encoded_va) - remaining;
+	*fragsz = remaining;
+	*va = encoded_page_address(encoded_va) + *offset;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_prepare);
+
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp)
+{
+	struct page *page;
+
+	if (unlikely(fragsz > nc->remaining)) {
+		if (unlikely(fragsz > PAGE_SIZE))
+			return NULL;
+
+		page = __page_frag_cache_refill(nc, gfp);
+		if (unlikely(!page))
+			return NULL;
+
+		*offset = 0;
+	} else {
+		unsigned long encoded_va = nc->encoded_va;
+
+		page = virt_to_page((void *)encoded_va);
+		*offset = page_frag_cache_page_size(encoded_va) -
+					nc->remaining;
+	}
+
+	nc->remaining -= fragsz;
+	nc->pagecnt_bias--;
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_pg);
+
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
 	if (!nc->encoded_va)
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [RFC v11 13/14] mm: page_frag: update documentation for page_frag
       [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
                   ` (8 preceding siblings ...)
  2024-07-19  9:33 ` [RFC v11 11/14] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
@ 2024-07-19  9:33 ` Yunsheng Lin
       [not found] ` <CAKgT0UcGvrS7=r0OCGZipzBv8RuwYtRwb2QDXqiF4qW5CNws4g@mail.gmail.com>
  10 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-19  9:33 UTC (permalink / raw)
  To: davem, kuba, pabeni
  Cc: netdev, linux-kernel, Yunsheng Lin, Alexander Duyck,
	Jonathan Corbet, Andrew Morton, linux-mm, linux-doc

Update documentation about design, implementation and API usages
for page_frag.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 Documentation/mm/page_frags.rst | 163 +++++++++++++++++++++++++++++++-
 include/linux/page_frag_cache.h | 107 +++++++++++++++++++++
 mm/page_frag_cache.c            |  77 ++++++++++++++-
 3 files changed, 344 insertions(+), 3 deletions(-)

diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
index 503ca6cdb804..6a4ac2616098 100644
--- a/Documentation/mm/page_frags.rst
+++ b/Documentation/mm/page_frags.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
 ==============
 Page fragments
 ==============
@@ -40,4 +42,163 @@ page via a single call.  The advantage to doing this is that it allows for
 cleaning up the multiple references that were added to a page in order to
 avoid calling get_page per allocation.
 
-Alexander Duyck, Nov 29, 2016.
+
+Architecture overview
+=====================
+
+.. code-block:: none
+
+                      +----------------------+
+                      | page_frag API caller |
+                      +----------------------+
+                                  |
+                                  |
+                                  v
+    +---------------------------------------------------------------+
+    |                   request page fragment                       |
+    +---------------------------------------------------------------+
+             |                                 |                  |
+             |                                 |                  |
+             |                          Cache not enough          |
+             |                                 |                  |
+             |                                 v                  |
+        Cache empty                   +-----------------+         |
+             |                        | drain old cache |         |
+             |                        +-----------------+         |
+             |                                 |                  |
+             v_________________________________v                  |
+                              |                                   |
+                              |                                   |
+             _________________v_______________                    |
+            |                                 |            Cache is enough
+            |                                 |                   |
+ PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE         |                   |
+            |                                 |                   |
+            |               PAGE_SIZE >= PAGE_FRAG_CACHE_MAX_SIZE |
+            v                                 |                   |
+    +----------------------------------+      |                   |
+    | refill cache with order > 0 page |      |                   |
+    +----------------------------------+      |                   |
+      |                    |                  |                   |
+      |                    |                  |                   |
+      |              Refill failed            |                   |
+      |                    |                  |                   |
+      |                    v                  v                   |
+      |      +------------------------------------+               |
+      |      |   refill cache with order 0 page   |               |
+      |      +------------------------------------+               |
+      |                       |                                   |
+ Refill succeed               |                                   |
+      |                 Refill succeed                            |
+      |                       |                                   |
+      v                       v                                   v
+    +---------------------------------------------------------------+
+    |             allocate fragment from cache                      |
+    +---------------------------------------------------------------+
+
+API interface
+=============
+As the design and implementation of page_frag API implies, the allocation side
+does not allow concurrent calling. Instead it is assumed that the caller must
+ensure there is no concurrent alloc calling to the same page_frag_cache
+instance by using its own lock or rely on some lockless guarantee like NAPI
+softirq.
+
+Depending on different aligning requirement, the page_frag API caller may call
+page_frag_alloc*_align*() to ensure the returned virtual address or offset of
+the page is aligned according to the 'align/alignment' parameter. Note the size
+of the allocated fragment is not aligned, the caller needs to provide an aligned
+fragsz if there is an alignment requirement for the size of the fragment.
+
+Depending on different use cases, callers expecting to deal with va, page or
+both va and page for them may call page_frag_alloc_va*, page_frag_alloc_pg*,
+or page_frag_alloc* API accordingly.
+
+There is also a use case that needs minimum memory in order for forward progress,
+but more performant if more memory is available. Using page_frag_alloc_prepare()
+and page_frag_alloc_commit() related API, the caller requests the minimum memory
+it needs and the prepare API will return the maximum size of the fragment
+returned. The caller needs to either call the commit API to report how much
+memory it actually uses, or not do so if deciding to not use any memory.
+
+.. kernel-doc:: include/linux/page_frag_cache.h
+   :identifiers: page_frag_cache_init page_frag_cache_is_pfmemalloc
+                 page_frag_cache_page_offset page_frag_alloc_va
+                 page_frag_alloc_va_align page_frag_alloc_va_prepare_align
+                 page_frag_alloc_probe page_frag_alloc_commit
+                 page_frag_alloc_commit_noref page_frag_alloc_abort
+
+.. kernel-doc:: mm/page_frag_cache.c
+   :identifiers: __page_frag_alloc_va_align page_frag_alloc_pg
+                 page_frag_alloc_va_prepare page_frag_alloc_pg_prepare
+                 page_frag_alloc_prepare page_frag_cache_drain
+                 page_frag_free_va
+
+Coding examples
+===============
+
+Init & Drain API
+----------------
+
+.. code-block:: c
+
+   page_frag_cache_init(pfrag);
+   ...
+   page_frag_cache_drain(pfrag);
+
+
+Alloc & Free API
+----------------
+
+.. code-block:: c
+
+    void *va;
+
+    va = page_frag_alloc_va_align(pfrag, size, gfp, align);
+    if (!va)
+        goto do_error;
+
+    err = do_something(va, size);
+    if (err) {
+        page_frag_free_va(va);
+        goto do_error;
+    }
+
+Prepare & Commit API
+--------------------
+
+.. code-block:: c
+
+    unsigned int offset, size;
+    bool merge = true;
+    struct page *page;
+    void *va;
+
+    size = 32U;
+    page = page_frag_alloc_prepare(pfrag, &offset, &size, &va);
+    if (!page)
+        goto wait_for_space;
+
+    copy = min_t(unsigned int, copy, size);
+    if (!skb_can_coalesce(skb, i, page, offset)) {
+        if (i >= max_skb_frags)
+            goto new_segment;
+
+        merge = false;
+    }
+
+    copy = mem_schedule(copy);
+    if (!copy)
+        goto wait_for_space;
+
+    err = copy_from_iter_full_nocache(va, copy, iter);
+    if (err)
+        goto do_error;
+
+    if (merge) {
+        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+        page_frag_alloc_commit_noref(pfrag, offset, copy);
+    } else {
+        skb_fill_page_desc(skb, i, page, offset, copy);
+        page_frag_alloc_commit(pfrag, offset, copy);
+    }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 1242bfb53ccc..6e16fcd57d72 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -50,11 +50,28 @@ static inline void *encoded_page_address(unsigned long encoded_va)
 	return (void *)(encoded_va & PAGE_MASK);
 }
 
+/**
+ * page_frag_cache_init() - Init page_frag cache.
+ * @nc: page_frag cache from which to init
+ *
+ * Inline helper to init the page_frag cache.
+ */
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
 	memset(nc, 0, sizeof(*nc));
 }
 
+/**
+ * page_frag_cache_is_pfmemalloc() - Check for pfmemalloc.
+ * @nc: page_frag cache from which to check
+ *
+ * Used to check if the current page in page_frag cache is pfmemalloc'ed.
+ * It has the same calling context expectation as the alloc API.
+ *
+ * Return:
+ * true if the current page in page_frag cache is pfmemalloc'ed, otherwise
+ * return false.
+ */
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
 	return encoded_page_pfmemalloc(nc->encoded_va);
@@ -74,6 +91,19 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask);
 
+/**
+ * page_frag_alloc_va_align() - Alloc a page fragment with aligning requirement.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache needs to be refilled
+ * @align: the requested aligning requirement for virtual address of fragment
+ *
+ * WARN_ON_ONCE() checking for @align before allocing a page fragment from
+ * page_frag cache with aligning requirement.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 					     unsigned int fragsz,
 					     gfp_t gfp_mask, unsigned int align)
@@ -82,11 +112,32 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
+/**
+ * page_frag_cache_page_offset() - Return the current page fragment's offset.
+ * @nc: page_frag cache from which to check
+ *
+ * The API is only used in net/sched/em_meta.c for historical reason, do not use
+ * it for new caller unless there is a strong reason.
+ *
+ * Return:
+ * the offset of the current page fragment in the page_frag cache.
+ */
 static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
 {
 	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
 }
 
+/**
+ * page_frag_alloc_va() - Alloc a page fragment.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache needs to be refilled
+ *
+ * Get a page fragment from page_frag cache.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 				       unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -96,6 +147,21 @@ static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
 				 gfp_t gfp);
 
+/**
+ * page_frag_alloc_va_prepare_align() - Prepare allocating a page fragment with
+ * aligning requirement.
+ * @nc: page_frag cache from which to prepare
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ * @align: the requested aligning requirement
+ *
+ * WARN_ON_ONCE() checking for @align before preparing an aligned page fragment
+ * with minimum size of @fragsz, @fragsz is also used to report the maximum size
+ * of the page fragment the caller can use.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
 						     unsigned int *fragsz,
 						     gfp_t gfp,
@@ -115,6 +181,21 @@ struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
 				     unsigned int *fragsz,
 				     void **va, gfp_t gfp);
 
+/**
+ * page_frag_alloc_probe - Probe the available page fragment.
+ * @nc: page_frag cache from which to probe
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @va: out as the virtual address of the returned page fragment
+ *
+ * Probe the current available memory to caller without doing cache refilling.
+ * If no space is available in the page_frag cache, return NULL.
+ * If the requested space is available, up to @fragsz bytes may be added to the
+ * fragment using commit API.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
 						 unsigned int *offset,
 						 unsigned int *fragsz,
@@ -137,6 +218,14 @@ static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
 	return page;
 }
 
+/**
+ * page_frag_alloc_commit - Commit allocating a page fragment.
+ * @nc: page_frag cache from which to commit
+ * @fragsz: size of the page fragment that has been used
+ *
+ * Commit the actual used size for the allocation that was either prepared or
+ * probed.
+ */
 static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
 					  unsigned int fragsz)
 {
@@ -145,6 +234,16 @@ static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
 	nc->remaining -= fragsz;
 }
 
+/**
+ * page_frag_alloc_commit_noref - Commit allocating a page fragment without
+ * taking page refcount.
+ * @nc: page_frag cache from which to commit
+ * @fragsz: size of the page fragment that has been used
+ *
+ * Commit the alloc preparing or probing by passing the actual used size, but
+ * not taking refcount. Mostly used for the fragment coalescing case when the
+ * current fragment can share the same refcount with previous fragment.
+ */
 static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
 						unsigned int fragsz)
 {
@@ -152,6 +251,14 @@ static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
 	nc->remaining -= fragsz;
 }
 
+/**
+ * page_frag_alloc_abort - Abort the page fragment allocation.
+ * @nc: page_frag cache to which the page fragment is aborted back
+ * @fragsz: size of the page fragment to be aborted
+ *
+ * It is expected to be called from the same context as the alloc API.
+ * Mostly used for error handling cases where the fragment is no longer needed.
+ */
 static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
 					 unsigned int fragsz)
 {
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 74d2a13d12a8..80e16f399f6f 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -88,6 +88,18 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	return page;
 }
 
+/**
+ * page_frag_alloc_va_prepare() - Prepare allocating a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment the caller can use.
+ *
+ * Return:
+ * virtual address of the page fragment, otherwise return NULL.
+ */
 void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
 				 unsigned int *fragsz, gfp_t gfp)
 {
@@ -110,6 +122,19 @@ void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_va_prepare);
 
+/**
+ * page_frag_alloc_pg_prepare - Prepare allocating a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment the caller can use.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
 					unsigned int *offset,
 					unsigned int *fragsz, gfp_t gfp)
@@ -140,6 +165,21 @@ struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
 
+/**
+ * page_frag_alloc_prepare - Prepare allocating a page fragment.
+ * @nc: page_frag cache from which to prepare
+ * @offset: out as the offset of the page fragment
+ * @fragsz: in as the requested size, out as the available size
+ * @va: out as the virtual address of the returned page fragment
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Prepare a page fragment with minimum size of @fragsz, @fragsz is also used
+ * to report the maximum size of the page fragment. Return both 'struct page'
+ * and virtual address of the fragment to the caller.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
 				     unsigned int *offset,
 				     unsigned int *fragsz,
@@ -172,6 +212,18 @@ struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_prepare);
 
+/**
+ * page_frag_alloc_pg - Allocate a page fragment.
+ * @nc: page_frag cache from which to allocate
+ * @offset: out as the offset of the page fragment
+ * @fragsz: the requested fragment size
+ * @gfp: the allocation gfp to use when cache needs to be refilled
+ *
+ * Get a page fragment from page_frag cache.
+ *
+ * Return:
+ * the page fragment, otherwise return NULL.
+ */
 struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
 				unsigned int *offset, unsigned int fragsz,
 				gfp_t gfp)
@@ -202,6 +254,10 @@ struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(page_frag_alloc_pg);
 
+/**
+ * page_frag_cache_drain - Drain the current page from page_frag cache.
+ * @nc: page_frag cache from which to drain
+ */
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
 	if (!nc->encoded_va)
@@ -222,6 +278,19 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
+/**
+ * __page_frag_alloc_va_align() - Alloc a page fragment with aligning
+ * requirement.
+ * @nc: page_frag cache from which to allocate
+ * @fragsz: the requested fragment size
+ * @gfp_mask: the allocation gfp to use when cache needs to be refilled
+ * @align_mask: the requested aligning requirement for the 'va'
+ *
+ * Get a page fragment from page_frag cache with aligning requirement.
+ *
+ * Return:
+ * Return va of the page fragment, otherwise return NULL.
+ */
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
@@ -257,8 +326,12 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);
 
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
+/**
+ * page_frag_free_va - Free a page fragment.
+ * @addr: va of page fragment to be freed
+ *
+ * Free a page fragment allocated out of either a compound or order 0 page by
+ * virtual address.
  */
 void page_frag_free_va(void *addr)
 {
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [RFC v11 01/14] mm: page_frag: add a test module for page_frag
  2024-07-19  9:33 ` [RFC v11 01/14] mm: page_frag: add a test module for page_frag Yunsheng Lin
@ 2024-07-21 17:34   ` Alexander Duyck
  2024-07-23 13:19     ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Duyck @ 2024-07-21 17:34 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Basing on the lib/objpool.c, change it to something like a
> ptrpool, so that we can utilize that to test the correctness
> and performance of the page_frag.
>
> The testing is done by ensuring that the fragment allocated
> from a frag_frag_cache instance is pushed into a ptrpool
> instance in a kthread binded to a specified cpu, and a kthread
> binded to a specified cpu will pop the fragment from the
> ptrpool and free the fragment.
>
> We may refactor out the common part between objpool and ptrpool
> if this ptrpool thing turns out to be helpful for other place.
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  mm/Kconfig.debug    |   8 +
>  mm/Makefile         |   1 +
>  mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 402 insertions(+)
>  create mode 100644 mm/page_frag_test.c

I might have missed it somewhere. Is there any reason why this isn't
in the selftests/mm/ directory? Seems like that would be a better fit
for this.

> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
> index afc72fde0f03..1ebcd45f47d4 100644
> --- a/mm/Kconfig.debug
> +++ b/mm/Kconfig.debug
> @@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
>           kernel code.  However the runtime performance overhead is virtually
>           nil until the tracepoints are actually enabled.
>
> +config DEBUG_PAGE_FRAG_TEST

This isn't a "DEBUG" feature. This is a test feature.

> +       tristate "Test module for page_frag"
> +       default n
> +       depends on m && DEBUG_KERNEL

I am not sure it is valid to have a tristate depend on being built as a module.

I think if you can set it up as a selftest it will have broader use as
you could compile it against any target kernel going forward and add
it as a module rather than having to build it as a part of a debug
kernel.

> +       help
> +         This builds the "page_frag_test" module that is used to test the
> +         correctness and performance of page_frag's implementation.
> +
>  config DEBUG_RODATA_TEST
>      bool "Testcase for the marking rodata read-only"
>      depends on STRICT_KERNEL_RWX
> diff --git a/mm/Makefile b/mm/Makefile
> index 8fb85acda1b1..29d9f7618a33 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -106,6 +106,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>  obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
> +obj-$(CONFIG_DEBUG_PAGE_FRAG_TEST) += page_frag_test.o
>  obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
>  obj-$(CONFIG_PAGE_OWNER) += page_owner.o
>  obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
> new file mode 100644
> index 000000000000..cf2691f60b67
> --- /dev/null
> +++ b/mm/page_frag_test.c
> @@ -0,0 +1,393 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Test module for page_frag cache
> + *
> + * Copyright: linyunsheng@huawei.com
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <linux/atomic.h>
> +#include <linux/irqflags.h>
> +#include <linux/cpumask.h>
> +#include <linux/log2.h>
> +#include <linux/completion.h>
> +#include <linux/kthread.h>
> +
> +#define OBJPOOL_NR_OBJECT_MAX  BIT(24)
> +
> +struct objpool_slot {
> +       u32 head;
> +       u32 tail;
> +       u32 last;
> +       u32 mask;
> +       void *entries[];
> +} __packed;
> +
> +struct objpool_head {
> +       int nr_cpus;
> +       int capacity;
> +       struct objpool_slot **cpu_slots;
> +};
> +
> +/* initialize percpu objpool_slot */
> +static void objpool_init_percpu_slot(struct objpool_head *pool,
> +                                    struct objpool_slot *slot)
> +{
> +       /* initialize elements of percpu objpool_slot */
> +       slot->mask = pool->capacity - 1;
> +}
> +
> +/* allocate and initialize percpu slots */
> +static int objpool_init_percpu_slots(struct objpool_head *pool,
> +                                    int nr_objs, gfp_t gfp)
> +{
> +       int i;
> +
> +       for (i = 0; i < pool->nr_cpus; i++) {
> +               struct objpool_slot *slot;
> +               int size;
> +
> +               /* skip the cpu node which could never be present */
> +               if (!cpu_possible(i))
> +                       continue;
> +
> +               size = struct_size(slot, entries, pool->capacity);
> +
> +               /*
> +                * here we allocate percpu-slot & objs together in a single
> +                * allocation to make it more compact, taking advantage of
> +                * warm caches and TLB hits. in default vmalloc is used to
> +                * reduce the pressure of kernel slab system. as we know,
> +                * minimal size of vmalloc is one page since vmalloc would
> +                * always align the requested size to page size
> +                */
> +               if (gfp & GFP_ATOMIC)
> +                       slot = kmalloc_node(size, gfp, cpu_to_node(i));
> +               else
> +                       slot = __vmalloc_node(size, sizeof(void *), gfp,
> +                                             cpu_to_node(i),
> +                                             __builtin_return_address(0));

When would anyone ever call this with atomic? This is just for your
test isn't it?

> +               if (!slot)
> +                       return -ENOMEM;
> +
> +               memset(slot, 0, size);
> +               pool->cpu_slots[i] = slot;
> +
> +               objpool_init_percpu_slot(pool, slot);
> +       }
> +
> +       return 0;
> +}
> +
> +/* cleanup all percpu slots of the object pool */
> +static void objpool_fini_percpu_slots(struct objpool_head *pool)
> +{
> +       int i;
> +
> +       if (!pool->cpu_slots)
> +               return;
> +
> +       for (i = 0; i < pool->nr_cpus; i++)
> +               kvfree(pool->cpu_slots[i]);
> +       kfree(pool->cpu_slots);
> +}
> +
> +/* initialize object pool and pre-allocate objects */
> +static int objpool_init(struct objpool_head *pool, int nr_objs, gfp_t gfp)
> +{
> +       int rc, capacity, slot_size;
> +
> +       /* check input parameters */
> +       if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX)
> +               return -EINVAL;
> +
> +       /* calculate capacity of percpu objpool_slot */
> +       capacity = roundup_pow_of_two(nr_objs);
> +       if (!capacity)
> +               return -EINVAL;
> +
> +       gfp = gfp & ~__GFP_ZERO;
> +
> +       /* initialize objpool pool */
> +       memset(pool, 0, sizeof(struct objpool_head));
> +       pool->nr_cpus = nr_cpu_ids;
> +       pool->capacity = capacity;
> +       slot_size = pool->nr_cpus * sizeof(struct objpool_slot *);
> +       pool->cpu_slots = kzalloc(slot_size, gfp);
> +       if (!pool->cpu_slots)
> +               return -ENOMEM;
> +
> +       /* initialize per-cpu slots */
> +       rc = objpool_init_percpu_slots(pool, nr_objs, gfp);
> +       if (rc)
> +               objpool_fini_percpu_slots(pool);
> +
> +       return rc;
> +}
> +
> +/* adding object to slot, abort if the slot was already full */
> +static int objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
> +{
> +       struct objpool_slot *slot = pool->cpu_slots[cpu];
> +       u32 head, tail;
> +
> +       /* loading tail and head as a local snapshot, tail first */
> +       tail = READ_ONCE(slot->tail);
> +
> +       do {
> +               head = READ_ONCE(slot->head);
> +               /* fault caught: something must be wrong */
> +               if (unlikely(tail - head >= pool->capacity))
> +                       return -ENOSPC;
> +       } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
> +
> +       /* now the tail position is reserved for the given obj */
> +       WRITE_ONCE(slot->entries[tail & slot->mask], obj);
> +       /* update sequence to make this obj available for pop() */
> +       smp_store_release(&slot->last, tail + 1);
> +
> +       return 0;
> +}
> +
> +/* reclaim an object to object pool */
> +static int objpool_push(void *obj, struct objpool_head *pool)
> +{
> +       unsigned long flags;
> +       int rc;
> +
> +       /* disable local irq to avoid preemption & interruption */
> +       raw_local_irq_save(flags);
> +       rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
> +       raw_local_irq_restore(flags);
> +
> +       return rc;
> +}
> +
> +/* try to retrieve object from slot */
> +static void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
> +{
> +       struct objpool_slot *slot = pool->cpu_slots[cpu];
> +       /* load head snapshot, other cpus may change it */
> +       u32 head = smp_load_acquire(&slot->head);
> +
> +       while (head != READ_ONCE(slot->last)) {
> +               void *obj;
> +
> +               /*
> +                * data visibility of 'last' and 'head' could be out of
> +                * order since memory updating of 'last' and 'head' are
> +                * performed in push() and pop() independently
> +                *
> +                * before any retrieving attempts, pop() must guarantee
> +                * 'last' is behind 'head', that is to say, there must
> +                * be available objects in slot, which could be ensured
> +                * by condition 'last != head && last - head <= nr_objs'
> +                * that is equivalent to 'last - head - 1 < nr_objs' as
> +                * 'last' and 'head' are both unsigned int32
> +                */
> +               if (READ_ONCE(slot->last) - head - 1 >= pool->capacity) {
> +                       head = READ_ONCE(slot->head);
> +                       continue;
> +               }
> +
> +               /* obj must be retrieved before moving forward head */
> +               obj = READ_ONCE(slot->entries[head & slot->mask]);
> +
> +               /* move head forward to mark it's consumption */
> +               if (try_cmpxchg_release(&slot->head, &head, head + 1))
> +                       return obj;
> +       }
> +
> +       return NULL;
> +}
> +
> +/* allocate an object from object pool */
> +static void *objpool_pop(struct objpool_head *pool)
> +{
> +       void *obj = NULL;
> +       unsigned long flags;
> +       int i, cpu;
> +
> +       /* disable local irq to avoid preemption & interruption */
> +       raw_local_irq_save(flags);
> +
> +       cpu = raw_smp_processor_id();
> +       for (i = 0; i < num_possible_cpus(); i++) {
> +               obj = objpool_try_get_slot(pool, cpu);
> +               if (obj)
> +                       break;
> +               cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
> +       }
> +       raw_local_irq_restore(flags);
> +
> +       return obj;
> +}
> +
> +/* release whole objpool forcely */
> +static void objpool_free(struct objpool_head *pool)
> +{
> +       if (!pool->cpu_slots)
> +               return;
> +
> +       /* release percpu slots */
> +       objpool_fini_percpu_slots(pool);
> +}
> +

Why add all this extra objpool overhead? This seems like overkill for
what should be a simple test. Seems like you should just need a simple
array located on one of your CPUs. I'm not sure what is with all the
extra overhead being added here.

> +static struct objpool_head ptr_pool;
> +static int nr_objs = 512;
> +static atomic_t nthreads;
> +static struct completion wait;
> +static struct page_frag_cache test_frag;
> +
> +static int nr_test = 5120000;
> +module_param(nr_test, int, 0);
> +MODULE_PARM_DESC(nr_test, "number of iterations to test");
> +
> +static bool test_align;
> +module_param(test_align, bool, 0);
> +MODULE_PARM_DESC(test_align, "use align API for testing");
> +
> +static int test_alloc_len = 2048;
> +module_param(test_alloc_len, int, 0);
> +MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
> +
> +static int test_push_cpu;
> +module_param(test_push_cpu, int, 0);
> +MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
> +
> +static int test_pop_cpu;
> +module_param(test_pop_cpu, int, 0);
> +MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
> +
> +static int page_frag_pop_thread(void *arg)
> +{
> +       struct objpool_head *pool = arg;
> +       int nr = nr_test;
> +
> +       pr_info("page_frag pop test thread begins on cpu %d\n",
> +               smp_processor_id());
> +
> +       while (nr > 0) {
> +               void *obj = objpool_pop(pool);
> +
> +               if (obj) {
> +                       nr--;
> +                       page_frag_free(obj);
> +               } else {
> +                       cond_resched();
> +               }
> +       }
> +
> +       if (atomic_dec_and_test(&nthreads))
> +               complete(&wait);
> +
> +       pr_info("page_frag pop test thread exits on cpu %d\n",
> +               smp_processor_id());
> +
> +       return 0;
> +}
> +
> +static int page_frag_push_thread(void *arg)
> +{
> +       struct objpool_head *pool = arg;
> +       int nr = nr_test;
> +
> +       pr_info("page_frag push test thread begins on cpu %d\n",
> +               smp_processor_id());
> +
> +       while (nr > 0) {
> +               void *va;
> +               int ret;
> +
> +               if (test_align) {
> +                       va = page_frag_alloc_align(&test_frag, test_alloc_len,
> +                                                  GFP_KERNEL, SMP_CACHE_BYTES);
> +
> +                       WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
> +                                 "unaligned va returned\n");
> +               } else {
> +                       va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
> +               }
> +
> +               if (!va)
> +                       continue;
> +
> +               ret = objpool_push(va, pool);
> +               if (ret) {
> +                       page_frag_free(va);
> +                       cond_resched();
> +               } else {
> +                       nr--;
> +               }
> +       }
> +
> +       pr_info("page_frag push test thread exits on cpu %d\n",
> +               smp_processor_id());
> +
> +       if (atomic_dec_and_test(&nthreads))
> +               complete(&wait);
> +
> +       return 0;
> +}
> +

So looking over these functions they seem to overlook how the network
stack works in many cases. One of the main motivations for the page
frags approach is page recycling. For example with GRO enabled the
headers allocated to record the frags might be freed for all but the
first. As such you can end up with 17 fragments being allocated, and
16 freed within the same thread as NAPI will just be recycling the
buffers.

With this setup it doesn't seem very likely to be triggered since you
are operating in two threads. One test you might want to look at
adding is a test where you are allocating and freeing in the same
thread at a fairly constant rate to test against the "ideal" scenario.

> +static int __init page_frag_test_init(void)
> +{
> +       struct task_struct *tsk_push, *tsk_pop;
> +       ktime_t start;
> +       u64 duration;
> +       int ret;
> +
> +       test_frag.va = NULL;
> +       atomic_set(&nthreads, 2);
> +       init_completion(&wait);
> +
> +       if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0)
> +               return -EINVAL;
> +
> +       ret = objpool_init(&ptr_pool, nr_objs, GFP_KERNEL);
> +       if (ret)
> +               return ret;
> +
> +       tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_pool,
> +                                        test_push_cpu, "page_frag_push");
> +       if (IS_ERR(tsk_push))
> +               return PTR_ERR(tsk_push);
> +
> +       tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_pool,
> +                                       test_pop_cpu, "page_frag_pop");
> +       if (IS_ERR(tsk_pop)) {
> +               kthread_stop(tsk_push);
> +               return PTR_ERR(tsk_pop);
> +       }
> +
> +       start = ktime_get();
> +       wake_up_process(tsk_push);
> +       wake_up_process(tsk_pop);
> +
> +       pr_info("waiting for test to complete\n");
> +       wait_for_completion(&wait);
> +
> +       duration = (u64)ktime_us_delta(ktime_get(), start);
> +       pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
> +               test_align ? "aligned" : "non-aligned", duration);
> +
> +       objpool_free(&ptr_pool);
> +       page_frag_cache_drain(&test_frag);
> +
> +       return -EAGAIN;
> +}
> +
> +static void __exit page_frag_test_exit(void)
> +{
> +}
> +
> +module_init(page_frag_test_init);
> +module_exit(page_frag_test_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
> +MODULE_DESCRIPTION("Test module for page_frag");
> --
> 2.33.0
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file
  2024-07-19  9:33 ` [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
@ 2024-07-21 17:58   ` Alexander Duyck
  2024-07-27 15:04     ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Duyck @ 2024-07-21 17:58 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, David Howells,
	Andrew Morton, linux-mm

On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Inspired by [1], move the page fragment allocator from page_alloc
> into its own c file and header file, as we are about to make more
> change for it to replace another page_frag implementation in
> sock.c
>
> As this patchset is going to replace 'struct page_frag' with
> 'struct page_frag_cache' in sched.h, including page_frag_cache.h
> in sched.h has a compiler error caused by interdependence between
> mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler
> error by moving 'struct page_frag_cache' to mm_types_task.h as
> suggested by Alexander, see [3].
>
> 1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/
> 2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956920@gmail.com/
> 3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=YM_aiA4OomD-2wXykO42bizaWMt_HOA@mail.gmail.com/
> CC: David Howells <dhowells@redhat.com>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/gfp.h             |  22 -----
>  include/linux/mm_types.h        |  18 ----
>  include/linux/mm_types_task.h   |  18 ++++
>  include/linux/page_frag_cache.h |  32 +++++++
>  include/linux/skbuff.h          |   1 +
>  mm/Makefile                     |   1 +
>  mm/page_alloc.c                 | 136 ------------------------------
>  mm/page_frag_cache.c            | 145 ++++++++++++++++++++++++++++++++
>  mm/page_frag_test.c             |   2 +-
>  9 files changed, 198 insertions(+), 177 deletions(-)
>  create mode 100644 include/linux/page_frag_cache.h
>  create mode 100644 mm/page_frag_cache.c
>

...

> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index a2f6179b672b..cdc1e3696439 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -8,6 +8,7 @@
>   * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
>   */
>
> +#include <linux/align.h>
>  #include <linux/types.h>
>
>  #include <asm/page.h>
> @@ -46,6 +47,23 @@ struct page_frag {
>  #endif
>  };
>
> +#define PAGE_FRAG_CACHE_MAX_SIZE       __ALIGN_MASK(32768, ~PAGE_MASK)
> +#define PAGE_FRAG_CACHE_MAX_ORDER      get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> +struct page_frag_cache {
> +       void *va;
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +       __u16 offset;
> +       __u16 size;
> +#else
> +       __u32 offset;
> +#endif
> +       /* we maintain a pagecount bias, so that we dont dirty cache line
> +        * containing page->_refcount every time we allocate a fragment.
> +        */
> +       unsigned int            pagecnt_bias;
> +       bool pfmemalloc;
> +};
> +
>  /* Track pages that require TLB flushes */
>  struct tlbflush_unmap_batch {
>  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> new file mode 100644
> index 000000000000..43afb1bbcac9
> --- /dev/null
> +++ b/include/linux/page_frag_cache.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _LINUX_PAGE_FRAG_CACHE_H
> +#define _LINUX_PAGE_FRAG_CACHE_H
> +
> +#include <linux/log2.h>
> +#include <linux/types.h>
> +#include <linux/mm_types_task.h>

You don't need to include mm_types_task.h here. You can just use
declare "struct page_frag_cache;" as we did before in gfp.h.
Technically this should be included in mm_types.h so any callers
making use of these functions would need to make sure to include that
like we did for gfp.h before anyway.

> +#include <asm/page.h>
> +

Not sure why this is included here either. From what I can tell there
isn't anything here using the contents of page.h. I suspect you should
only need it for the get_order call which would be used in other
files.

> +void page_frag_cache_drain(struct page_frag_cache *nc);
> +void __page_frag_cache_drain(struct page *page, unsigned int count);
> +void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
> +                             gfp_t gfp_mask, unsigned int align_mask);
> +
> +static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
> +                                         unsigned int fragsz, gfp_t gfp_mask,
> +                                         unsigned int align)
> +{
> +       WARN_ON_ONCE(!is_power_of_2(align));
> +       return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
> +}
> +
> +static inline void *page_frag_alloc(struct page_frag_cache *nc,
> +                                   unsigned int fragsz, gfp_t gfp_mask)
> +{
> +       return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
> +}
> +
> +void page_frag_free(void *addr);
> +
> +#endif

...

> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
> index cf2691f60b67..b7a5affb92f2 100644
> --- a/mm/page_frag_test.c
> +++ b/mm/page_frag_test.c
> @@ -6,7 +6,6 @@
>   * Copyright: linyunsheng@huawei.com
>   */
>
> -#include <linux/mm.h>
>  #include <linux/module.h>
>  #include <linux/slab.h>
>  #include <linux/vmalloc.h>
> @@ -16,6 +15,7 @@
>  #include <linux/log2.h>
>  #include <linux/completion.h>
>  #include <linux/kthread.h>
> +#include <linux/page_frag_cache.h>
>
>  #define OBJPOOL_NR_OBJECT_MAX  BIT(24)

Rather than making users have to include page_frag_cache.h I think it
would be better for us to just maintain the code as being accessible
from mm.h. So it might be better to just add page_frag_cache.h to the
includes there.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align()
  2024-07-19  9:33 ` [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
@ 2024-07-21 18:34   ` Alexander Duyck
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander Duyck @ 2024-07-21 18:34 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> We are about to use page_frag_alloc_*() API to not just
> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching

You misspelled "coalescing".

> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to avoid the disadvantages as much
> as possible.
>
> Rename 'offset' to 'remaining' to retain the 'countdown'
> behavior as 'remaining countdown' instead of 'offset
> countdown'. Also, renaming enables us to do a single
> 'fragsz > remaining' checking for the case of cache not
> being enough, which should be the fast path if we ensure
> 'remaining' is zero when 'va' == NULL by memset'ing
> 'struct page_frag_cache' in page_frag_cache_init() and
> page_frag_cache_drain().
>
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/mm_types_task.h |  4 +-
>  mm/page_frag_cache.c          | 71 +++++++++++++++++++++--------------
>  2 files changed, 44 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index cdc1e3696439..b1c54b2b9308 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -52,10 +52,10 @@ struct page_frag {
>  struct page_frag_cache {
>         void *va;
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -       __u16 offset;
> +       __u16 remaining;
>         __u16 size;
>  #else
> -       __u32 offset;
> +       __u32 remaining;
>  #endif
>         /* we maintain a pagecount bias, so that we dont dirty cache line
>          * containing page->_refcount every time we allocate a fragment.
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 609a485cd02a..2958fe006fe7 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -22,6 +22,7 @@
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
>  {
> +       unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
>         struct page *page = NULL;
>         gfp_t gfp = gfp_mask;
>
> @@ -30,12 +31,21 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>                                 PAGE_FRAG_CACHE_MAX_ORDER);
> -       nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>  #endif
> -       if (unlikely(!page))
> +       if (unlikely(!page)) {
>                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +               if (unlikely(!page)) {
> +                       nc->va = NULL;
> +                       return NULL;
> +               }
>
> -       nc->va = page ? page_address(page) : NULL;
> +               page_size = PAGE_SIZE;
> +       }
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +       nc->size = page_size;
> +#endif
> +       nc->va = page_address(page);
>
>         return page;
>  }

Not a huge fan of the changes here. If we are changing the direction
then just do that. I don't see the point of these changes. As far as I
can tell it is just adding noise to the diff and has no effect on the
final code as the outcome is mostly the same except for you don't
update size in the event that you overwrite nc->va to NULL.

> @@ -64,8 +74,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                               unsigned int align_mask)
>  {
>         unsigned int size = PAGE_SIZE;
> +       unsigned int remaining;
>         struct page *page;
> -       int offset;
>
>         if (unlikely(!nc->va)) {
>  refill:
> @@ -82,35 +92,20 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                  */
>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>
> -               /* reset page count bias and offset to start of new frag */
> +               /* reset page count bias and remaining to start of new frag */
>                 nc->pfmemalloc = page_is_pfmemalloc(page);
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -               nc->offset = size;
> +               nc->remaining = size;
>         }
>
> -       offset = nc->offset - fragsz;
> -       if (unlikely(offset < 0)) {
> -               page = virt_to_page(nc->va);
> -
> -               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -                       goto refill;
> -
> -               if (unlikely(nc->pfmemalloc)) {
> -                       free_unref_page(page, compound_order(page));
> -                       goto refill;
> -               }
> -
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -               /* if size can vary use size else just use PAGE_SIZE */
> -               size = nc->size;
> +       /* if size can vary use size else just use PAGE_SIZE */
> +       size = nc->size;
>  #endif

Rather than pulling this out and placing it here it might make more
sense at the start of the function. Basically just overwrite size w/
either PAGE_SIZE or nc->size right at the start. Then if we have to
reallocate we overwrite it. That way we can avoid some redundancy and
this will be easier to read.

> -               /* OK, page count is 0, we can safely set it */
> -               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>
> -               /* reset page count bias and offset to start of new frag */
> -               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -               offset = size - fragsz;
> -               if (unlikely(offset < 0)) {
> +       remaining = nc->remaining & align_mask;
> +       if (unlikely(remaining < fragsz)) {
> +               if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -122,13 +117,31 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                          */
>                         return NULL;
>                 }
> +
> +               page = virt_to_page(nc->va);
> +
> +               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +                       goto refill;
> +
> +               if (unlikely(nc->pfmemalloc)) {
> +                       free_unref_page(page, compound_order(page));
> +                       goto refill;
> +               }
> +
> +               /* OK, page count is 0, we can safely set it */
> +               set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +
> +               /* reset page count bias and remaining to start of new frag */
> +               nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +               nc->remaining = size;

Why are you setting nc->remaining here? You set it a few lines below.
This is redundant.

> +
> +               remaining = size;
>         }
>
>         nc->pagecnt_bias--;
> -       offset &= align_mask;
> -       nc->offset = offset;
> +       nc->remaining = remaining - fragsz;
>
> -       return nc->va + offset;
> +       return nc->va + (size - remaining);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_align);


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
  2024-07-19  9:33 ` [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
@ 2024-07-21 21:41   ` Alexander H Duyck
  2024-07-24 12:54     ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-21 21:41 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> There are more new APIs calling __page_frag_cache_refill() in
> this patchset, which may cause compiler not being able to inline
> __page_frag_cache_refill() into __page_frag_alloc_va_align().
> 
> Not being able to do the inlining seems to cause some noticeable
> performance degradation in arm64 system with 64K PAGE_SIZE after
> adding new API calling __page_frag_cache_refill().
> 
> It seems there is about a 24-byte binary size increase for
> __page_frag_cache_refill() and __page_frag_alloc_va_align() in
> arm64 system with 64K PAGE_SIZE. By doing the gdb disassembling,
> it seems we can have a more than 100-byte decrease for the binary
> size by using __alloc_pages() to replace alloc_pages_node(), as
> there seems to be some unnecessary checking for nid being
> NUMA_NO_NODE, especially when page_frag is still part of the mm
> system.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  mm/page_frag_cache.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index d9c9cad17af7..3f162e9d23ba 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -59,11 +59,11 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> -	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
> -				PAGE_FRAG_CACHE_MAX_ORDER);
> +	page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER,
> +			     numa_mem_id(), NULL);
>  #endif
>  	if (unlikely(!page)) {
> -		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
>  		if (unlikely(!page)) {
>  			memset(nc, 0, sizeof(*nc));
>  			return NULL;

So if I am understanding correctly this is basically just stripping the
checks that were being performed since they aren't really needed to
verify the output of numa_mem_id.

Rather than changing the code here, it might make more sense to update
alloc_pages_node_noprof to move the lines from
__alloc_pages_node_noprof into it. Then you could put the VM_BUG_ON and
warn_if_node_offline into an else statement which would cause them to
be automatically stripped for this and all other callers. The benefit
would likely be much more significant and may be worthy of being
accepted on its own merit without being a part of this patch set as I
would imagine it would show slight gains in terms of performance and
binary size by dropping the unnecessary instructions.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'
  2024-07-19  9:33 ` [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
@ 2024-07-21 22:59   ` Alexander H Duyck
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-21 22:59 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> Currently there is one 'struct page_frag' for every 'struct
> sock' and 'struct task_struct', we are about to replace the
> 'struct page_frag' with 'struct page_frag_cache' for them.
> Before begin the replacing, we need to ensure the size of
> 'struct page_frag_cache' is not bigger than the size of
> 'struct page_frag', as there may be tens of thousands of
> 'struct sock' and 'struct task_struct' instances in the
> system.
> 
> By or'ing the page order & pfmemalloc with lower bits of
> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> And page address & pfmemalloc & order is unchanged for the
> same page in the same 'page_frag_cache' instance, it makes
> sense to fit them together.
> 
> After this patch, the size of 'struct page_frag_cache' should be
> the same as the size of 'struct page_frag'.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/mm_types_task.h   | 16 +++++------
>  include/linux/page_frag_cache.h | 49 +++++++++++++++++++++++++++++++--
>  mm/page_frag_cache.c            | 49 +++++++++++++++------------------
>  3 files changed, 77 insertions(+), 37 deletions(-)
> 
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index b1c54b2b9308..f2610112a642 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -50,18 +50,18 @@ struct page_frag {
>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>  struct page_frag_cache {
> -	void *va;
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	/* encoded_va consists of the virtual address, pfmemalloc bit and order
> +	 * of a page.
> +	 */
> +	unsigned long encoded_va;
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>  	__u16 remaining;
> -	__u16 size;
> +	__u16 pagecnt_bias;
>  #else
>  	__u32 remaining;
> +	__u32 pagecnt_bias;
>  #endif
> -	/* we maintain a pagecount bias, so that we dont dirty cache line
> -	 * containing page->_refcount every time we allocate a fragment.
> -	 */
> -	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
>  };
>  
>  /* Track pages that require TLB flushes */
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index ef1572f11248..12a16f8e8ad0 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -3,19 +3,64 @@
>  #ifndef _LINUX_PAGE_FRAG_CACHE_H
>  #define _LINUX_PAGE_FRAG_CACHE_H
>  
> +#include <linux/bits.h>
> +#include <linux/build_bug.h>
>  #include <linux/log2.h>
>  #include <linux/types.h>
>  #include <linux/mm_types_task.h>
>  #include <asm/page.h>
>  
> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)

I would pull the PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE check from below
and use it to wrap this mask definition. If we don't need order you
could define the mask as 0. With that you get the benefit of the
compiler being able to figure out we don't read things as any value
ANDed with 0 is 0.

Also a comment explaining why you want it to be a full byte here would
be useful. I am assuming this is for assembler optimization as the
shift operation is usually expecting a byte.

> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> +
> +static inline unsigned long encode_aligned_va(void *va, unsigned int order,
> +					      bool pfmemalloc)
> +{
> +	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
> +	BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT >= PAGE_SHIFT);
> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	return (unsigned long)va | order |
> +		(pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> +#else
> +	return (unsigned long)va |
> +		(pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> +#endif

So with the mask trick I called out above you could just have (order &
PAGE_FRAG_CACHE_ORDER_MASK) be one of your inputs. If ORDER_MASK is 0
it should just strip the compiler will know it will turn out 0.

Also doing a shift on a bool is a risky action. What you might look at
doing instead would be something like a multiplication of a unsigned
long bit by a bool, or at least you need to recast pfmemalloc to
something other than a bool.

> +}
> +
> +static inline unsigned long encoded_page_order(unsigned long encoded_va)
> +{
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	return encoded_va & PAGE_FRAG_CACHE_ORDER_MASK;
> +#else
> +	return 0;
> +#endif
> +}
> +

As mentioned above, if the mask takes care of it for us it should just
return 0 automatically and cut out this code without the #if/else
logic.

> +static inline bool encoded_page_pfmemalloc(unsigned long encoded_va)
> +{
> +	return encoded_va & PAGE_FRAG_CACHE_PFMEMALLOC_BIT;
> +}
> +

Technically you aren't returning a bool here, you are returning an
unsigned long. It would be best to wrap this in "!!()".

> +static inline void *encoded_page_address(unsigned long encoded_va)
> +{
> +	return (void *)(encoded_va & PAGE_MASK);
> +}
> +
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->va = NULL;
> +	nc->encoded_va = 0;
>  }
>  
>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>  {
> -	return !!nc->pfmemalloc;
> +	return encoded_page_pfmemalloc(nc->encoded_va);
> +}
> +
> +static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
> +{
> +	return PAGE_SIZE << encoded_page_order(encoded_va);
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc);
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index b12496f05c4a..7928e5d50711 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -22,7 +22,7 @@
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
> -	unsigned int page_size = PAGE_FRAG_CACHE_MAX_SIZE;
> +	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
>  	struct page *page = NULL;
>  	gfp_t gfp = gfp_mask;
>  
> @@ -35,28 +35,27 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>  		if (unlikely(!page)) {
> -			nc->va = NULL;
> +			nc->encoded_va = 0;
>  			return NULL;
>  		}
>  
> -		page_size = PAGE_SIZE;
> +		order = 0;
>  	}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	nc->size = page_size;
> -#endif
> -	nc->va = page_address(page);
> +	nc->encoded_va = encode_aligned_va(page_address(page), order,
> +					   page_is_pfmemalloc(page));
>  
>  	return page;
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc)
>  {
> -	if (!nc->va)
> +	if (!nc->encoded_va)
>  		return;
>  
> -	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
> -	nc->va = NULL;
> +	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
> +				nc->pagecnt_bias);
> +	nc->encoded_va = 0;
>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -73,36 +72,30 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	unsigned int size = PAGE_SIZE;
> -	unsigned int remaining;
> +	unsigned long encoded_va = nc->encoded_va;
> +	unsigned int size, remaining;
>  	struct page *page;
>  
> -	if (unlikely(!nc->va)) {
> +	if (unlikely(!encoded_va)) {
>  refill:
>  		page = __page_frag_cache_refill(nc, gfp_mask);
>  		if (!page)
>  			return NULL;
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
> +		encoded_va = nc->encoded_va;
> +		size = page_frag_cache_page_size(encoded_va);
> +
>  		/* Even if we own the page, we do not use atomic_set().
>  		 * This would break get_page_unless_zero() users.
>  		 */
>  		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>  
>  		/* reset page count bias and remaining to start of new frag */
> -		nc->pfmemalloc = page_is_pfmemalloc(page);
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>  		nc->remaining = size;
>  	}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	/* if size can vary use size else just use PAGE_SIZE */
> -	size = nc->size;
> -#endif
> -
> +	size = page_frag_cache_page_size(encoded_va);

As I think I mentioned in an earlier patch it would probably be better
to do this before the if statement above. That way you avoid
recomputing size when you allocate a new page. With any luck the
compiler will realize that this is essentially an "else" for the if
statement above. Either that or just make this an else for the
allocation block above.

>  	remaining = nc->remaining & align_mask;
>  	if (unlikely(remaining < fragsz)) {
>  		if (unlikely(fragsz > PAGE_SIZE)) {
> @@ -118,13 +111,15 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  			return NULL;
>  		}
>  
> -		page = virt_to_page(nc->va);
> +		page = virt_to_page((void *)encoded_va);
>  
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>  			goto refill;
>  
> -		if (unlikely(nc->pfmemalloc)) {
> -			free_unref_page(page, compound_order(page));
> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +			VM_BUG_ON(compound_order(page) !=
> +				  encoded_page_order(encoded_va));
> +			free_unref_page(page, encoded_page_order(encoded_va));
>  			goto refill;
>  		}
>  
> @@ -141,7 +136,7 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  	nc->pagecnt_bias--;
>  	nc->remaining = remaining - fragsz;
>  
> -	return nc->va + (size - remaining);
> +	return encoded_page_address(encoded_va) + (size - remaining);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>  




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly
  2024-07-19  9:33 ` [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
@ 2024-07-21 23:01   ` Alexander H Duyck
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-21 23:01 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Michael S. Tsirkin, Jason Wang,
	Eugenio Pérez, Andrew Morton, Eric Dumazet, David Howells,
	Marc Dionne, Chuck Lever, Jeff Layton, Neil Brown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, kvm, virtualization, linux-mm, linux-afs,
	linux-nfs

On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> Use appropriate frag_page API instead of caller accessing
> 'page_frag_cache' directly.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  drivers/vhost/net.c             |  2 +-
>  include/linux/page_frag_cache.h | 10 ++++++++++
>  mm/page_frag_test.c             |  2 +-
>  net/core/skbuff.c               |  6 +++---
>  net/rxrpc/conn_object.c         |  4 +---
>  net/rxrpc/local_object.c        |  4 +---
>  net/sunrpc/svcsock.c            |  6 ++----
>  7 files changed, 19 insertions(+), 15 deletions(-)
> 

Looks fine to me.

Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-19  9:33 ` [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
@ 2024-07-21 23:40   ` Alexander H Duyck
  2024-07-22 12:55     ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-21 23:40 UTC (permalink / raw)
  To: Yunsheng Lin, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> Refactor common codes from __page_frag_alloc_va_align()
> to __page_frag_cache_refill(), so that the new API can
> make use of them.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |  2 +-
>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>  2 files changed, 49 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 12a16f8e8ad0..5aa45de7a9a5 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  

I do not like requiring the entire structure to be reset as a part of
init. If encoded_va is 0 then we have reset the page and the flags.
There shouldn't be anything else we need to reset as remaining and bias
will be reset when we reallocate.

>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 7928e5d50711..d9c9cad17af7 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -19,6 +19,28 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
> +{
> +	unsigned long encoded_va = nc->encoded_va;
> +	struct page *page;
> +
> +	page = virt_to_page((void *)encoded_va);
> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +		return NULL;
> +
> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +		VM_BUG_ON(compound_order(page) !=
> +			  encoded_page_order(encoded_va));
> +		free_unref_page(page, encoded_page_order(encoded_va));
> +		return NULL;
> +	}
> +
> +	/* OK, page count is 0, we can safely set it */
> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +
> +	return page;
> +}
> +
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	struct page *page = NULL;
>  	gfp_t gfp = gfp_mask;
>  
> +	if (likely(nc->encoded_va)) {
> +		page = __page_frag_cache_recharge(nc);
> +		if (page) {
> +			order = encoded_page_order(nc->encoded_va);
> +			goto out;
> +		}
> +	}
> +

This code has no business here. This is refill, you just dropped
recharge in here which will make a complete mess of the ordering and be
confusing to say the least.

The expectation was that if we are calling this function it is going to
overwrite the virtual address to NULL on failure so we discard the old
page if there is one present. This changes that behaviour. What you
effectively did is made __page_frag_cache_refill into the recharge
function.

>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>  		if (unlikely(!page)) {
> -			nc->encoded_va = 0;
> +			memset(nc, 0, sizeof(*nc));
>  			return NULL;
>  		}
>  

The memset will take a few more instructions than the existing code
did. I would prefer to keep this as is if at all possible.

> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	nc->encoded_va = encode_aligned_va(page_address(page), order,
>  					   page_is_pfmemalloc(page));
>  
> +	/* Even if we own the page, we do not use atomic_set().
> +	 * This would break get_page_unless_zero() users.
> +	 */
> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> +
> +out:
> +	/* reset page count bias and remaining to start of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +	nc->remaining = PAGE_SIZE << order;
> +
>  	return page;
>  }
>  

Why bother returning a page at all? It doesn't seem like you use
it anymore. It looks like the use cases you have for it in patch 11/12
all appear to be broken from what I can tell as you are adding page as
a variable when we don't need to be passing internal details to the
callers of the function when just a simple error return code would do.

> @@ -55,7 +95,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
>  
>  	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
>  				nc->pagecnt_bias);
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -72,31 +112,9 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	unsigned long encoded_va = nc->encoded_va;
> -	unsigned int size, remaining;
> -	struct page *page;
> -
> -	if (unlikely(!encoded_va)) {
> -refill:
> -		page = __page_frag_cache_refill(nc, gfp_mask);
> -		if (!page)
> -			return NULL;
> -
> -		encoded_va = nc->encoded_va;
> -		size = page_frag_cache_page_size(encoded_va);
> +	unsigned int size = page_frag_cache_page_size(nc->encoded_va);
> +	unsigned int remaining = nc->remaining & align_mask;
>  
> -		/* Even if we own the page, we do not use atomic_set().
> -		 * This would break get_page_unless_zero() users.
> -		 */
> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> -
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = size;
> -	}
> -
> -	size = page_frag_cache_page_size(encoded_va);
> -	remaining = nc->remaining & align_mask;
>  	if (unlikely(remaining < fragsz)) {

I am not a fan of adding a dependency on remaining being set *before*
encoded_va. The fact is it relies on the size to set it. In addition
this is creating a big blob of code for the conditional paths to have
to jump over.

I think it is much better to first validate encoded_va, and then
validate remaining. Otherwise just checking remaining seems problematic
and like a recipe for NULL pointer accesses.

>  		if (unlikely(fragsz > PAGE_SIZE)) {
>  			/*
> @@ -111,32 +129,17 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  			return NULL;
>  		}
>  
> -		page = virt_to_page((void *)encoded_va);
> -
> -		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -			goto refill;
> -
> -		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> -			VM_BUG_ON(compound_order(page) !=
> -				  encoded_page_order(encoded_va));
> -			free_unref_page(page, encoded_page_order(encoded_va));
> -			goto refill;
> -		}
> -
> -		/* OK, page count is 0, we can safely set it */
> -		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> -
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = size;
> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> +			return NULL;
>  
> +		size = page_frag_cache_page_size(nc->encoded_va);

So this is adding yet another setting/reading of size to the recharge
path now. Previously the recharge path could just reuse the existing
size.

>  		remaining = size;
>  	}
>  
>  	nc->pagecnt_bias--;
>  	nc->remaining = remaining - fragsz;
>  
> -	return encoded_page_address(encoded_va) + (size - remaining);
> +	return encoded_page_address(nc->encoded_va) + (size - remaining);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>  



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-21 23:40   ` Alexander H Duyck
@ 2024-07-22 12:55     ` Yunsheng Lin
  2024-07-22 15:32       ` Alexander Duyck
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-22 12:55 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/22 7:40, Alexander H Duyck wrote:
> On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
>> Refactor common codes from __page_frag_alloc_va_align()
>> to __page_frag_cache_refill(), so that the new API can
>> make use of them.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h |  2 +-
>>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>>  2 files changed, 49 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index 12a16f8e8ad0..5aa45de7a9a5 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>  
>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>  {
>> -	nc->encoded_va = 0;
>> +	memset(nc, 0, sizeof(*nc));
>>  }
>>  
> 
> I do not like requiring the entire structure to be reset as a part of
> init. If encoded_va is 0 then we have reset the page and the flags.
> There shouldn't be anything else we need to reset as remaining and bias
> will be reset when we reallocate.

The argument is about avoiding one check in the fast path by doing the
memset in the slow path, which you might already know according to your
comment in the previous version.

It is just sometimes hard to understand your preference for maintainability
over performance here as sometimes your comment seems to prefer performance
over maintainability, like the LEA trick you mentioned and the offset count-down
before this patchset. It would be good to be more consistent about this,
otherwise it is sometimes confusing when doing the refactoring.

> 
>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 7928e5d50711..d9c9cad17af7 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -19,6 +19,28 @@
>>  #include <linux/page_frag_cache.h>
>>  #include "internal.h"
>>  
>> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
>> +{
>> +	unsigned long encoded_va = nc->encoded_va;
>> +	struct page *page;
>> +
>> +	page = virt_to_page((void *)encoded_va);
>> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> +		return NULL;
>> +
>> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>> +		VM_BUG_ON(compound_order(page) !=
>> +			  encoded_page_order(encoded_va));
>> +		free_unref_page(page, encoded_page_order(encoded_va));
>> +		return NULL;
>> +	}
>> +
>> +	/* OK, page count is 0, we can safely set it */
>> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>> +
>> +	return page;
>> +}
>> +
>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  					     gfp_t gfp_mask)
>>  {
>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	struct page *page = NULL;
>>  	gfp_t gfp = gfp_mask;
>>  
>> +	if (likely(nc->encoded_va)) {
>> +		page = __page_frag_cache_recharge(nc);
>> +		if (page) {
>> +			order = encoded_page_order(nc->encoded_va);
>> +			goto out;
>> +		}
>> +	}
>> +
> 
> This code has no business here. This is refill, you just dropped
> recharge in here which will make a complete mess of the ordering and be
> confusing to say the least.
> 
> The expectation was that if we are calling this function it is going to
> overwrite the virtual address to NULL on failure so we discard the old
> page if there is one present. This changes that behaviour. What you
> effectively did is made __page_frag_cache_refill into the recharge
> function.

The idea is to reuse the code below for both __page_frag_cache_refill() and
__page_frag_cache_recharge(), which is about maintainability by not
having duplicated code. If there is a better idea to avoid that
duplicated code while keeping the old behaviour, I am happy to change
it.

	/* reset page count bias and remaining to start of new frag */
	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
	nc->remaining = PAGE_SIZE << order;

> 
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	if (unlikely(!page)) {
>>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>  		if (unlikely(!page)) {
>> -			nc->encoded_va = 0;
>> +			memset(nc, 0, sizeof(*nc));
>>  			return NULL;
>>  		}
>>  
> 
> The memset will take a few more instructions than the existing code
> did. I would prefer to keep this as is if at all possible.

It will not take more instructions for arm64 as it has the 'stp' instruction
when __HAVE_ARCH_MEMSET is set.
Is there something similar for x64?

> 
>> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	nc->encoded_va = encode_aligned_va(page_address(page), order,
>>  					   page_is_pfmemalloc(page));
>>  
>> +	/* Even if we own the page, we do not use atomic_set().
>> +	 * This would break get_page_unless_zero() users.
>> +	 */
>> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>> +
>> +out:
>> +	/* reset page count bias and remaining to start of new frag */
>> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +	nc->remaining = PAGE_SIZE << order;
>> +
>>  	return page;
>>  }
>>  
> 
> Why bother returning a page at all? It doesn't seem like you don't use
> it anymore. It looks like the use cases you have for it in patch 11/12
> all appear to be broken from what I can tell as you are adding page as
> a variable when we don't need to be passing internal details to the
> callers of the function when just a simple error return code would do.

It would be good to be more specific about the 'broken' part here.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 00/14] Replace page_frag with page_frag_cache for sk_page_frag()
       [not found]   ` <b2001dba-a2d2-4b49-bc9f-59e175e7bba1@huawei.com>
@ 2024-07-22 15:21     ` Alexander Duyck
  2024-07-23 13:17       ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Duyck @ 2024-07-22 15:21 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Matthias Brugger, AngeloGioacchino Del Regno, bpf,
	linux-arm-kernel, linux-mediatek, linux-mm

On Mon, Jul 22, 2024 at 5:41 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/22 7:49, Alexander Duyck wrote:
> > On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> After [1], there are still two implementations for page frag:
> >>
> >> 1. mm/page_alloc.c: net stack seems to be using it in the
> >>    rx part with 'struct page_frag_cache' and the main API
> >>    being page_frag_alloc_align().
> >> 2. net/core/sock.c: net stack seems to be using it in the
> >>    tx part with 'struct page_frag' and the main API being
> >>    skb_page_frag_refill().
> >>
> >> This patchset tries to unfiy the page frag implementation
> >> by replacing page_frag with page_frag_cache for sk_page_frag()
> >> first. net_high_order_alloc_disable_key for the implementation
> >> in net/core/sock.c doesn't seems matter that much now as pcp
> >> is also supported for high-order pages:
> >> commit 44042b449872 ("mm/page_alloc: allow high-order pages to
> >> be stored on the per-cpu lists")
> >>
> >> As the related change is mostly related to networking, so
> >> targeting the net-next. And will try to replace the rest
> >> of page_frag in the follow patchset.
> >
> > So in reality I would say something like the first 4 patches are
> > probably more applicable to mm than they are to the net-next tree.
> > Especially given that we are having to deal with the mm_task_types.h
> > in order to sort out the include order issues.
> >
> > Given that I think it might make more sense to look at breaking this
> > into 2 or more patch sets with the first being more mm focused since
> > the test module and pulling the code out of page_alloc.c, gfp.h, and
> > mm_types.h would be pretty impactful on mm more than it is on the
> > networking stack. After those changes then I would agree that we are
> > mostly just impacting the network stack.
>
> I am sure there are plenty of good precedents about how to handling a
> patchset that affecting multi subsystems.
> Let's be more specific about what are the options here:
> 1. Keeping all changing as one patchset targetting the net-next tree
>    as this version does.
> 2. Breaking all changing into two patchsets, the one affecting current APIs
>    targetting the mm tree and the one supporting new APIs targetting
>    net-next tree.
> 3. Breaking all changing into two patchset as option 2 does, but both patchsets
>    targetting net-next tree to aovid waiting for the changing in mm tree
>    to merged back to net-next tree for adding supporting of new APIs.
>
> I am not sure your perference is option 2 or option 3 here, or there are others
> options here, it would be better to be more specific about your option here. As
> option 2 doesn't seems to make much sense if all the existing users/callers of
> page_frag seems to be belonged to networking for testing reasons, and the original
> code seemed to go through net-next tree too:
> https://github.com/torvalds/linux/commit/b63ae8ca096dfdbfeef6a209c30a93a966518853

I am suggesting option 2. The main issue is that this patch set has
had a number of issues that fall into the realm of mm more than
netdev. The issue is that I only have a limited amount of time for
review and I feel like having this be reviewed as a submission for mm
would bring in more people familiar with that side of things to review
it.

As it stands, trying to submit this through netdev is eating up a
significant amount of my time as there aren't many people on the
netdev side of things that can review the mm bits. If you insist on
this needing to go through net-next my inclination would be to just
reject the set as it is bound to introduce a number of issues due to
the sheer size of the refactor and the fact that it is providing
little if any benefit.

> And the main reason I chose option 1 over option 2 is: it is hard to tell how
> much changing needed to support the new usecase, so it is better to keep them
> in one patchset to have a bigger picture here. Yes, it may make the patchset
> harder to review, but that is the tradeoff we need to make here. As my
> understanding, option 1 seem to be the common practice to handle the changing
> affecting multi subsystems. Especially you had similar doubt about the changing
> affecting current APIs as below, it seems hard to explain it without a new case:
>
> https://lore.kernel.org/all/68d1c7d3dfcd780fa3bed0bb71e41d7fb0a8c15d.camel@gmail.com/

The issue as I see it is that you aren't getting any engagement from
the folks on the mm side. In fact from what I can tell it looks like
you didn't even CC this patch set to them. The patches I called out
below are very memory subsystem centric. I would say this patchset has
no way forward if the patches I called out below aren't reviewed by
folks from the memory subsystem maintainers.

> >
>
> ...
>
> >
> > So specifically I would like to see patches 1 (refactored as
> > selftest), 2, 3, 5, 7, 8, 13 (current APIs), and 14 done as more of an
> > mm focused set since many of the issues you seem to have are problems
> > building due to mm build issues, dependencies, and the like. That is
> > the foundation for this patch set and it seems like we keep seeing
> > issues there so that needs to be solid before we can do the new API
> > work. If focused on mm you might get more eyes on it as not many
> > networking folks are that familiar with the memory management side of
> > things.
>
> I am not sure if breaking it into more patchset is the common practice
> to 'get more eyes' here.
> Anyways, it is fair enough ask if there is more concrete reasoning
> behind the asking and it is common practice to do that, and I would
> love to break it to more patchsets to perhaps make the discussion
> easier.

The issue here is that this patchset is 2/3 memory subsystem, and you
didn't seem to include anyone from the memory subsystem side of things
on the Cc list.

> >
> > As for the other patches, specifically 10, 11, 12, and 13 (prepare,
> > probe, commit API), they could then be spun up as a netdev centered
> > set. I took a brief look at them but they need some serious refactor
> > as I think they are providing page as a return value in several cases
> > where they don't need to.
>
> The above is one of the reason I am not willing to do the spliting.
> It is hard for someone to tell if the refactoring affecting current APIs
> will be enough for the new usecase without supporting the new usecase,
> isn't it possible that some refactoring may be proved to be unnecessary
> or wrong?
>
> It would be better to be more specific about what do you mean by
> 'they are providing page as a return value in several cases where they
> don't need to' as above.

This patchset isn't moving forward in its current state. Part of the
issue is that it is kind of an unwieldy mess and has been difficult to
review due to things like refactoring code you had already refactored.
Ideally each change should be self contained and you shouldn't have to
change things more than once. That is why I have suggested splitting
things the way I did. It would give you a logical set where you do the
initial refactor to enable your changes, and then you make those
changes. It is not uncommon to see this done within the kernel
community. For example if I recall correctly the folio changes when in
as a few patch sets in order to take care of the necessary enabling
and then enable their use in the various subsystems.

> >
> > In my opinion with a small bit of refactoring patch 4 can just be
> > dropped. I don't think the renaming is necessary and it just adds
> > noise to the commit logs for the impacted drivers. It will require
> > tweaks to the other patches but I think it will be better that way in
> > the long run.
>
> It would be better to be more specific about above too so that we don't
> have to have more refactoring patchsets for the current APIs.

I provided the review feedback in the patch. Specifically, don't
rename existing APIs. It would be better to just come up with an
alternative scheme such as a double underscore that would represent
the page based version while the regular version stays the same.

> >
> > Looking at patch 6 I am left scratching my head and wondering if you
> > have another build issue of some sort that you haven't mentioned. I
> > really don't think it belongs in this patch set and should probably be
> > a fix on its own if you have some reason to justify it. Otherwise you
> > might also just look at refactoring it to take
> > "__builtin_constant_p(size)" into account by copying/pasting the first
> > bits from the generic version into the function since I am assuming
> > there is a performance benefit to doing it in assembler. It should be
> > a net win if you just add the accounting for constants.
>
> I am not sure if the commit log in patch 6 needs some rephasing to
> answer your question above:
> "As the get_order() implemented by xtensa supporting 'nsau'
> instruction seems be the same as the generic implementation
> in include/asm-generic/getorder.h when size is not a constant
> value as the generic implementation calling the fls*() is also
> utilizing the 'nsau' instruction for xtensa.
>
> So remove the get_order() implemented by xtensa, as using the
> generic implementation may enable the compiler to do the
> computing when size is a constant value instead of runtime
> computing and enable the using of get_order() in BUILD_BUG_ON()
> macro in next patch."
>
> See the below in the next patch, as the PAGE_FRAG_CACHE_MAX_ORDER
> is using the get_order():
> BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);

Are you saying that the compiler translates the get_order call into
the nsau instruction? I'm still not entirely convinced and would
really like to see a review by the maintainer for that architecture to
be comfortable with it.

Otherwise as I said before my thought would be to simply copy over the
bits for __builtin_constant_p from the generic version of get_order so
that we don't run the risk of somehow messing up the non-constant
case.

> >
> > Patch 9 could probably be a standalone patch or included in the more
> > mm centered set. However it would need to be redone to fix the
> > underlying issue rather than working around it by changing the
> > function called rather than fixing the function. No point in improving
> > it for one case when you can cover multiple cases with a single
> > change.
>
> Sure, it is just that there is only 24h a day for me to do things
> more generically. So perhaps I should remove patch 9 for now so
> that we can improve thing more generically.

I'm not sure what that is supposed to mean. The change I am suggesting
is no bigger than what you have already done. It would just mean
fixing the issue at the source instead of working around the issue.
Taking that approach would yield a much better return than just doing
the workaround.

I could make the same argument about reviewing this patch set. I feel
like a I only have so much time in the day. I have already caught a
few places where you were circumventing issues instead of addressing
them such as using macros to cover up #include ordering issues
resulting in static inline functions blowing up. It feels like
labeling this as a networking patch set is an attempt to circumvent
working with the mm tree by going in and touching as much networking
code as you can to claim this is a networking patch when only 3
patches(5, 10 and 12) really need to touch anything in networking.

I am asking you to consider my suggestions for your own benefit as
otherwise I am pretty much the only reviewer for these patches and the
fact is I am not a regular contributor within the mm subsystem myself.
I would really like to have input from the mm subsystem maintainer on
things like your first patch which is adding a new test module to the
mm tree currently. I am assuming that they wouldn't want us to place
the test module in there, but I could be wrong. That is why I am
suggesting breaking this up and submitting the mm bits as more mm
focused so that we can get that additional input.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-22 12:55     ` Yunsheng Lin
@ 2024-07-22 15:32       ` Alexander Duyck
  2024-07-23 13:19         ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Duyck @ 2024-07-22 15:32 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Mon, Jul 22, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/22 7:40, Alexander H Duyck wrote:
> > On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> >> Refactor common codes from __page_frag_alloc_va_align()
> >> to __page_frag_cache_refill(), so that the new API can
> >> make use of them.
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  include/linux/page_frag_cache.h |  2 +-
> >>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
> >>  2 files changed, 49 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> >> index 12a16f8e8ad0..5aa45de7a9a5 100644
> >> --- a/include/linux/page_frag_cache.h
> >> +++ b/include/linux/page_frag_cache.h
> >> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
> >>
> >>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
> >>  {
> >> -    nc->encoded_va = 0;
> >> +    memset(nc, 0, sizeof(*nc));
> >>  }
> >>
> >
> > I do not like requiring the entire structure to be reset as a part of
> > init. If encoded_va is 0 then we have reset the page and the flags.
> > There shouldn't be anything else we need to reset as remaining and bias
> > will be reset when we reallocate.
>
> The argument is about aoviding one checking for fast path by doing the
> memset in the slow path, which you might already know accroding to your
> comment in previous version.
>
> It is just sometimes hard to understand your preference for maintainability
> over performance here as sometimes your comment seems to perfer performance
> over maintainability, like the LEA trick you mentioned and offset count-down
> before this patchset. It would be good to be more consistent about this,
> otherwise it is sometimes confusing when doing the refactoring.

The use of a negative offset is arguably more maintainable in my mind
rather than being a performance trick. Essentially if you use the
negative value you can just mask off the upper bits and it is the
offset in the page. As such it is actually easier for me to read
versus "remaining" which is an offset from the end of the page.
Assuming you read the offset in hex anyway.

> >
> >>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index 7928e5d50711..d9c9cad17af7 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -19,6 +19,28 @@
> >>  #include <linux/page_frag_cache.h>
> >>  #include "internal.h"
> >>
> >> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
> >> +{
> >> +    unsigned long encoded_va = nc->encoded_va;
> >> +    struct page *page;
> >> +
> >> +    page = virt_to_page((void *)encoded_va);
> >> +    if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> >> +            return NULL;
> >> +
> >> +    if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> >> +            VM_BUG_ON(compound_order(page) !=
> >> +                      encoded_page_order(encoded_va));
> >> +            free_unref_page(page, encoded_page_order(encoded_va));
> >> +            return NULL;
> >> +    }
> >> +
> >> +    /* OK, page count is 0, we can safely set it */
> >> +    set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> >> +
> >> +    return page;
> >> +}
> >> +
> >>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>                                           gfp_t gfp_mask)
> >>  {
> >> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      struct page *page = NULL;
> >>      gfp_t gfp = gfp_mask;
> >>
> >> +    if (likely(nc->encoded_va)) {
> >> +            page = __page_frag_cache_recharge(nc);
> >> +            if (page) {
> >> +                    order = encoded_page_order(nc->encoded_va);
> >> +                    goto out;
> >> +            }
> >> +    }
> >> +
> >
> > This code has no business here. This is refill, you just dropped
> > recharge in here which will make a complete mess of the ordering and be
> > confusing to say the least.
> >
> > The expectation was that if we are calling this function it is going to
> > overwrite the virtual address to NULL on failure so we discard the old
> > page if there is one present. This changes that behaviour. What you
> > effectively did is made __page_frag_cache_refill into the recharge
> > function.
>
> The idea is to reuse the below for both __page_frag_cache_refill() and
> __page_frag_cache_recharge(), which seems to be about maintainability
> to not having duplicated code. If there is a better idea to avoid that
> duplicated code while keeping the old behaviour, I am happy to change
> it.
>
>         /* reset page count bias and remaining to start of new frag */
>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>         nc->remaining = PAGE_SIZE << order;
>

The only piece that is really reused here is the pagecnt_bias
assignment. What is obfuscated away is that the order is obtained
through one of two paths. Really, order isn't order here; it is size,
which should have been fetched already. What you end up doing with
this change is duplicating a bunch of code throughout the function.
You end up having to fetch size multiple times in multiple ways. Here you
are generating it with order. Then you have to turn around and get it
again at the start of the function, and again after calling this
function in order to pull it back out.

> >
> >>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>      gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> >>                 __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> >> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      if (unlikely(!page)) {
> >>              page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> >>              if (unlikely(!page)) {
> >> -                    nc->encoded_va = 0;
> >> +                    memset(nc, 0, sizeof(*nc));
> >>                      return NULL;
> >>              }
> >>
> >
> > The memset will take a few more instructions than the existing code
> > did. I would prefer to keep this as is if at all possible.
>
> It will not take more instructions for arm64 as it has 'stp' instruction for
> __HAVE_ARCH_MEMSET is set.
> There is something similar for x64?

The x64 does not last I knew without getting into the SSE/AVX type
stuff. This becomes two separate 8B store instructions.

> >
> >> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      nc->encoded_va = encode_aligned_va(page_address(page), order,
> >>                                         page_is_pfmemalloc(page));
> >>
> >> +    /* Even if we own the page, we do not use atomic_set().
> >> +     * This would break get_page_unless_zero() users.
> >> +     */
> >> +    page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> >> +
> >> +out:
> >> +    /* reset page count bias and remaining to start of new frag */
> >> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> +    nc->remaining = PAGE_SIZE << order;
> >> +
> >>      return page;
> >>  }
> >>
> >
> > Why bother returning a page at all? It doesn't seem like you don't use
> > it anymore. It looks like the use cases you have for it in patch 11/12
> > all appear to be broken from what I can tell as you are adding page as
> > a variable when we don't need to be passing internal details to the
> > callers of the function when just a simple error return code would do.
>
> It would be good to be more specific about the 'broken' part here.

We are passing internals to the caller. Basically this is generally
frowned upon for many implementations of things as the general idea is
that the internal page we are using should be a pseudo-private value.
I understand that you have one or two callers that need it for the use
cases you have in patches 11/12, but it also seems like you are just
passing it regardless. For example I noticed in a few cases you added
the page pointer in 12 to handle the return value, but then just used
it to check for NULL. My thought would be that rather than returning
the page here you would be better off just returning 0 or an error and
then doing the virt_to_page translation for all the cases where the
page is actually needed since you have to go that route for a cached
page anyway.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 00/14] Replace page_frag with page_frag_cache for sk_page_frag()
  2024-07-22 15:21     ` [RFC v11 00/14] Replace page_frag with page_frag_cache for sk_page_frag() Alexander Duyck
@ 2024-07-23 13:17       ` Yunsheng Lin
  0 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-23 13:17 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Matthias Brugger, AngeloGioacchino Del Regno, bpf,
	linux-arm-kernel, linux-mediatek, linux-mm, Andrew Morton

+cc Andrew Morton

On 2024/7/22 23:21, Alexander Duyck wrote:
> On Mon, Jul 22, 2024 at 5:41 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/22 7:49, Alexander Duyck wrote:
>>> On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>>
>>>> After [1], there are still two implementations for page frag:
>>>>
>>>> 1. mm/page_alloc.c: net stack seems to be using it in the
>>>>    rx part with 'struct page_frag_cache' and the main API
>>>>    being page_frag_alloc_align().
>>>> 2. net/core/sock.c: net stack seems to be using it in the
>>>>    tx part with 'struct page_frag' and the main API being
>>>>    skb_page_frag_refill().
>>>>
>>>> This patchset tries to unify the page frag implementation
>>>> by replacing page_frag with page_frag_cache for sk_page_frag()
>>>> first. net_high_order_alloc_disable_key for the implementation
>>>> in net/core/sock.c doesn't seems matter that much now as pcp
>>>> is also supported for high-order pages:
>>>> commit 44042b449872 ("mm/page_alloc: allow high-order pages to
>>>> be stored on the per-cpu lists")
>>>>
>>>> As the related change is mostly related to networking, so
>>>> targeting the net-next. And will try to replace the rest
>>>> of page_frag in the follow patchset.
>>>
>>> So in reality I would say something like the first 4 patches are
>>> probably more applicable to mm than they are to the net-next tree.
>>> Especially given that we are having to deal with the mm_task_types.h
>>> in order to sort out the include order issues.
>>>
>>> Given that I think it might make more sense to look at breaking this
>>> into 2 or more patch sets with the first being more mm focused since
>>> the test module and pulling the code out of page_alloc.c, gfp.h, and
>>> mm_types.h would be pretty impactful on mm more than it is on the
>>> networking stack. After those changes then I would agree that we are
>>> mostly just impacting the network stack.
>>
>> I am sure there are plenty of good precedents about how to handle a
>> patchset that affects multiple subsystems.
>> Let's be more specific about what are the options here:
>> 1. Keeping all changing as one patchset targetting the net-next tree
>>    as this version does.
>> 2. Breaking all changing into two patchsets, the one affecting current APIs
>>    targetting the mm tree and the one supporting new APIs targetting
>>    net-next tree.
>> 3. Breaking all changes into two patchsets as option 2 does, but both patchsets
>>    targeting the net-next tree to avoid waiting for the changes in the mm tree
>>    to be merged back into the net-next tree before adding support for the new APIs.
>>
>> I am not sure whether your preference is option 2 or option 3 here, or whether there
>> are other options here; it would be better to be more specific about your option here. As
>> option 2 doesn't seems to make much sense if all the existing users/callers of
>> page_frag seems to be belonged to networking for testing reasons, and the original
>> code seemed to go through net-next tree too:
>> https://github.com/torvalds/linux/commit/b63ae8ca096dfdbfeef6a209c30a93a966518853
> 
> I am suggesting option 2. The main issue is that this patch set has
> had a number of issues that fall into the realm of mm more than
> netdev. The issue is that I only have a limited amount of time for
> review and I feel like having this be reviewed as a submission for mm
> would bring in more people familiar with that side of things to review
> it.

I agree with the 'bring in more people familiar with that side of things
to review it' part, but I am just not sure whether your asking for option 2 is
a reasonable or fair enough ask, as breaking this patchset into two patchsets
may make it harder to discuss the reason behind the refactoring affecting the
current APIs, and may make the reviewing harder too.

It seems we might need to consider option 4 too:
4. Keeping all changing as one patchset targetting the mm tree.

One of the problems I see with option 4 is that it makes it harder to use the CI
from netdev.

> 
> As it stands, trying to submit this through netdev is eating up a
> significant amount of my time as there aren't many people on the
> netdev side of things that can review the mm bits. If you insist on
> this needing to go through net-next my inclination would be to just
> reject the set as it is bound to introduce a number of issues due to
> the sheer size of the refactor and the fact that it is providing
> little if any benefit.
> 
>> And the main reason I chose option 1 over option 2 is: it is hard to tell how
>> much change is needed to support the new use case, so it is better to keep them
>> in one patchset to have a bigger picture here. Yes, it may make the patchset
>> harder to review, but that is the tradeoff we need to make here. As my
>> understanding, option 1 seem to be the common practice to handle the changing
>> affecting multi subsystems. Especially you had similar doubt about the changing
>> affecting current APIs as below, it seems hard to explain it without a new case:
>>
>> https://lore.kernel.org/all/68d1c7d3dfcd780fa3bed0bb71e41d7fb0a8c15d.camel@gmail.com/
> 
> The issue as I see it is that you aren't getting any engagement from
> the folks on the mm side. In fact from what I can tell it looks like
> you didn't even CC this patch set to them. The patches I called out

Below is the gitconfig and cmd used to send the patchset, I am not sure
if there is someone specifically that is your mind that need CC'ing, as
the maillist for mm subsystem seems to be cc'ed for each mm focused patch
you mentioned, and Andrew Morton is CC'ed most of the mm focused patch too.

[sendemail.netdev]
        to = "davem@davemloft.net, kuba@kernel.org, pabeni@redhat.com"
        cc = "netdev@vger.kernel.org, linux-kernel@vger.kernel.org"
        cccmd ="/home/*/net-next/scripts/get_maintainer.pl --nogit --nogit-fallback --norolestats"

git send-email --identity=netdev

> below are very memory subsystem centric. I would say this patchset has
> no way forward if the patches I called out below aren't reviewed by
> folks from the memory subsystem maintainers.

Andrew Morton is the maintainer of mm subsystem according to MAINTAINERS file,
Let's see if we can have some feedback about what is prefer option for him
to handling this patchset by CC'ing him.

> 
>>>
>>
>> ...
>>
>>>
>>> So specifically I would like to see patches 1 (refactored as
>>> selftest), 2, 3, 5, 7, 8, 13 (current APIs), and 14 done as more of an
>>> mm focused set since many of the issues you seem to have are problems
>>> building due to mm build issues, dependencies, and the like. That is
>>> the foundation for this patch set and it seems like we keep seeing
>>> issues there so that needs to be solid before we can do the new API
>>> work. If focused on mm you might get more eyes on it as not many
>>> networking folks are that familiar with the memory management side of
>>> things.
>>
>> I am not sure if breaking it into more patchset is the common practice
>> to 'get more eyes' here.
>> Anyways, it is fair enough ask if there is more concrete reasoning
>> behind the asking and it is common practice to do that, and I would
>> love to break it to more patchsets to perhaps make the discussion
>> easier.
> 
> The issue here is that this patchset is 2/3 memory subsystem, and you
> didn't seem to include anyone from the memory subsystem side of things
> on the Cc list.

I am not familiar enough with the mm subsystem yet, so I depended on
get_maintainer.pl to sort out the CC list. If there is a CC list in your
mind, please give a list, and I would be very grateful and happy to include
them in the next version.

> 
>>>
>>> As for the other patches, specifically 10, 11, 12, and 13 (prepare,
>>> probe, commit API), they could then be spun up as a netdev centered
>>> set. I took a brief look at them but they need some serious refactor
>>> as I think they are providing page as a return value in several cases
>>> where they don't need to.
>>
>> The above is one of the reasons I am not willing to do the splitting.
>> It is hard for someone to tell if the refactoring affecting current APIs
>> will be enough for the new usecase without supporting the new usecase,
>> isn't it possible that some refactoring may be proved to be unnecessary
>> or wrong?
>>
>> It would be better to be more specific about what do you mean by
>> 'they are providing page as a return value in several cases where they
>> don't need to' as above.
> 
> This patchset isn't moving forward in its current state. Part of the
> issue is that it is kind of an unwieldy mess and has been difficult to
> review due to things like refactoring code you had already refactored.
> Ideally each change should be self contained and you shouldn't have to
> change things more than once. That is why I have suggested splitting
> things the way I did. It would give you a logical set where you do the
> initial refactor to enable your changes, and then you make those
> changes. It is not uncommon to see this done within the kernel
> community. For example if I recall correctly the folio changes when in
> as a few patch sets in order to take care of the necessary enabling
> and then enable their use in the various subsystems.

The first folio changes did seem to come with a use case for it, as below:
"  This converts just parts of the core MM and the page cache. For 5.17,
  we intend to convert various filesystems (XFS and AFS are ready; other
  filesystems may make it) and also convert more of the MM and page
  cache to folios. For 5.18, multi-page folios should be ready."

It is just that the use case for folio happened to be in the mm subsystem, but
it seems we can't do that for page_frag, as the current and future possible
use cases are mostly networking related. I suppose that is one of the reasons
there is not much engagement from the mm side, and I suspect there will not
be as much engagement as we expect even if we target the mm tree.

https://github.com/torvalds/linux/commit/49f8275c7d92?spm=a2c65.11461447.0.0.68003853lUiVcA

> 
>>>
>>> In my opinion with a small bit of refactoring patch 4 can just be
>>> dropped. I don't think the renaming is necessary and it just adds
>>> noise to the commit logs for the impacted drivers. It will require
>>> tweaks to the other patches but I think it will be better that way in
>>> the long run.
>>
>> It would be better to be more specific about above too so that we don't
>> have to have more refactoring patchsets for the current APIs.
> 
> I provided the review feedback in the patch. Specifically, don't
> rename existing APIs. It would be better to just come up with an
> alternative scheme such as a double underscore that would represent
> the page based version while the regular version stays the same.

It seems you provided similar feedback in v2, but it seems we need
three APIs for the different use cases, at least from the allocation
side:

"Depending on different use cases, callers expecting to deal with va, page or
both va and page for them may call page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly."

https://lore.kernel.org/all/18ca19fa64267b84bee10473a81cbc63f53104a0.camel@gmail.com/

And yes, you suggested dropping it, but that does not make the disagreement
disappear; we still need to figure out an appropriate naming for the new APIs
that we are both OK with.

> 
>>>
>>> Looking at patch 6 I am left scratching my head and wondering if you
>>> have another build issue of some sort that you haven't mentioned. I
>>> really don't think it belongs in this patch set and should probably be
>>> a fix on its own if you have some reason to justify it. Otherwise you
>>> might also just look at refactoring it to take
>>> "__builtin_constant_p(size)" into account by copying/pasting the first
>>> bits from the generic version into the function since I am assuming
>>> there is a performance benefit to doing it in assembler. It should be
>>> a net win if you just add the accounting for constants.
>>
>> I am not sure if the commit log in patch 6 needs some rephrasing to
>> answer your question above:
>> "As the get_order() implemented by xtensa supporting 'nsau'
>> instruction seems be the same as the generic implementation
>> in include/asm-generic/getorder.h when size is not a constant
>> value as the generic implementation calling the fls*() is also
>> utilizing the 'nsau' instruction for xtensa.
>>
>> So remove the get_order() implemented by xtensa, as using the
>> generic implementation may enable the compiler to do the
>> computing when size is a constant value instead of runtime
>> computing and enable the using of get_order() in BUILD_BUG_ON()
>> macro in next patch."
>>
>> See the below in the next patch, as the PAGE_FRAG_CACHE_MAX_ORDER
>> is using the get_order():
>> BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
> 
> Are you saying that the compiler translates the get_order call into
> the nsau instruction? I'm still not entirely convinced and would
> really like to see a review by the maintainer for that architecture to
> be comfortable with it.

That patch does carry an Acked-by tag from Max Filippov, which is the
maintainer for xtensa architecture according to MAINTAINERS file:

https://lore.kernel.org/all/CAMo8BfJ5KwXjFDKGs2oBSTH1C7Vnnsbvcm6-qfV5gYh30+VvUQ@mail.gmail.com/

> 
> Otherwise as I said before my thought would be to simply copy over the
> bits for __builtin_constant_p from the generic version of get_order so
> that we don't run the risk of somehow messing up the non-constant
> case.
> 
>>>
>>> Patch 9 could probably be a standalone patch or included in the more
>>> mm centered set. However it would need to be redone to fix the
>>> underlying issue rather than working around it by changing the
>>> function called rather than fixing the function. No point in improving
>>> it for one case when you can cover multiple cases with a single
>>> change.
>>
>> Sure, it is just that there are only 24h in a day for me to do things
>> more generically. So perhaps I should remove patch 9 for now so
>> that we can improve things more generically.
> 
> I'm not sure what that is supposed to mean. The change I am suggesting
> is no bigger than what you have already done. It would just mean
> fixing the issue at the source instead of working around the issue.
> Taking that approach would yield a much better return than just doing
> the workaround.
> 
> I could make the same argument about reviewing this patch set. I feel
> like a I only have so much time in the day. I have already caught a

Yes, I think we need to consider things from each other's perspective. We
can always slow things down by sending a message to notify each other when
there are some busy days.

> few places where you were circumventing issues instead of addressing
> them such as using macros to cover up #include ordering issues
> resulting in static inline functions blowing up. It feels like
> labeling this as a networking patch set is an attempt to circumvent

Circumventing is always not my intention. In fact, I think it is better
to catch some mistake during review instead of having to debug in the
field and better idea may sometimes come out during discussion. As the
matter of fact, the idea of reusing the existing space to reduce the
size of 'page_frag_cache' came out during the discussion with you in
the 'remove page frag implementation in vhost_net' patchset.

> working with the mm tree by going in and touching as much networking
> code as you can to claim this is a networking patch when only 3
> patches(5, 10 and 12) really need to touch anything in networking.
> 
> I am asking you to consider my suggestions for your own benefit as
> otherwise I am pretty much the only reviewer for these patches and the

Thanks for the time and effort reviewing, but there are still other
reviewers, mainly Mat, Subbaraya and Max.

> fact is I am not a regular contributor within the mm subsystem myself.
> I would really like to have input from the mm subsystem maintainer on
> things like your first patch which is adding a new test module to the
> mm tree currently. I am assuming that they wouldn't want us to place
> the test module in there, but I could be wrong. That is why I am
> suggesting breaking this up and submitting the mm bits as more mm
> focused so that we can get that additional input.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 01/14] mm: page_frag: add a test module for page_frag
  2024-07-21 17:34   ` Alexander Duyck
@ 2024-07-23 13:19     ` Yunsheng Lin
  0 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-23 13:19 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/22 1:34, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> Basing on the lib/objpool.c, change it to something like a
>> ptrpool, so that we can utilize that to test the correctness
>> and performance of the page_frag.
>>
>> The testing is done by ensuring that the fragment allocated
>> from a frag_frag_cache instance is pushed into a ptrpool
>> instance in a kthread binded to a specified cpu, and a kthread
>> binded to a specified cpu will pop the fragment from the
>> ptrpool and free the fragment.
>>
>> We may refactor out the common part between objpool and ptrpool
>> if this ptrpool thing turns out to be helpful for other place.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  mm/Kconfig.debug    |   8 +
>>  mm/Makefile         |   1 +
>>  mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 402 insertions(+)
>>  create mode 100644 mm/page_frag_test.c
> 
> I might have missed it somewhere. Is there any reason why this isn't
> in the selftests/mm/ directory? Seems like that would be a better fit
> for this.
> 
>> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
>> index afc72fde0f03..1ebcd45f47d4 100644
>> --- a/mm/Kconfig.debug
>> +++ b/mm/Kconfig.debug
>> @@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
>>           kernel code.  However the runtime performance overhead is virtually
>>           nil until the tracepoints are actually enabled.
>>
>> +config DEBUG_PAGE_FRAG_TEST
> 
> This isn't a "DEBUG" feature. This is a test feature.
> 
>> +       tristate "Test module for page_frag"
>> +       default n
>> +       depends on m && DEBUG_KERNEL
> 
> I am not sure it is valid to have a tristate depend on being built as a module.

Perhaps I was copying the wrong pattern from TEST_OBJPOOL in lib/Kconfig.debug.
Perhaps mm/dmapool_test.c and DMAPOOL_TEST* were a more appropriate pattern
for a test module for page_frag?

> 
> I think if you can set it up as a selftest it will have broader use as
> you could compile it against any target kernel going forward and add
> it as a module rather than having to build it as a part of a debug
> kernel.

It seems tools/testing/selftests/mm/* are all userspace testing
tools, and a kernel test module seems to live in the same directory as
the code to be tested?

> 
>> +       help
>> +         This builds the "page_frag_test" module that is used to test the
>> +         correctness and performance of page_frag's implementation.
>> +
>>  config DEBUG_RODATA_TEST
>>      bool "Testcase for the marking rodata read-only"

...

>> +
>> +               /*
>> +                * here we allocate percpu-slot & objs together in a single
>> +                * allocation to make it more compact, taking advantage of
>> +                * warm caches and TLB hits. in default vmalloc is used to
>> +                * reduce the pressure of kernel slab system. as we know,
>> +                * minimal size of vmalloc is one page since vmalloc would
>> +                * always align the requested size to page size
>> +                */
>> +               if (gfp & GFP_ATOMIC)
>> +                       slot = kmalloc_node(size, gfp, cpu_to_node(i));
>> +               else
>> +                       slot = __vmalloc_node(size, sizeof(void *), gfp,
>> +                                             cpu_to_node(i),
>> +                                             __builtin_return_address(0));
> 
> When would anyone ever call this with atomic? This is just for your
> test isn't it?
> 
>> +               if (!slot)
>> +                       return -ENOMEM;
>> +
>> +               memset(slot, 0, size);
>> +               pool->cpu_slots[i] = slot;
>> +
>> +               objpool_init_percpu_slot(pool, slot);
>> +       }
>> +
>> +       return 0;
>> +}

...

>> +/* release whole objpool forcely */
>> +static void objpool_free(struct objpool_head *pool)
>> +{
>> +       if (!pool->cpu_slots)
>> +               return;
>> +
>> +       /* release percpu slots */
>> +       objpool_fini_percpu_slots(pool);
>> +}
>> +
> 
> Why add all this extra objpool overhead? This seems like overkill for
> what should be a simple test. Seems like you should just need a simple
> array located on one of your CPUs. I'm not sure what is with all the
> extra overhead being added here.

As mentioned in the commit log:
"We may refactor out the common part between objpool and ptrpool
if this ptrpool thing turns out to be helpful for other place."

The next thing I am trying to do is to use ptrpool to optimization
the pcp for mm subsystem. so I would rather not tailor the ptrpool
for page_frag_test, and it doesn't seem to affect the testing that
much.

> 
>> +static struct objpool_head ptr_pool;
>> +static int nr_objs = 512;
>> +static atomic_t nthreads;
>> +static struct completion wait;
>> +static struct page_frag_cache test_frag;
>> +
>> +static int nr_test = 5120000;
>> +module_param(nr_test, int, 0);
>> +MODULE_PARM_DESC(nr_test, "number of iterations to test");
>> +
>> +static bool test_align;
>> +module_param(test_align, bool, 0);
>> +MODULE_PARM_DESC(test_align, "use align API for testing");
>> +
>> +static int test_alloc_len = 2048;
>> +module_param(test_alloc_len, int, 0);
>> +MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
>> +
>> +static int test_push_cpu;
>> +module_param(test_push_cpu, int, 0);
>> +MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
>> +
>> +static int test_pop_cpu;
>> +module_param(test_pop_cpu, int, 0);
>> +MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
>> +
>> +static int page_frag_pop_thread(void *arg)
>> +{
>> +       struct objpool_head *pool = arg;
>> +       int nr = nr_test;
>> +
>> +       pr_info("page_frag pop test thread begins on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       while (nr > 0) {
>> +               void *obj = objpool_pop(pool);
>> +
>> +               if (obj) {
>> +                       nr--;
>> +                       page_frag_free(obj);
>> +               } else {
>> +                       cond_resched();
>> +               }
>> +       }
>> +
>> +       if (atomic_dec_and_test(&nthreads))
>> +               complete(&wait);
>> +
>> +       pr_info("page_frag pop test thread exits on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       return 0;
>> +}
>> +
>> +static int page_frag_push_thread(void *arg)
>> +{
>> +       struct objpool_head *pool = arg;
>> +       int nr = nr_test;
>> +
>> +       pr_info("page_frag push test thread begins on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       while (nr > 0) {
>> +               void *va;
>> +               int ret;
>> +
>> +               if (test_align) {
>> +                       va = page_frag_alloc_align(&test_frag, test_alloc_len,
>> +                                                  GFP_KERNEL, SMP_CACHE_BYTES);
>> +
>> +                       WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
>> +                                 "unaligned va returned\n");
>> +               } else {
>> +                       va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
>> +               }
>> +
>> +               if (!va)
>> +                       continue;
>> +
>> +               ret = objpool_push(va, pool);
>> +               if (ret) {
>> +                       page_frag_free(va);
>> +                       cond_resched();
>> +               } else {
>> +                       nr--;
>> +               }
>> +       }
>> +
>> +       pr_info("page_frag push test thread exits on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       if (atomic_dec_and_test(&nthreads))
>> +               complete(&wait);
>> +
>> +       return 0;
>> +}
>> +
> 
> So looking over these functions they seem to overlook how the network
> stack works in many cases. One of the main motivations for the page
> frags approach is page recycling. For example with GRO enabled the
> headers allocated to record the frags might be freed for all but the
> first. As such you can end up with 17 fragments being allocated, and
> 16 freed within the same thread as NAPI will just be recycling the
> buffers.
> 
> With this setup it doesn't seem very likely to be triggered since you
> are operating in two threads. One test you might want to look at
> adding is a test where you are allocating and freeing in the same
> thread at a fairly constant rate to test against the "ideal" scenario.

I am not sure if the above is still the "ideal" scenario, as you mentioned
that most drivers are turning to use page_pool for rx, the page frag is really
mostly for tx or skb->data for rx.

> 



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-22 15:32       ` Alexander Duyck
@ 2024-07-23 13:19         ` Yunsheng Lin
  2024-07-30 13:20           ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-23 13:19 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/22 23:32, Alexander Duyck wrote:
> On Mon, Jul 22, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/22 7:40, Alexander H Duyck wrote:
>>> On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
>>>> Refactor common codes from __page_frag_alloc_va_align()
>>>> to __page_frag_cache_refill(), so that the new API can
>>>> make use of them.
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  include/linux/page_frag_cache.h |  2 +-
>>>>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>>>>  2 files changed, 49 insertions(+), 46 deletions(-)
>>>>
>>>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>>>> index 12a16f8e8ad0..5aa45de7a9a5 100644
>>>> --- a/include/linux/page_frag_cache.h
>>>> +++ b/include/linux/page_frag_cache.h
>>>> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>>>
>>>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>>>  {
>>>> -    nc->encoded_va = 0;
>>>> +    memset(nc, 0, sizeof(*nc));
>>>>  }
>>>>
>>>
>>> I do not like requiring the entire structure to be reset as a part of
>>> init. If encoded_va is 0 then we have reset the page and the flags.
>>> There shouldn't be anything else we need to reset as remaining and bias
>>> will be reset when we reallocate.
>>
>> The argument is about avoiding one check in the fast path by doing the
>> memset in the slow path, which you might already know according to your
>> comment in the previous version.
>>
>> It is just sometimes hard to understand your preference for maintainability
>> over performance here as sometimes your comment seems to prefer performance
>> over maintainability, like the LEA trick you mentioned and offset count-down
>> before this patchset. It would be good to be more consistent about this,
>> otherwise it is sometimes confusing when doing the refactoring.
> 
> The use of a negative offset is arguably more maintainable in my mind
> rather than being a performance trick. Essentially if you use the
> negative value you can just mask off the upper bits and it is the
> offset in the page. As such it is actually easier for me to read
> versus "remaining" which is an offset from the end of the page.
> Assuming you read the offset in hex anyway.

Reading the above doesn't seem maintainable to me:(

> 
>>>
>>>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index 7928e5d50711..d9c9cad17af7 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -19,6 +19,28 @@
>>>>  #include <linux/page_frag_cache.h>
>>>>  #include "internal.h"
>>>>
>>>> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
>>>> +{
>>>> +    unsigned long encoded_va = nc->encoded_va;
>>>> +    struct page *page;
>>>> +
>>>> +    page = virt_to_page((void *)encoded_va);
>>>> +    if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>> +            return NULL;
>>>> +
>>>> +    if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>>>> +            VM_BUG_ON(compound_order(page) !=
>>>> +                      encoded_page_order(encoded_va));
>>>> +            free_unref_page(page, encoded_page_order(encoded_va));
>>>> +            return NULL;
>>>> +    }
>>>> +
>>>> +    /* OK, page count is 0, we can safely set it */
>>>> +    set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>>> +
>>>> +    return page;
>>>> +}
>>>> +
>>>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>                                           gfp_t gfp_mask)
>>>>  {
>>>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      struct page *page = NULL;
>>>>      gfp_t gfp = gfp_mask;
>>>>
>>>> +    if (likely(nc->encoded_va)) {
>>>> +            page = __page_frag_cache_recharge(nc);
>>>> +            if (page) {
>>>> +                    order = encoded_page_order(nc->encoded_va);
>>>> +                    goto out;
>>>> +            }
>>>> +    }
>>>> +
>>>
>>> This code has no business here. This is refill, you just dropped
>>> recharge in here which will make a complete mess of the ordering and be
>>> confusing to say the least.
>>>
>>> The expectation was that if we are calling this function it is going to
>>> overwrite the virtual address to NULL on failure so we discard the old
>>> page if there is one present. This changes that behaviour. What you
>>> effectively did is made __page_frag_cache_refill into the recharge
>>> function.
>>
>> The idea is to reuse the below for both __page_frag_cache_refill() and
>> __page_frag_cache_recharge(), which seems to be about maintainability
>> by not having duplicated code. If there is a better idea to avoid that
>> duplicated code while keeping the old behaviour, I am happy to change
>> it.
>>
>>         /* reset page count bias and remaining to start of new frag */
>>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>         nc->remaining = PAGE_SIZE << order;
>>
> 
> The only piece that is really reused here is the pagecnt_bias
> assignment. What is obfuscated away is that the order is gotten
> through one of two paths. Really order isn't order here it is size.
> Which should have been fetched already. What you end up doing with
> this change is duplicating a bunch of code throughout the function.
> You end up having to fetch size multiple times multiple ways. here you
> are generating it with order. Then you have to turn around and get it
> again at the start of the function, and again after calling this
> function in order to pull it back out.

I am assuming you would like to preserve the old behavior as below?

	if(!encoded_va) {
refill:
		__page_frag_cache_refill()
	}


	if(remaining < fragsz) {
		if(!__page_frag_cache_recharge())
			goto refill;
	}

As we are adding new APIs, are we expecting the new APIs to also duplicate
the above pattern?

> 
>>>
>>>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>>      gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>>>                 __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>>> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      if (unlikely(!page)) {
>>>>              page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>>>              if (unlikely(!page)) {
>>>> -                    nc->encoded_va = 0;
>>>> +                    memset(nc, 0, sizeof(*nc));
>>>>                      return NULL;
>>>>              }
>>>>
>>>
>>> The memset will take a few more instructions than the existing code
>>> did. I would prefer to keep this as is if at all possible.
>>
>> It will not take more instructions for arm64 as it has the 'stp' instruction
>> when __HAVE_ARCH_MEMSET is set.
>> Is there something similar for x64?
> 
> The x64 does not last I knew without getting into the SSE/AVX type
> stuff. This becomes two seperate 8B store instructions.

I can check later when I get hold of a x64 server.
But doesn't it make sense to have one extra 8B store instruction in the slow
path to avoid a check in the fast path?

> 
>>>
>>>> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      nc->encoded_va = encode_aligned_va(page_address(page), order,
>>>>                                         page_is_pfmemalloc(page));
>>>>
>>>> +    /* Even if we own the page, we do not use atomic_set().
>>>> +     * This would break get_page_unless_zero() users.
>>>> +     */
>>>> +    page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>>> +
>>>> +out:
>>>> +    /* reset page count bias and remaining to start of new frag */
>>>> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> +    nc->remaining = PAGE_SIZE << order;
>>>> +
>>>>      return page;
>>>>  }
>>>>
>>>
>>> Why bother returning a page at all? It doesn't seem like you don't use
>>> it anymore. It looks like the use cases you have for it in patch 11/12
>>> all appear to be broken from what I can tell as you are adding page as
>>> a variable when we don't need to be passing internal details to the
>>> callers of the function when just a simple error return code would do.
>>
>> It would be good to be more specific about the 'broken' part here.
> 
> We are passing internals to the caller. Basically this is generally
> frowned upon for many implementations of things as the general idea is
> that the internal page we are using should be a pseudo-private value.

It is an implementation detail and it is about avoiding calling virt_to_page()
as mentioned below. I am not sure why it is referred to as 'broken'; it would
be better to provide more doc about why it is a bad idea here, as using the
'pseudo-private' wording doesn't seem to justify the 'broken' part here.

> I understand that you have one or two callers that need it for the use
> cases you have in patches 11/12, but it also seems like you are just
> passing it regardless. For example I noticed in a few cases you added
> the page pointer in 12 to handle the return value, but then just used
> it to check for NULL. My thought would be that rather than returning
> the page here you would be better off just returning 0 or an error and
> then doing the virt_to_page translation for all the cases where the
> page is actually needed since you have to go that route for a cached
> page anyway.

Yes, it is about avoiding calling virt_to_page() as much as possible.




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
  2024-07-21 21:41   ` Alexander H Duyck
@ 2024-07-24 12:54     ` Yunsheng Lin
  2024-07-24 15:03       ` Alexander Duyck
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-24 12:54 UTC (permalink / raw)
  To: Alexander H Duyck, davem, kuba, pabeni
  Cc: netdev, linux-kernel, Andrew Morton, linux-mm

On 2024/7/22 5:41, Alexander H Duyck wrote:

...

>>  	if (unlikely(!page)) {
>> -		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>> +		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
>>  		if (unlikely(!page)) {
>>  			memset(nc, 0, sizeof(*nc));
>>  			return NULL;
> 
> So if I am understanding correctly this is basically just stripping the
> checks that were being performed since they aren't really needed to
> verify the output of numa_mem_id.
> 
> Rather than changing the code here, it might make more sense to update
> alloc_pages_node_noprof to move the lines from
> __alloc_pages_node_noprof into it. Then you could put the VM_BUG_ON and
> warn_if_node_offline into an else statement which would cause them to
> be automatically stripped for this and all other callers. The benefit

I suppose you meant something like below:

@@ -290,10 +290,14 @@ struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
 static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                   unsigned int order)
 {
-       if (nid == NUMA_NO_NODE)
+       if (nid == NUMA_NO_NODE) {
                nid = numa_mem_id();
+       } else {
+               VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+               warn_if_node_offline(nid, gfp_mask);
+       }

-       return __alloc_pages_node_noprof(nid, gfp_mask, order);
+       return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
 }


> would likely be much more significant and may be worthy of being
> accepted on its own merit without being a part of this patch set as I
> would imagine it would show slight gains in terms of performance and
> binary size by dropping the unnecessary instructions.

Below is the result: it does reduce the binary size for
__page_frag_alloc_align() significantly as expected, but also
increases the size of other functions, which seem to be passing
a runtime nid, so the trick above doesn't work for them. I am not
sure if the overall reduction is significant enough to justify the
change. It seems that depends on how many future callers are passing
a runtime nid to the alloc_pages_node() related APIs.

[linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
add/remove: 1/2 grow/shrink: 13/8 up/down: 160/-256 (-96)
Function                                     old     new   delta
bpf_map_alloc_pages                          708     764     +56
its_probe_one                               2836    2860     +24
iommu_dma_alloc                              984    1008     +24
__iommu_dma_alloc_noncontiguous.constprop    1180    1192     +12
e843419@0f3f_00011fb1_4348                     -       8      +8
its_vpe_irq_domain_deactivate                312     316      +4
its_vpe_irq_domain_alloc                    1492    1496      +4
its_irq_domain_free                          440     444      +4
iommu_dma_map_sg                            1328    1332      +4
dpaa_eth_probe                              5524    5528      +4
dpaa2_eth_xdp_xmit                           676     680      +4
dpaa2_eth_open                               564     568      +4
dma_direct_get_required_mask                 116     120      +4
__dma_direct_alloc_pages.constprop           656     660      +4
its_vpe_set_affinity                         928     924      -4
its_send_single_command                      340     336      -4
its_alloc_table_entry                        456     452      -4
dpaa_bp_seed                                 232     228      -4
arm_64_lpae_alloc_pgtable_s1                 680     676      -4
__arm_lpae_alloc_pages                       900     896      -4
e843419@0473_00005079_16ec                     8       -      -8
e843419@0189_00001c33_1c8                      8       -      -8
ringbuf_map_alloc                            612     600     -12
__page_frag_alloc_align                      740     536    -204
Total: Before=30306836, After=30306740, chg -0.00%




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
  2024-07-24 12:54     ` Yunsheng Lin
@ 2024-07-24 15:03       ` Alexander Duyck
  2024-07-25 12:19         ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Duyck @ 2024-07-24 15:03 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Wed, Jul 24, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/22 5:41, Alexander H Duyck wrote:
>
> ...
>
> >>      if (unlikely(!page)) {
> >> -            page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> >> +            page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
> >>              if (unlikely(!page)) {
> >>                      memset(nc, 0, sizeof(*nc));
> >>                      return NULL;
> >
> > So if I am understanding correctly this is basically just stripping the
> > checks that were being performed since they aren't really needed to
> > verify the output of numa_mem_id.
> >
> > Rather than changing the code here, it might make more sense to update
> > alloc_pages_node_noprof to move the lines from
> > __alloc_pages_node_noprof into it. Then you could put the VM_BUG_ON and
> > warn_if_node_offline into an else statement which would cause them to
> > be automatically stripped for this and all other callers. The benefit
>
> I suppose you meant something like below:
>
> @@ -290,10 +290,14 @@ struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
>  static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
>                                                    unsigned int order)
>  {
> -       if (nid == NUMA_NO_NODE)
> +       if (nid == NUMA_NO_NODE) {
>                 nid = numa_mem_id();
> +       } else {
> +               VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
> +               warn_if_node_offline(nid, gfp_mask);
> +       }
>
> -       return __alloc_pages_node_noprof(nid, gfp_mask, order);
> +       return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
>  }

Yes, that is more or less what I was thinking.

> > would likely be much more significant and may be worthy of being
> > accepted on its own merit without being a part of this patch set as I
> > would imagine it would show slight gains in terms of performance and
> > binary size by dropping the unnecessary instructions.
>
> Below is the result, it does reduce the binary size for
> __page_frag_alloc_align() significantly as expected, but also
> increase the size for other functions, which seems to be passing
> a runtime nid, so the trick above doesn't work. I am not sure if
> the overall reduction is significant enough to justify the change?
> It seems that depends on how many future callers are passing runtime
> nid to alloc_pages_node() related APIs.
>
> [linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
> add/remove: 1/2 grow/shrink: 13/8 up/down: 160/-256 (-96)
> Function                                     old     new   delta
> bpf_map_alloc_pages                          708     764     +56
> its_probe_one                               2836    2860     +24
> iommu_dma_alloc                              984    1008     +24
> __iommu_dma_alloc_noncontiguous.constprop    1180    1192     +12
> e843419@0f3f_00011fb1_4348                     -       8      +8
> its_vpe_irq_domain_deactivate                312     316      +4
> its_vpe_irq_domain_alloc                    1492    1496      +4
> its_irq_domain_free                          440     444      +4
> iommu_dma_map_sg                            1328    1332      +4
> dpaa_eth_probe                              5524    5528      +4
> dpaa2_eth_xdp_xmit                           676     680      +4
> dpaa2_eth_open                               564     568      +4
> dma_direct_get_required_mask                 116     120      +4
> __dma_direct_alloc_pages.constprop           656     660      +4
> its_vpe_set_affinity                         928     924      -4
> its_send_single_command                      340     336      -4
> its_alloc_table_entry                        456     452      -4
> dpaa_bp_seed                                 232     228      -4
> arm_64_lpae_alloc_pgtable_s1                 680     676      -4
> __arm_lpae_alloc_pages                       900     896      -4
> e843419@0473_00005079_16ec                     8       -      -8
> e843419@0189_00001c33_1c8                      8       -      -8
> ringbuf_map_alloc                            612     600     -12
> __page_frag_alloc_align                      740     536    -204
> Total: Before=30306836, After=30306740, chg -0.00%

I'm assuming the compiler must have uninlined
__alloc_pages_node_noprof in the previous version of things for the
cases where it is causing an increase in the code size.

One alternative approach we could look at doing would be to just add
the following to the start of the function:
if (__builtin_constant_p(nid) && nid == NUMA_NO_NODE)
        return __alloc_pages_noprof(gfp_mask, order, numa_mem_id(), NULL);

That should yield the best result as it essentially skips over the
problematic code at compile time for the constant case, otherwise the
code should be fully stripped so it shouldn't add any additional
overhead.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
  2024-07-24 15:03       ` Alexander Duyck
@ 2024-07-25 12:19         ` Yunsheng Lin
  2024-08-14 18:34           ` Alexander H Duyck
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-25 12:19 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/24 23:03, Alexander Duyck wrote:
> On Wed, Jul 24, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/22 5:41, Alexander H Duyck wrote:
>>
>> ...
>>
>>>>      if (unlikely(!page)) {
>>>> -            page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>>> +            page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
>>>>              if (unlikely(!page)) {
>>>>                      memset(nc, 0, sizeof(*nc));
>>>>                      return NULL;
>>>
>>> So if I am understanding correctly this is basically just stripping the
>>> checks that were being performed since they aren't really needed to
>>> verify the output of numa_mem_id.
>>>
>>> Rather than changing the code here, it might make more sense to update
>>> alloc_pages_node_noprof to move the lines from
>>> __alloc_pages_node_noprof into it. Then you could put the VM_BUG_ON and
>>> warn_if_node_offline into an else statement which would cause them to
>>> be automatically stripped for this and all other callers. The benefit
>>
>> I suppose you meant something like below:
>>
>> @@ -290,10 +290,14 @@ struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
>>  static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
>>                                                    unsigned int order)
>>  {
>> -       if (nid == NUMA_NO_NODE)
>> +       if (nid == NUMA_NO_NODE) {
>>                 nid = numa_mem_id();
>> +       } else {
>> +               VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>> +               warn_if_node_offline(nid, gfp_mask);
>> +       }
>>
>> -       return __alloc_pages_node_noprof(nid, gfp_mask, order);
>> +       return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
>>  }
> 
> Yes, that is more or less what I was thinking.
> 
>>> would likely be much more significant and may be worthy of being
>>> accepted on its own merit without being a part of this patch set as I
>>> would imagine it would show slight gains in terms of performance and
>>> binary size by dropping the unnecessary instructions.
>>
>> Below is the result, it does reduce the binary size for
>> __page_frag_alloc_align() significantly as expected, but also
>> increase the size for other functions, which seems to be passing
>> a runtime nid, so the trick above doesn't work. I am not sure if
>> the overall reduction is significant enough to justify the change?
>> It seems that depends on how many future callers are passing runtime
>> nid to alloc_pages_node() related APIs.
>>
>> [linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
>> add/remove: 1/2 grow/shrink: 13/8 up/down: 160/-256 (-96)
>> Function                                     old     new   delta
>> bpf_map_alloc_pages                          708     764     +56
>> its_probe_one                               2836    2860     +24
>> iommu_dma_alloc                              984    1008     +24
>> __iommu_dma_alloc_noncontiguous.constprop    1180    1192     +12
>> e843419@0f3f_00011fb1_4348                     -       8      +8
>> its_vpe_irq_domain_deactivate                312     316      +4
>> its_vpe_irq_domain_alloc                    1492    1496      +4
>> its_irq_domain_free                          440     444      +4
>> iommu_dma_map_sg                            1328    1332      +4
>> dpaa_eth_probe                              5524    5528      +4
>> dpaa2_eth_xdp_xmit                           676     680      +4
>> dpaa2_eth_open                               564     568      +4
>> dma_direct_get_required_mask                 116     120      +4
>> __dma_direct_alloc_pages.constprop           656     660      +4
>> its_vpe_set_affinity                         928     924      -4
>> its_send_single_command                      340     336      -4
>> its_alloc_table_entry                        456     452      -4
>> dpaa_bp_seed                                 232     228      -4
>> arm_64_lpae_alloc_pgtable_s1                 680     676      -4
>> __arm_lpae_alloc_pages                       900     896      -4
>> e843419@0473_00005079_16ec                     8       -      -8
>> e843419@0189_00001c33_1c8                      8       -      -8
>> ringbuf_map_alloc                            612     600     -12
>> __page_frag_alloc_align                      740     536    -204
>> Total: Before=30306836, After=30306740, chg -0.00%
> 
> I'm assuming the compiler must have uninlined
> __alloc_pages_node_noprof in the previous version of things for the
> cases where it is causing an increase in the code size.
> 
> One alternative approach we could look at doing would be to just add
> the following to the start of the function:
> if (__builtin_constant_p(nid) && nid == NUMA_NO_NODE)
>         return __alloc_pages_noprof(gfp_mask, order, numa_mem_id(), NULL);
> 
> That should yield the best result as it essentially skips over the
> problematic code at compile time for the constant case, otherwise the
> code should be fully stripped so it shouldn't add any additional
> overhead.

Just tried it; it seems it is more complicated than expected too.
For example, the above change seems to cause alloc_slab_page() to be
inlined into new_slab(), and other inlining/uninlining that is hard to
understand.

[linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
add/remove: 1/2 grow/shrink: 16/11 up/down: 432/-536 (-104)
Function                                     old     new   delta
new_slab                                     808    1124    +316
its_probe_one                               2836    2876     +40
dpaa2_eth_set_dist_key                      1096    1112     +16
e843419@0f3f_00011fb1_4348                     -       8      +8
rx_default_dqrr                             2776    2780      +4
pcpu_unmap_pages                             356     360      +4
its_vpe_irq_domain_alloc                    1492    1496      +4
iommu_dma_init_fq                            520     524      +4
iommu_dma_alloc                              984     988      +4
hns3_nic_net_timeout                         704     708      +4
hns3_init_all_ring                          1168    1172      +4
hns3_clear_all_ring                          372     376      +4
enetc_refill_rx_ring                         448     452      +4
enetc_free_rxtx_rings                        276     280      +4
dpaa2_eth_xdp_xmit                           676     680      +4
dpaa2_eth_rx                                1716    1720      +4
___slab_alloc                               2120    2124      +4
pcpu_free_pages.constprop                    236     232      -4
its_alloc_table_entry                        456     452      -4
hns3_reset_notify_init_enet                  628     624      -4
dpaa_cleanup_tx_fd                           556     552      -4
dpaa_bp_seed                                 232     228      -4
blk_update_request                           944     940      -4
blk_execute_rq                               540     536      -4
arm_64_lpae_alloc_pgtable_s1                 680     676      -4
__kmalloc_large_node                         340     336      -4
__arm_lpae_unmap                            1588    1584      -4
e843419@0473_00005079_16ec                     8       -      -8
__page_frag_alloc_align                      740     536    -204
alloc_slab_page                              284       -    -284
Total: Before=30306836, After=30306732, chg -0.00%


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API
       [not found]   ` <CAKgT0UcqELiXntRA_uD8eJGjt-OCLO64ax=YFXrCHNnaj9kD8g@mail.gmail.com>
@ 2024-07-25 12:21     ` Yunsheng Lin
  0 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-25 12:21 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Subbaraya Sundeep,
	Jeroen de Borst, Praveen Kaligineedi, Shailend Chand,
	Eric Dumazet, Tony Nguyen, Przemek Kitszel, Sunil Goutham,
	Geetha sowjanya, hariprasad, Felix Fietkau, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Michael S. Tsirkin, Jason Wang,
	Eugenio Pérez, Andrew Morton, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	David Howells, Marc Dionne, Trond Myklebust, Anna Schumaker,
	Chuck Lever, Jeff Layton, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, intel-wired-lan, linux-arm-kernel, linux-mediatek,
	linux-nvme, kvm, virtualization, linux-mm, bpf, linux-afs,
	linux-nfs

On 2024/7/22 4:41, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> Currently the page_frag API is returning 'virtual address'
>> or 'va' when allocing and expecting 'virtual address' or
>> 'va' as input when freeing.
>>
>> As we are about to support new use cases that the caller
>> need to deal with 'struct page' or need to deal with both
>> 'va' and 'struct page'. In order to differentiate the API
>> handling between 'va' and 'struct page', add '_va' suffix
>> to the corresponding API mirroring the page_pool_alloc_va()
>> API of the page_pool. So that callers expecting to deal with
>> va, page or both va and page may call page_frag_alloc_va*,
>> page_frag_alloc_pg*, or page_frag_alloc* API accordingly.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
> 
> Rather than renaming the existing API I would rather see this follow
> the same approach as we use with the other memory subsystem functions.

I am not sure if I understand what 'the other memory subsystem functions'
refers to; it would be better to be more specific about that.

For allocation side:
alloc_pages*()
extern unsigned long get_free_page*(gfp_t gfp_mask, unsigned int order);

For free side, it seems we have:
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
static inline void put_page(struct page *page)

So there seems to be no clear pattern that the mm APIs with a double
underscore are dealing with 'struct page' and the ones without a double
underscore are dealing with virtual addresses, at least not from the
allocation side.

> A specific example being that with free_page it is essentially passed
> a virtual address, while the double underscore version is passed a
> page. I would be more okay with us renaming the double underscore
> version of any functions we might have to address that rather than
> renaming all the functions with "va".

Before this patchset, page_frag has the below APIs as below:

void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
			      gfp_t gfp_mask, unsigned int align_mask);

static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
					  unsigned int fragsz, gfp_t gfp_mask,
					  unsigned int align)

extern void page_frag_free(void *addr);

It would be better to be more specific about what renaming the above
APIs need in order to support the new use cases.

> 
> In general I would say this patch is adding no value as what it is

As above, it would be better to give a more specific suggestion to
back up the above somewhat abstract argument, otherwise it is hard
to tell if there is a better option here, and why it is better than
the one proposed in this patchset.

> doing is essentially pushing the primary users of this API to change
> to support use cases that won't impact most of them. It is just
> creating a ton of noise in terms of changes with no added value so we
> can reuse the function names.


After this patchset, we have the below page_frag APIs:

For allocation side, we have below APIs:
struct page *page_frag_alloc_pg*(struct page_frag_cache *nc,
                                unsigned int *offset, unsigned int fragsz,
                                gfp_t gfp);
void *page_frag_alloc_va*(struct page_frag_cache *nc,
                                 unsigned int fragsz, gfp_t gfp_mask,
                                 unsigned int align_mask);
struct page *page_frag_alloc*(struct page_frag_cache *nc,
                                     unsigned int *offset,
                                     unsigned int fragsz,
                                     void **va, gfp_t gfp);

For free side, we have below API:
void page_frag_free_va(void *addr);

The main rules for the above naming are:
1. The API with 'align' suffix ensure the offset or va is aligned
2. The API with double underscore has no checking for the align parameter.
3. The API with 'va' suffix is dealing with virtual address.
4. The API with 'pg' suffix is dealing with 'struct page'.
5. The API without 'pg' and 'va' suffix is dealing with both 'struct page'
   and virtual address.

Yes, I suppose it is not perfect mainly because we reuse some existing mm
API for page_frag free API.

As mentioned before, I would be happy to change it if what you are proposing
is indeed the better option.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file
  2024-07-21 17:58   ` Alexander Duyck
@ 2024-07-27 15:04     ` Yunsheng Lin
  0 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-27 15:04 UTC (permalink / raw)
  To: Alexander Duyck, Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, David Howells,
	Andrew Morton, linux-mm

On 7/22/2024 1:58 AM, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:37 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

...

>> --- /dev/null
>> +++ b/include/linux/page_frag_cache.h
>> @@ -0,0 +1,32 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +
>> +#ifndef _LINUX_PAGE_FRAG_CACHE_H
>> +#define _LINUX_PAGE_FRAG_CACHE_H
>> +
>> +#include <linux/log2.h>
>> +#include <linux/types.h>
>> +#include <linux/mm_types_task.h>
> 
> You don't need to include mm_types_task.h here. You can just use
> declare "struct page_frag_cache;" as we did before in gfp.h.
> Technically this should be included in mm_types.h so any callers
> making use of these functions would need to make sure to include that
> like we did for gfp.h before anyway.

The probe API is added as an inline helper in patch 11 according to
discussion in [1], so the definition of "struct page_frag_cache" is
needed, so I am not sure what is the point of using
"struct page_frag_cache;" here and then remove it and include
mm_types_task.h in patch 11.

1. 
https://lore.kernel.org/all/cb541985-a06d-7a71-9e6d-38827ccdf875@huawei.com/

> 
>> +#include <asm/page.h>
>> +
> 
> Not sure why this is included here either. From what I can tell there
> isn't anything here using the contents of page.h. I suspect you should
> only need it for the get_order call which would be used in other
> files.

It seems unnecessary, will remove that.

> 
>> +void page_frag_cache_drain(struct page_frag_cache *nc);
>> +void __page_frag_cache_drain(struct page *page, unsigned int count);
>> +void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
>> +                             gfp_t gfp_mask, unsigned int align_mask);
>> +
>> +static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
>> +                                         unsigned int fragsz, gfp_t gfp_mask,
>> +                                         unsigned int align)
>> +{
>> +       WARN_ON_ONCE(!is_power_of_2(align));
>> +       return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
>> +}
>> +
>> +static inline void *page_frag_alloc(struct page_frag_cache *nc,
>> +                                   unsigned int fragsz, gfp_t gfp_mask)
>> +{
>> +       return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
>> +}
>> +
>> +void page_frag_free(void *addr);
>> +
>> +#endif
> 
> ...
> 
>> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
>> index cf2691f60b67..b7a5affb92f2 100644
>> --- a/mm/page_frag_test.c
>> +++ b/mm/page_frag_test.c
>> @@ -6,7 +6,6 @@
>>    * Copyright: linyunsheng@huawei.com
>>    */
>>
>> -#include <linux/mm.h>
>>   #include <linux/module.h>
>>   #include <linux/slab.h>
>>   #include <linux/vmalloc.h>
>> @@ -16,6 +15,7 @@
>>   #include <linux/log2.h>
>>   #include <linux/completion.h>
>>   #include <linux/kthread.h>
>> +#include <linux/page_frag_cache.h>
>>
>>   #define OBJPOOL_NR_OBJECT_MAX  BIT(24)
> 
> Rather than making users have to include page_frag_cache.h I think it
> would be better for us to just maintain the code as being accessible
> from mm.h. So it might be better to just add page_frag_cache.h to the
> includes there.

It would be better to list out why it is better that way as I am failing
to see it that way yet as I think it is better to use the explicit
header file instead the implicit header file.


> 



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-23 13:19         ` Yunsheng Lin
@ 2024-07-30 13:20           ` Yunsheng Lin
  2024-07-30 15:12             ` Alexander H Duyck
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-30 13:20 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/23 21:19, Yunsheng Lin wrote:

...

>>>>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>>                                           gfp_t gfp_mask)
>>>>>  {
>>>>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>>      struct page *page = NULL;
>>>>>      gfp_t gfp = gfp_mask;
>>>>>
>>>>> +    if (likely(nc->encoded_va)) {
>>>>> +            page = __page_frag_cache_recharge(nc);
>>>>> +            if (page) {
>>>>> +                    order = encoded_page_order(nc->encoded_va);
>>>>> +                    goto out;
>>>>> +            }
>>>>> +    }
>>>>> +
>>>>
>>>> This code has no business here. This is refill, you just dropped
>>>> recharge in here which will make a complete mess of the ordering and be
>>>> confusing to say the least.
>>>>
>>>> The expectation was that if we are calling this function it is going to
>>>> overwrite the virtual address to NULL on failure so we discard the old
>>>> page if there is one present. This changes that behaviour. What you
>>>> effectively did is made __page_frag_cache_refill into the recharge
>>>> function.
>>>
>>> The idea is to reuse the below for both __page_frag_cache_refill() and
>>> __page_frag_cache_recharge(), which seems to be about maintainability
>>> to not having duplicated code. If there is a better idea to avoid that
>>> duplicated code while keeping the old behaviour, I am happy to change
>>> it.
>>>
>>>         /* reset page count bias and remaining to start of new frag */
>>>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>         nc->remaining = PAGE_SIZE << order;
>>>
>>
>> The only piece that is really reused here is the pagecnt_bias
>> assignment. What is obfuscated away is that the order is gotten
>> through one of two paths. Really order isn't order here it is size.
>> Which should have been fetched already. What you end up doing with
>> this change is duplicating a bunch of code throughout the function.
>> You end up having to fetch size multiple times multiple ways. here you
>> are generating it with order. Then you have to turn around and get it
>> again at the start of the function, and again after calling this
>> function in order to pull it back out.
> 
> I am assuming you would like to reserve old behavior as below?
> 
> 	if(!encoded_va) {
> refill:
> 		__page_frag_cache_refill()
> 	}
> 
> 
> 	if(remaining < fragsz) {
> 		if(!__page_frag_cache_recharge())
> 			goto refill;
> 	}
> 
> As we are adding new APIs, are we expecting new APIs also duplicate
> the above pattern?
> 
>>

How about something like below? __page_frag_cache_refill() and
__page_frag_cache_reuse() does what their function name suggests
as much as possible, __page_frag_cache_reload() is added to avoid
new APIs duplicating similar pattern as much as possible, also
avoid fetching size multiple times multiple ways as much as possible.

static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
                                            unsigned int pagecnt_bias)
{
        struct page *page;

        page = virt_to_page((void *)encoded_va);
        if (!page_ref_sub_and_test(page, pagecnt_bias))
                return NULL;

        if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
                VM_BUG_ON(compound_order(page) !=
                          encoded_page_order(encoded_va));
                free_unref_page(page, encoded_page_order(encoded_va));
                return NULL;
        }

        /* OK, page count is 0, we can safely set it */
        set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
        return page;
}

static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
                                             gfp_t gfp_mask)
{
        unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
        struct page *page = NULL;
        gfp_t gfp = gfp_mask;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
        gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
                   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
        page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
                                PAGE_FRAG_CACHE_MAX_ORDER);
#endif
        if (unlikely(!page)) {
                page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
                if (unlikely(!page)) {
                        memset(nc, 0, sizeof(*nc));
                        return NULL;
                }

                order = 0;
        }

        nc->encoded_va = encode_aligned_va(page_address(page), order,
                                           page_is_pfmemalloc(page));

        /* Even if we own the page, we do not use atomic_set().
         * This would break get_page_unless_zero() users.
         */
        page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);

        return page;
}

static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
                                             gfp_t gfp_mask)
{
        struct page *page;

        if (likely(nc->encoded_va)) {
                page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
                if (page)
                        goto out;
        }

        page = __page_frag_cache_refill(nc, gfp_mask);
        if (unlikely(!page))
                return NULL;

out:
        nc->remaining = page_frag_cache_page_size(nc->encoded_va);
        nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
        return page;
}

void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
                                 unsigned int fragsz, gfp_t gfp_mask,
                                 unsigned int align_mask)
{
        unsigned long encoded_va = nc->encoded_va;
        unsigned int remaining;

        remaining = nc->remaining & align_mask;
        if (unlikely(remaining < fragsz)) {
                if (unlikely(fragsz > PAGE_SIZE)) {
                        /*
                         * The caller is trying to allocate a fragment
                         * with fragsz > PAGE_SIZE but the cache isn't big
                         * enough to satisfy the request, this may
                         * happen in low memory conditions.
                         * We don't release the cache page because
                         * it could make memory pressure worse
                         * so we simply return NULL here.
                         */
                        return NULL;
                }

                if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
                        return NULL;

                nc->pagecnt_bias--;
                nc->remaining -= fragsz;

                return encoded_page_address(nc->encoded_va);
        }

        nc->pagecnt_bias--;
        nc->remaining = remaining - fragsz;

        return encoded_page_address(encoded_va) +
                (page_frag_cache_page_size(encoded_va) - remaining);
}





^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-30 13:20           ` Yunsheng Lin
@ 2024-07-30 15:12             ` Alexander H Duyck
  2024-07-31 12:35               ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-30 15:12 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Tue, 2024-07-30 at 21:20 +0800, Yunsheng Lin wrote:
> On 2024/7/23 21:19, Yunsheng Lin wrote:
> > > 

...

> > > The only piece that is really reused here is the pagecnt_bias
> > > assignment. What is obfuscated away is that the order is gotten
> > > through one of two paths. Really order isn't order here it is size.
> > > Which should have been fetched already. What you end up doing with
> > > this change is duplicating a bunch of code throughout the function.
> > > You end up having to fetch size multiple times multiple ways. here you
> > > are generating it with order. Then you have to turn around and get it
> > > again at the start of the function, and again after calling this
> > > function in order to pull it back out.
> > 
> > I am assuming you would like to reserve old behavior as below?
> > 
> > 	if(!encoded_va) {
> > refill:
> > 		__page_frag_cache_refill()
> > 	}
> > 
> > 
> > 	if(remaining < fragsz) {
> > 		if(!__page_frag_cache_recharge())
> > 			goto refill;
> > 	}
> > 
> > As we are adding new APIs, are we expecting new APIs also duplicate
> > the above pattern?
> > 
> > > 
> 
> How about something like below? __page_frag_cache_refill() and
> __page_frag_cache_reuse() does what their function name suggests
> as much as possible, __page_frag_cache_reload() is added to avoid
> new APIs duplicating similar pattern as much as possible, also
> avoid fetching size multiple times multiple ways as much as possible.

This is better. Still though with the code getting so spread out we
probably need to start adding more comments to explain things.

> static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
>                                             unsigned int pagecnt_bias)
> {
>         struct page *page;
> 
>         page = virt_to_page((void *)encoded_va);
>         if (!page_ref_sub_and_test(page, pagecnt_bias))
>                 return NULL;
> 
>         if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>                 VM_BUG_ON(compound_order(page) !=
>                           encoded_page_order(encoded_va));

This VM_BUG_ON here makes no sense. If we are going to have this
anywhere it might make more sense in the cache_refill case below to
verify we are setting the order to match when we are generating the
encoded_va.

>                 free_unref_page(page, encoded_page_order(encoded_va));
>                 return NULL;
>         }
> 
>         /* OK, page count is 0, we can safely set it */
>         set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>         return page;

Why are you returning page here? It isn't used by any of the callers.
We are refilling the page here anyway so any caller should already have
access to the page since it wasn't changed.

> }
> 
> static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
> {
>         unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
>         struct page *page = NULL;
>         gfp_t gfp = gfp_mask;
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>                                 PAGE_FRAG_CACHE_MAX_ORDER);

I suspect the compliler is probably already doing this, but we should
probably not be updating gfp_mask but instead gfp since that is our
local variable.

> #endif
>         if (unlikely(!page)) {
>                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>                 if (unlikely(!page)) {
>                         memset(nc, 0, sizeof(*nc));
>                         return NULL;
>                 }
> 
>                 order = 0;
>         }
> 
>         nc->encoded_va = encode_aligned_va(page_address(page), order,
>                                            page_is_pfmemalloc(page));
> 
>         /* Even if we own the page, we do not use atomic_set().
>          * This would break get_page_unless_zero() users.
>          */
>         page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> 
>         return page;

Again, returning page here doesn't make much sense. You are better off
not exposing internals as you have essentially locked the page down for
use by the frag API so you shouldn't be handing out the page directly
to callers.

> }
> 
> static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
> {
>         struct page *page;
> 
>         if (likely(nc->encoded_va)) {
>                 page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
>                 if (page)
>                         goto out;
>         }
> 
>         page = __page_frag_cache_refill(nc, gfp_mask);
>         if (unlikely(!page))
>                 return NULL;
> 
> out:
>         nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>         return page;

Your one current caller doesn't use the page value provided here. I
would recommend just not bothering with the page variable until you
actually need it.

> }
> 
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned long encoded_va = nc->encoded_va;
>         unsigned int remaining;
> 
>         remaining = nc->remaining & align_mask;
>         if (unlikely(remaining < fragsz)) {

You might as well swap the code paths. It would likely be much easier
to read the case where you are handling remaining >= fragsz in here
rather than having more if statements buried within the if statement.
With that you will have more room for the comment and such below.

>                 if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
>                          * enough to satisfy the request, this may
>                          * happen in low memory conditions.
>                          * We don't release the cache page because
>                          * it could make memory pressure worse
>                          * so we simply return NULL here.
>                          */
>                         return NULL;
>                 }
> 
>                 if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
>                         return NULL;

This is what I am talking about in the earlier comments. You go to the
trouble of returning page through all the callers just to not use it
here. So there isn't any point in passing it through the functions.

> 
>                 nc->pagecnt_bias--;
>                 nc->remaining -= fragsz;
> 
>                 return encoded_page_address(nc->encoded_va);
>         }
> 
>         nc->pagecnt_bias--;
>         nc->remaining = remaining - fragsz;
> 
>         return encoded_page_address(encoded_va) +
>                 (page_frag_cache_page_size(encoded_va) - remaining);

Parenthesis here shouldn't be needed, addition and subtractions
operations can happen in any order with the result coming out the same.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-30 15:12             ` Alexander H Duyck
@ 2024-07-31 12:35               ` Yunsheng Lin
  2024-07-31 17:02                 ` Alexander H Duyck
  0 siblings, 1 reply; 34+ messages in thread
From: Yunsheng Lin @ 2024-07-31 12:35 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/7/30 23:12, Alexander H Duyck wrote:

...

>>         }
>>
>>         nc->pagecnt_bias--;
>>         nc->remaining = remaining - fragsz;
>>
>>         return encoded_page_address(encoded_va) +
>>                 (page_frag_cache_page_size(encoded_va) - remaining);
> 
> Parenthesis here shouldn't be needed, addition and subtractions
> operations can happen in any order with the result coming out the same.

I am playing safe to avoid overflow here, as I am not sure if the allocator
will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
have a overflow.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-31 12:35               ` Yunsheng Lin
@ 2024-07-31 17:02                 ` Alexander H Duyck
  2024-08-01 12:53                   ` Yunsheng Lin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander H Duyck @ 2024-07-31 17:02 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Wed, 2024-07-31 at 20:35 +0800, Yunsheng Lin wrote:
> On 2024/7/30 23:12, Alexander H Duyck wrote:
> 
> ...
> 
> > >         }
> > > 
> > >         nc->pagecnt_bias--;
> > >         nc->remaining = remaining - fragsz;
> > > 
> > >         return encoded_page_address(encoded_va) +
> > >                 (page_frag_cache_page_size(encoded_va) - remaining);
> > 
> > Parenthesis here shouldn't be needed, addition and subtractions
> > operations can happen in any order with the result coming out the same.
> 
> I am playing safe to avoid overflow here, as I am not sure if the allocator
> will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
> have a overflow.

So what if it does though? When you subtract remaining it will
underflow and go back to the correct value shouldn't it?


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API
  2024-07-31 17:02                 ` Alexander H Duyck
@ 2024-08-01 12:53                   ` Yunsheng Lin
  0 siblings, 0 replies; 34+ messages in thread
From: Yunsheng Lin @ 2024-08-01 12:53 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On 2024/8/1 1:02, Alexander H Duyck wrote:
> On Wed, 2024-07-31 at 20:35 +0800, Yunsheng Lin wrote:
>> On 2024/7/30 23:12, Alexander H Duyck wrote:
>>
>> ...
>>
>>>>         }
>>>>
>>>>         nc->pagecnt_bias--;
>>>>         nc->remaining = remaining - fragsz;
>>>>
>>>>         return encoded_page_address(encoded_va) +
>>>>                 (page_frag_cache_page_size(encoded_va) - remaining);
>>>
>>> Parenthesis here shouldn't be needed, addition and subtractions
>>> operations can happen in any order with the result coming out the same.
>>
>> I am playing safe to avoid overflow here, as I am not sure if the allocator
>> will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
>> have a overflow.
> 
> So what if it does though? When you subtract remaining it will
> underflow and go back to the correct value shouldn't it?

I guess that it is true that underflow will bring back the correct value.
But I am not sure what it hurts to have the parentheses here; doesn't having
the parentheses make it more obvious that 'size - remaining' indicates the offset
of the allocated fragment, without having to scratch my head wondering if there
is an overflow/underflow problem? Or is there any performance trick behind the above
comment?


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node()
  2024-07-25 12:19         ` Yunsheng Lin
@ 2024-08-14 18:34           ` Alexander H Duyck
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander H Duyck @ 2024-08-14 18:34 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: davem, kuba, pabeni, netdev, linux-kernel, Andrew Morton,
	linux-mm

On Thu, 2024-07-25 at 20:19 +0800, Yunsheng Lin wrote:
> On 2024/7/24 23:03, Alexander Duyck wrote:
> > On Wed, Jul 24, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> > > 
> > > On 2024/7/22 5:41, Alexander H Duyck wrote:
> > > 
> > > ...
> > > 
> > > > >      if (unlikely(!page)) {
> > > > > -            page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > > > > +            page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
> > > > >              if (unlikely(!page)) {
> > > > >                      memset(nc, 0, sizeof(*nc));
> > > > >                      return NULL;
> > > > 
> > > > So if I am understanding correctly this is basically just stripping the
> > > > checks that were being performed since they aren't really needed to
> > > > verify the output of numa_mem_id.
> > > > 
> > > > Rather than changing the code here, it might make more sense to update
> > > > alloc_pages_node_noprof to move the lines from
> > > > __alloc_pages_node_noprof into it. Then you could put the VM_BUG_ON and
> > > > warn_if_node_offline into an else statement which would cause them to
> > > > be automatically stripped for this and all other callers. The benefit
> > > 
> > > I suppose you meant something like below:
> > > 
> > > @@ -290,10 +290,14 @@ struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
> > >  static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
> > >                                                    unsigned int order)
> > >  {
> > > -       if (nid == NUMA_NO_NODE)
> > > +       if (nid == NUMA_NO_NODE) {
> > >                 nid = numa_mem_id();
> > > +       } else {
> > > +               VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
> > > +               warn_if_node_offline(nid, gfp_mask);
> > > +       }
> > > 
> > > -       return __alloc_pages_node_noprof(nid, gfp_mask, order);
> > > +       return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
> > >  }
> > 
> > Yes, that is more or less what I was thinking.
> > 
> > > > would likely be much more significant and may be worthy of being
> > > > accepted on its own merit without being a part of this patch set as I
> > > > would imagine it would show slight gains in terms of performance and
> > > > binary size by dropping the unnecessary instructions.
> > > 
> > > Below is the result, it does reduce the binary size for
> > > __page_frag_alloc_align() significantly as expected, but also
> > > increase the size for other functions, which seems to be passing
> > > a runtime nid, so the trick above doesn't work. I am not sure if
> > > the overall reduction is significant enough to justify the change?
> > > It seems that depends on how many future callers are passing runtime
> > > nid to alloc_pages_node() related APIs.
> > > 
> > > [linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
> > > add/remove: 1/2 grow/shrink: 13/8 up/down: 160/-256 (-96)
> > > Function                                     old     new   delta
> > > bpf_map_alloc_pages                          708     764     +56
> > > its_probe_one                               2836    2860     +24
> > > iommu_dma_alloc                              984    1008     +24
> > > __iommu_dma_alloc_noncontiguous.constprop    1180    1192     +12
> > > e843419@0f3f_00011fb1_4348                     -       8      +8
> > > its_vpe_irq_domain_deactivate                312     316      +4
> > > its_vpe_irq_domain_alloc                    1492    1496      +4
> > > its_irq_domain_free                          440     444      +4
> > > iommu_dma_map_sg                            1328    1332      +4
> > > dpaa_eth_probe                              5524    5528      +4
> > > dpaa2_eth_xdp_xmit                           676     680      +4
> > > dpaa2_eth_open                               564     568      +4
> > > dma_direct_get_required_mask                 116     120      +4
> > > __dma_direct_alloc_pages.constprop           656     660      +4
> > > its_vpe_set_affinity                         928     924      -4
> > > its_send_single_command                      340     336      -4
> > > its_alloc_table_entry                        456     452      -4
> > > dpaa_bp_seed                                 232     228      -4
> > > arm_64_lpae_alloc_pgtable_s1                 680     676      -4
> > > __arm_lpae_alloc_pages                       900     896      -4
> > > e843419@0473_00005079_16ec                     8       -      -8
> > > e843419@0189_00001c33_1c8                      8       -      -8
> > > ringbuf_map_alloc                            612     600     -12
> > > __page_frag_alloc_align                      740     536    -204
> > > Total: Before=30306836, After=30306740, chg -0.00%
> > 
> > I'm assuming the compiler must have uninlined
> > __alloc_pages_node_noprof in the previous version of things for the
> > cases where it is causing an increase in the code size.
> > 
> > One alternative approach we could look at doing would be to just add
> > the following to the start of the function:
> > if (__builtin_constant_p(nid) && nid == NUMA_NO_NODE)
> >         return __alloc_pages_noprof(gfp_mask, order, numa_mem_id(), NULL);
> > 
> > That should yield the best result as it essentially skips over the
> > problematic code at compile time for the constant case, otherwise the
> > code should be fully stripped so it shouldn't add any additional
> > overhead.
> 
> Just tried it; it seems it is more complicated than expected too.
> For example, the above change seems to cause alloc_slab_page() to be
> inlined into new_slab(), along with other inlining/uninlining that is hard to
> understand.
> 
> [linyunsheng@localhost net-next]$ ./scripts/bloat-o-meter vmlinux.org vmlinux
> add/remove: 1/2 grow/shrink: 16/11 up/down: 432/-536 (-104)
> Function                                     old     new   delta
> new_slab                                     808    1124    +316
> its_probe_one                               2836    2876     +40
> dpaa2_eth_set_dist_key                      1096    1112     +16
> e843419@0f3f_00011fb1_4348                     -       8      +8
> rx_default_dqrr                             2776    2780      +4
> pcpu_unmap_pages                             356     360      +4
> its_vpe_irq_domain_alloc                    1492    1496      +4
> iommu_dma_init_fq                            520     524      +4
> iommu_dma_alloc                              984     988      +4
> hns3_nic_net_timeout                         704     708      +4
> hns3_init_all_ring                          1168    1172      +4
> hns3_clear_all_ring                          372     376      +4
> enetc_refill_rx_ring                         448     452      +4
> enetc_free_rxtx_rings                        276     280      +4
> dpaa2_eth_xdp_xmit                           676     680      +4
> dpaa2_eth_rx                                1716    1720      +4
> ___slab_alloc                               2120    2124      +4
> pcpu_free_pages.constprop                    236     232      -4
> its_alloc_table_entry                        456     452      -4
> hns3_reset_notify_init_enet                  628     624      -4
> dpaa_cleanup_tx_fd                           556     552      -4
> dpaa_bp_seed                                 232     228      -4
> blk_update_request                           944     940      -4
> blk_execute_rq                               540     536      -4
> arm_64_lpae_alloc_pgtable_s1                 680     676      -4
> __kmalloc_large_node                         340     336      -4
> __arm_lpae_unmap                            1588    1584      -4
> e843419@0473_00005079_16ec                     8       -      -8
> __page_frag_alloc_align                      740     536    -204
> alloc_slab_page                              284       -    -284
> Total: Before=30306836, After=30306732, chg -0.00%

One interesting similarity between the alloc_slab function and
__page_frag_alloc_align is that they both seem to attempt a higher-order
allocation first, followed by a fallback to a lower-order allocation.

I wonder if we couldn't somehow consolidate the checks and make it so
that we have a function that will provide a page size within a range.



^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2024-08-14 18:34 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20240719093338.55117-1-linyunsheng@huawei.com>
2024-07-19  9:33 ` [RFC v11 01/14] mm: page_frag: add a test module for page_frag Yunsheng Lin
2024-07-21 17:34   ` Alexander Duyck
2024-07-23 13:19     ` Yunsheng Lin
2024-07-19  9:33 ` [RFC v11 02/14] mm: move the page fragment allocator from page_alloc into its own file Yunsheng Lin
2024-07-21 17:58   ` Alexander Duyck
2024-07-27 15:04     ` Yunsheng Lin
2024-07-19  9:33 ` [RFC v11 03/14] mm: page_frag: use initial zero offset for page_frag_alloc_align() Yunsheng Lin
2024-07-21 18:34   ` Alexander Duyck
2024-07-19  9:33 ` [RFC v11 04/14] mm: page_frag: add '_va' suffix to page_frag API Yunsheng Lin
     [not found]   ` <CAKgT0UcqELiXntRA_uD8eJGjt-OCLO64ax=YFXrCHNnaj9kD8g@mail.gmail.com>
2024-07-25 12:21     ` Yunsheng Lin
2024-07-19  9:33 ` [RFC v11 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly Yunsheng Lin
2024-07-21 23:01   ` Alexander H Duyck
2024-07-19  9:33 ` [RFC v11 07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Yunsheng Lin
2024-07-21 22:59   ` Alexander H Duyck
2024-07-19  9:33 ` [RFC v11 08/14] mm: page_frag: some minor refactoring before adding new API Yunsheng Lin
2024-07-21 23:40   ` Alexander H Duyck
2024-07-22 12:55     ` Yunsheng Lin
2024-07-22 15:32       ` Alexander Duyck
2024-07-23 13:19         ` Yunsheng Lin
2024-07-30 13:20           ` Yunsheng Lin
2024-07-30 15:12             ` Alexander H Duyck
2024-07-31 12:35               ` Yunsheng Lin
2024-07-31 17:02                 ` Alexander H Duyck
2024-08-01 12:53                   ` Yunsheng Lin
2024-07-19  9:33 ` [RFC v11 09/14] mm: page_frag: use __alloc_pages() to replace alloc_pages_node() Yunsheng Lin
2024-07-21 21:41   ` Alexander H Duyck
2024-07-24 12:54     ` Yunsheng Lin
2024-07-24 15:03       ` Alexander Duyck
2024-07-25 12:19         ` Yunsheng Lin
2024-08-14 18:34           ` Alexander H Duyck
2024-07-19  9:33 ` [RFC v11 11/14] mm: page_frag: introduce prepare/probe/commit API Yunsheng Lin
2024-07-19  9:33 ` [RFC v11 13/14] mm: page_frag: update documentation for page_frag Yunsheng Lin
     [not found] ` <CAKgT0UcGvrS7=r0OCGZipzBv8RuwYtRwb2QDXqiF4qW5CNws4g@mail.gmail.com>
     [not found]   ` <b2001dba-a2d2-4b49-bc9f-59e175e7bba1@huawei.com>
2024-07-22 15:21     ` [RFC v11 00/14] Replace page_frag with page_frag_cache for sk_page_frag() Alexander Duyck
2024-07-23 13:17       ` Yunsheng Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).